diff --git a/.gitignore b/.gitignore index ea136b3a0655592a9c1de55fd139bf0b46030dfd..7521fdc239c30c1b48e9f00b33d8e8cd090a4136 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,8 @@ autosubmit/simple_test.py .vscode/ .vscode autosubmit.egg-info/ +venv/ +.pytest_cache/ +.cache/ +teeeest.py +test-unthreaded.py \ No newline at end of file diff --git a/VERSION b/VERSION index ad59f742d41a5920e82962f7131fdc79d22b454d..454731f6d5a86d05098cc6907f7c18a1e918048e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.14.0b +3.15.0b diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 19dc23bafa68b27530a331331d5c40a762f90987..0050026a809efd6d4908b1ab9a69f75b5100accb 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# PYTHON_ARGCOMPLETE_OK # Copyright 2015-2020 Earth Sciences Department, BSC-CNS @@ -58,19 +57,22 @@ import locale from distutils.util import strtobool from log.log import Log, AutosubmitError, AutosubmitCritical from typing import Set +import sqlite3 -try: - import dialog -except Exception: - dialog = None +#try: +# import dialog +#except Exception: +# dialog = None +dialog = None from time import sleep -import argparse, argcomplete +import argparse import subprocess import json import tarfile import time import copy import os +import glob import pwd import sys import shutil @@ -162,7 +164,7 @@ class Autosubmit: parser.add_argument('-v', '--version', action='version', version=Autosubmit.autosubmit_version) parser.add_argument('-lf', '--logfile', choices=('NO_LOG', 'INFO', 'WARNING', 'DEBUG'), - default='WARNING', type=str, + default='DEBUG', type=str, help="sets file's log level.") parser.add_argument('-lc', '--logconsole', choices=('NO_LOG', 'INFO', 'WARNING', 'DEBUG'), default='INFO', type=str, @@ -581,14 +583,13 @@ class Autosubmit: # Changelog subparsers.add_parser('changelog', description='show changelog') - argcomplete.autocomplete(parser) args = parser.parse_args() except Exception as e: if type(e) is SystemExit: # Version keyword force an exception in parse arg due and os_exit(0) but the program is succesfully finished - if e.message == 0: + if str(e) == 0: print(Autosubmit.autosubmit_version) os._exit(0) raise AutosubmitCritical( @@ -672,7 +673,7 @@ class Autosubmit: print(f.read()) return True return False - elif args.command == 'dbfix': + elif args.command == 'dbfix': return Autosubmit.database_fix(args.expid) elif args.command == 'pklfix': return Autosubmit.pkl_fix(args.expid) @@ -691,22 +692,28 @@ class Autosubmit: else: expid_less.append("migrate") # pickup import platform - host = platform.node() + fullhost = platform.node() + if "." 
in fullhost: + host = fullhost.split(".")[0] + elif "," in fullhost: + host = fullhost.split(",")[0] + else: + host = fullhost forbidden = BasicConfig.DENIED_HOSTS authorized = BasicConfig.ALLOWED_HOSTS - message = "Command: {0} is not allowed to run in host: {1}.\n".format(args.command.upper(),host) + message = "Command: {0} is not allowed to run in host: {1}.\n".format(args.command.upper(),fullhost) message += "List of permissions as follows:Command | hosts \nAllowed hosts\n" for command in BasicConfig.ALLOWED_HOSTS: message += " {0}:{1} \n".format(command,BasicConfig.ALLOWED_HOSTS[command]) message += "Denied hosts\n" for command in BasicConfig.DENIED_HOSTS: message += " {0}:{1} \n".format(command,BasicConfig.DENIED_HOSTS[command]) - message += "[Command: autosubmit {0}] is not allowed to run in [host: {1}].".format(args.command.upper(), host) + message += "[Command: autosubmit {0}] is not allowed to run in [host: {1}].".format(args.command.upper(), fullhost) if args.command in BasicConfig.DENIED_HOSTS: - if 'all' in BasicConfig.DENIED_HOSTS[args.command] or host in BasicConfig.DENIED_HOSTS[args.command]: + if 'all' in BasicConfig.DENIED_HOSTS[args.command] or host in BasicConfig.DENIED_HOSTS[args.command] or fullhost in BasicConfig.DENIED_HOSTS[args.command]: raise AutosubmitCritical(message, 7071) if args.command in BasicConfig.ALLOWED_HOSTS: - if 'all' not in BasicConfig.ALLOWED_HOSTS[args.command] and host not in BasicConfig.ALLOWED_HOSTS[args.command]: + if 'all' not in BasicConfig.ALLOWED_HOSTS[args.command] and not (host in BasicConfig.ALLOWED_HOSTS[args.command] or fullhost in BasicConfig.ALLOWED_HOSTS[args.command]): raise AutosubmitCritical(message, 7071) if expid != 'None' and args.command not in expid_less and args.command not in global_log_command: as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) @@ -755,18 +762,19 @@ class Autosubmit: force_update_version = args.update_version else: force_update_version = False - if force_update_version: - if as_conf.get_version() != Autosubmit.autosubmit_version: - Log.info("The {2} experiment {0} version is being updated to {1} for match autosubmit version", - as_conf.get_version(), Autosubmit.autosubmit_version, expid) - as_conf.set_version(Autosubmit.autosubmit_version) - else: - if as_conf.get_version() is not None and as_conf.get_version() != Autosubmit.autosubmit_version: - raise AutosubmitCritical( - "Current experiment uses ({0}) which is not the running Autosubmit version \nPlease, update the experiment version if you wish to continue using AutoSubmit {1}\nYou can achieve this using the command autosubmit updateversion {2} \n" - "Or with the -v parameter: autosubmit {3} {2} -v ".format(as_conf.get_version(), - Autosubmit.autosubmit_version, expid,args.command), - 7067) + if args.command not in ["upgrade","updateversion"]: + if force_update_version: + if as_conf.get_version() != Autosubmit.autosubmit_version: + Log.info("The {2} experiment {0} version is being updated to {1} for match autosubmit version", + as_conf.get_version(), Autosubmit.autosubmit_version, expid) + as_conf.set_version(Autosubmit.autosubmit_version) + else: + if as_conf.get_version() is not None and as_conf.get_version() != Autosubmit.autosubmit_version: + raise AutosubmitCritical( + "Current experiment uses ({0}) which is not the running Autosubmit version \nPlease, update the experiment version if you wish to continue using AutoSubmit {1}\nYou can achieve this using the command autosubmit updateversion {2} \n" + "Or with the -v parameter: 
autosubmit {3} {2} -v ".format(as_conf.get_version(), + Autosubmit.autosubmit_version, expid,args.command), + 7014) else: if expid == 'None': exp_id = "" @@ -780,19 +788,24 @@ class Autosubmit: args.command + exp_id + '.log'), "out", log_level) Log.set_file(os.path.join(BasicConfig.GLOBAL_LOG_DIR, args.command + exp_id + '_err.log'), "err") - #Enforce LANG=C + # Enforce LANG=UTF-8 try: try: - locale.setlocale(locale.LC_ALL,'C.UTF-8') - except: - locale.setlocale(locale.LC_ALL, 'C.utf8') - except: + locale.setlocale(locale.LC_ALL, 'C.UTF-8') + except Exception as e: + try: + locale.setlocale(locale.LC_ALL, 'C.utf8') + except Exception as e: + try: + locale.setlocale(locale.LC_ALL, 'en_GB') + except Exception as e: + locale.setlocale(locale.LC_ALL, 'es_ES') + except Exception as e: Log.info("Locale C.utf8 is not found, using '{0}' as fallback".format("C")) locale.setlocale(locale.LC_ALL, 'C') Log.info( "Autosubmit is running with {0}", Autosubmit.autosubmit_version) - @staticmethod def _check_ownership(expid,raise_error=False): """ @@ -816,6 +829,16 @@ class Autosubmit: :return: True if succesfully deleted, False otherwise :rtype: boolean """ + message = "The {0} experiment was removed from the local disk and from the database.".format(expid_delete) + message+= " Note that this action does not delete any data written by the experiment.\n" + message+= "Complete list of files/directories deleted:\n" + for root, dirs, files in os.walk(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid_delete)): + for dir_ in dirs: + message += os.path.join(root, dir_) + "\n" + message += os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.STRUCTURES_DIR, + "structure_{0}.db".format(expid_delete)) + "\n" + message += os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.JOBDATA_DIR, + "job_data_{0}.db".format(expid_delete)) + "\n" owner,eadmin,currentOwner = Autosubmit._check_ownership(expid_delete) if expid_delete == '' or expid_delete is None and not os.path.exists(os.path.join(BasicConfig.LOCAL_ROOT_DIR,expid_delete)): Log.printlog("Experiment directory does not exist.",Log.WARNING) @@ -834,28 +857,28 @@ class Autosubmit: if ret: Log.result("Experiment {0} deleted".format(expid_delete)) except BaseException as e: - error_message += 'Can not delete experiment entry: {0}\n'.format(e.message) + error_message += 'Can not delete experiment entry: {0}\n'.format(str(e)) Log.info("Removing experiment directory...") try: shutil.rmtree(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid_delete)) except BaseException as e: - error_message += 'Can not delete directory: {0}\n'.format(e.message) + error_message += 'Can not delete directory: {0}\n'.format(str(e)) try: Log.info("Removing Structure db...") structures_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.STRUCTURES_DIR, "structure_{0}.db".format(expid_delete)) if os.path.exists(structures_path): os.remove(structures_path) except BaseException as e: - error_message += 'Can not delete structure: {0}\n'.format(e.message) + error_message += 'Can not delete structure: {0}\n'.format(str(e)) try: Log.info("Removing job_data db...") job_data_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, BasicConfig.JOBDATA_DIR, "job_data_{0}.db".format(expid_delete)) if os.path.exists(job_data_path): os.remove(job_data_path) except BaseException as e: - error_message += 'Can not delete job_data: {0}\n'.format(e.message) + error_message += 'Can not delete job_data: {0}\n'.format(str(e)) except OSError as e: - error_message += 'Can not delete directory: {0}\n'.format(e.message) + 
error_message += 'Can not delete directory: {0}\n'.format(str(e)) else: if not eadmin: raise AutosubmitCritical( @@ -863,6 +886,7 @@ class Autosubmit: else: raise AutosubmitCritical( 'Current user is not the owner of the experiment. {0} can not be deleted!'.format(expid_delete), 7012) + Log.printlog(message, Log.RESULT) except Exception as e: # Avoid calling Log at this point since it is possible that tmp folder is already deleted. error_message += "Couldn't delete the experiment".format(e.message) @@ -903,7 +927,6 @@ class Autosubmit: os.mkdir(os.path.join( BasicConfig.LOCAL_ROOT_DIR, exp_id, 'conf')) Log.info("Copying config files...") - # autosubmit config and experiment copied from AS. files = resource_listdir('autosubmit.config', 'files') for filename in files: @@ -934,7 +957,7 @@ class Autosubmit: Autosubmit._prepare_conf_files( exp_id, hpc, Autosubmit.autosubmit_version, dummy, copy_id) except (OSError, IOError) as e: - Autosubmit._delete_expid(exp_id) + Autosubmit._delete_expid(exp_id, True) raise AutosubmitCritical( "Couldn't create a new experiment, permissions?", 7012, e.message) except BaseException as e: @@ -1347,6 +1370,7 @@ class Autosubmit: for wrapper_section in as_conf.get_wrapper_multi(): wrapper_jobs[wrapper_section] = as_conf.get_wrapper_jobs(wrapper_section) wrapper_jobs["wrapper"] = as_conf.get_wrapper_jobs("wrapper") + # Log.warning("Aux Job_list was generated successfully") submitter = Autosubmit._get_submitter(as_conf) submitter.load_platforms(as_conf) @@ -1372,8 +1396,8 @@ class Autosubmit: while job_list.get_active(): Autosubmit.submit_ready_jobs(as_conf, job_list, platforms_to_test, packages_persistence, True, only_wrappers, hold=False) - for job in job_list.get_uncompleted_and_not_waiting(): - job.status = Status.COMPLETED + #for job in job_list.get_uncompleted_and_not_waiting(): + # job.status = Status.COMPLETED job_list.update_list(as_conf, False) @staticmethod @@ -1455,8 +1479,10 @@ class Autosubmit: if job.platform_name is None: job.platform_name = hpcarch # noinspection PyTypeChecker - job.platform = submitter.platforms[job.platform_name.lower( - )] + try: + job.platform = submitter.platforms[job.platform_name.lower()] + except: + raise AutosubmitCritical("hpcarch={0} not found in the platforms configuration file".format(job.platform_name), 7014) # noinspection PyTypeChecker if job.status not in (Status.COMPLETED, Status.SUSPENDED): platforms_to_test.add(job.platform) @@ -1553,10 +1579,14 @@ class Autosubmit: # Historical Database: Can create a new run if there is a difference in the number of jobs or if the current run does not exist. 
exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() - exp_history.process_status_changes(job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) + exp_history.process_status_changes(job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) + Autosubmit.database_backup(expid) except Exception as e: - # This error is important - raise AutosubmitCritical("Error while processing historical database.", 7005, str(e)) + try: + Autosubmit.database_fix(expid) + # This error is important + except: + pass try: ExperimentStatus(expid).set_as_running() except Exception as e: @@ -1569,7 +1599,8 @@ class Autosubmit: Log.result("Only jobs with member value in {0} or no member will be allowed in this run. Also, those jobs already SUBMITTED, QUEUING, or RUNNING will be allowed to complete and will be tracked.".format( str(allowed_members))) except AutosubmitCritical as e: - raise AutosubmitCritical(e.message, 7067, e.trace) + e.message += " HINT: check the CUSTOM_DIRECTIVE syntax in your jobs configuration files." + raise AutosubmitCritical(e.message, 7014, e.trace) except Exception as e: raise AutosubmitCritical( "Error in run initialization", 7014, str(e)) # Changing default to 7014 @@ -1578,7 +1609,7 @@ class Autosubmit: if unparsed_two_step_start != "": job_list.parse_jobs_by_filter(unparsed_two_step_start) - main_loop_retrials = 3650 # Hard limit of tries 3650 tries at 15-120seconds sleep each try + main_loop_retrials = 11250*2 # Hard limit of tries ( 48h min 72h max), 2 retrials per stop # establish the connection to all platforms Autosubmit.restore_platforms(platforms_to_test) @@ -1589,6 +1620,7 @@ class Autosubmit: # AUTOSUBMIT - MAIN LOOP ######################### # Main loop. Finishing when all jobs have been submitted + while job_list.get_active(): #Log.info("FD: {0}".format(log.fd_show.fd_table_status_str())) try: @@ -1659,7 +1691,11 @@ class Autosubmit: Log.debug('Checking Wrapper {0}'.format(str(job_id))) wrapper_job.checked_time = datetime.datetime.now() # This is where wrapper will be checked on the slurm platform, update takes place. - platform.check_job(wrapper_job) + try: + platform.check_job(wrapper_job,is_wrapper=True) + except BaseException as e: + job_list.save() + raise AutosubmitError("The communication with {0} went wrong while checking wrapper {1}\n{2}".format(platform.name,wrapper_job.id,str(e))) #Log.info("FD 3Wrapper checked: {0}".format(log.fd_show.fd_table_status_str())) try: if wrapper_job.status != wrapper_job.new_status: @@ -1671,8 +1707,12 @@ class Autosubmit: "Wrapper is in Unknown Status couldn't get wrapper parameters", 7050) # New status will be saved and inner_jobs will be checked. - wrapper_job.check_status( - wrapper_job.new_status) + try: + wrapper_job.check_status(wrapper_job.new_status) + except: + job_list.save() + raise AutosubmitError("The communication with {0} went wrong while checking the inner_jobs of {1}\n{2}".format(platform.name,wrapper_job.id,str(e))) + # Erase from packages if the wrapper failed to be queued ( Hold Admin bug ) if wrapper_job.status == Status.WAITING: for inner_job in wrapper_job.job_list: @@ -1731,10 +1771,9 @@ class Autosubmit: # Check slurm single jobs, the other platforms has already been checked. 
for platform_jobs in slurm: platform = platform_jobs[0] - jobs_to_check = platform_jobs[1] Log.debug("Checking all jobs at once") platform.check_Alljobs( - platform_jobs[3], jobs_to_check, as_conf.get_copy_remote_logs()) + platform_jobs[3], as_conf) #Log.info("FD slurm jobs: {0}".format(log.fd_show.fd_table_status_str())) for j_Indx in xrange(0, len(platform_jobs[3])): @@ -1756,6 +1795,7 @@ class Autosubmit: as_conf, submitter=submitter) job_list.save() if len(job_list.get_ready()) > 0: + Log.debug("Reloading configuration each Autosubmit iteration") save = Autosubmit.submit_ready_jobs( as_conf, job_list, platforms_to_test, packages_persistence, hold=False) job_list.update_list(as_conf, submitter=submitter) @@ -1767,9 +1807,24 @@ class Autosubmit: job_list.update_list(as_conf, submitter=submitter) job_list.save() # Safe spot to store changes - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - if len(job_changes_tracker) > 0: - exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + try: + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + if len(job_changes_tracker) > 0: + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + Autosubmit.database_backup(expid) + except BaseException as e: + Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", + Log.INFO) + try: + Autosubmit.database_fix(expid) + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + if len(job_changes_tracker) > 0: + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + Autosubmit.database_backup(expid) + except: + Log.warning("Couldn't recover the Historical database, AS will continue without it, GUI may be affected") job_changes_tracker = {} if Autosubmit.exit: job_list.save() @@ -1783,8 +1838,17 @@ class Autosubmit: recovery = False as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) consecutive_retrials = 1 - delay = min(15*consecutive_retrials,120) + failed_names = {} + Log.info("Storing failed job count...") + try: + for job in job_list.get_job_list(): + if job.fail_count > 0: + failed_names[job.name] = job.fail_count + except BaseException as e: + Log.printlog("Error trying to store failed job count",Log.WARNING) + Log.result("Storing failed job count...done") while not recovery and main_loop_retrials > 0: + delay = min(15 * consecutive_retrials, 30) main_loop_retrials = main_loop_retrials - 1 sleep(delay) consecutive_retrials = consecutive_retrials + 1 @@ -1794,6 +1858,7 @@ class Autosubmit: Log.info("Recovering job_list...") job_list = Autosubmit.load_job_list( expid, as_conf, notransitive=notransitive) + Log.info("Recovering job_list... Done") if allowed_members: # Set allowed members after checks have been performed. This triggers the setter and main logic of the -rm feature. job_list.run_members = allowed_members @@ -1801,26 +1866,20 @@ class Autosubmit: "Only jobs with member value in {0} or no member will be allowed in this run. 
Also, those jobs already SUBMITTED, QUEUING, or RUNNING will be allowed to complete and will be tracked.".format( str(allowed_members))) platforms_to_test = set() + Log.info("Recovering platform information...") for job in job_list.get_job_list(): if job.platform_name is None: job.platform_name = hpcarch job.platform = submitter.platforms[job.platform_name.lower()] platforms_to_test.add(job.platform) - #Recover job_list while keeping job.fail_count - failed_names = {} - for job in job_list.get_job_list(): - if job.platform_name is None: - job.platform_name = hpcarch - job.platform = submitter.platforms[job.platform_name.lower()] - platforms_to_test.add(job.platform) - if job.fail_count > 0: - failed_names[job.name] = job.fail_count + + Log.info("Recovering platform information... Done") + Log.info("Recovering Failure count...") for job in job_list.get_job_list(): if job.name in failed_names.keys(): job.fail_count = failed_names[job.name] - if job.platform_name is None: - job.platform_name = hpcarch - job.platform = submitter.platforms[job.platform_name.lower()] + Log.info("Recovering Failure count... Done") + Log.info("Recovering parameters...") Autosubmit._load_parameters(as_conf, job_list, submitter.platforms) # Recovery wrapper [Packages] @@ -1876,9 +1935,11 @@ class Autosubmit: None, None, jobs[0].platform, as_conf, jobs[0].hold) job_list.job_package_map[jobs[0].id] = wrapper_job + Log.info("Recovering wrappers... Done") job_list.update_list(as_conf) Log.info("Saving recovered job list...") job_list.save() + Log.info("Saving recovered job list... Done") recovery = True Log.result("Recover of job_list is completed") except AutosubmitError as e: @@ -1886,10 +1947,10 @@ class Autosubmit: Log.result("Recover of job_list has fail {0}".format(e.message)) except IOError as e: recovery = False - Log.result("Recover of job_list has fail".format(e.message)) + Log.result("Recover of job_list has fail {0}".format(e.message)) except BaseException as e: recovery = False - Log.result("Recover of job_list has fail".format(e.message)) + Log.result("Recover of job_list has fail {0}".format(e.message)) # Restore platforms and try again, to avoid endless loop with failed configuration, a hard limit is set. reconnected = False mail_notify = True @@ -1924,18 +1985,25 @@ class Autosubmit: except BaseException: reconnected = False if main_loop_retrials <= 0: - raise AutosubmitCritical("Autosubmit Encounter too much errors during running time, limit of 4hours reached", 7051, e.message) + raise AutosubmitCritical("Autosubmit Encounter too much errors during running time, limit of {0} retrials reached".format(main_loop_retrials), 7051, e.message) except AutosubmitCritical as e: # Critical errors can't be recovered. Failed configuration or autosubmit error raise AutosubmitCritical(e.message, e.code, e.trace) except portalocker.AlreadyLocked: message = "We have detected that there is another Autosubmit instance using the experiment\n. 
Stop other Autosubmit instances that are using the experiment or delete autosubmit.lock file located on tmp folder" raise AutosubmitCritical(message, 7000) except BaseException as e: # If this happens, there is a bug in the code or an exception not-well caught - raise AutosubmitCritical("There is a bug in the code, please contact via git",7070,e.message) + raise AutosubmitCritical("There is a bug in the code, please contact via gitlab",7070,str(e)) Log.result("No more jobs to run.") # Updating job data header with current information when experiment ends - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + try: + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + Autosubmit.database_backup(expid) + except: + try: + Autosubmit.database_fix(expid) + except: + pass # Wait for all remaining threads of I/O, close remaining connections timeout = 0 active_threads = True @@ -1964,9 +2032,9 @@ class Autosubmit: message = "We have detected that there is another Autosubmit instance using the experiment\n. Stop other Autosubmit instances that are using the experiment or delete autosubmit.lock file located on tmp folder" raise AutosubmitCritical(message, 7000) except AutosubmitCritical as e: - raise AutosubmitCritical(e.message, e.code, e.trace) + raise except BaseException as e: - raise AutosubmitCritical("This seems like a bug in the code, please contact AS developers", 7070,e.message) + raise AutosubmitCritical("This seems like a bug in the code, please contact AS developers", 7070,str(e)) @staticmethod def restore_platforms(platform_to_test,mail_notify=False,as_conf=None,expid=expid): @@ -2007,13 +2075,20 @@ class Autosubmit: platform_issues += "\n[{0}] has configuration issues.\n Check that the connection is passwd-less.(ssh {1}@{4})\n Check the parameters that build the root_path are correct:{{scratch_dir/project/user}} = {{{3}/{2}/{1}}}".format( platform.name, platform.user, platform.project, platform.scratch,platform.host) issues += platform_issues + # Checks if bashrc is provinding output that could mess with Autosubmit remote pooling, if so, warns the user but continues as Autosubmit should be able to strip the output + platform.get_bashrc_output() + if platform.bashrc_output != "" or platform.bashrc_err != "": + Log.warning("Bashrc is providing output that could mess with Autosubmit remote pooling\nHINT: add [ -z \"$PS1\" ] && return. 
at the header of {1}:~/.bashrc".format(platform.name,platform.host)) if platform_issues == "": Log.result("[{1}] Connection successful to host {0}", platform.host, platform.name) else: - platform.connected = False - Log.printlog("[{1}] Connection failed to host {0}".format( platform.host, platform.name),Log.WARNING) + if platform.connected: + platform.connected = False + Log.printlog("[{1}] Connection sucessful to host {0}, however there are issues with %HPCROOT%".format(platform.host, platform.name), + Log.WARNING) + else: + Log.printlog("[{1}] Connection failed to host {0}".format(platform.host, platform.name), Log.WARNING) if issues != "": - platform.connected = False raise AutosubmitCritical( "Issues while checking the connectivity of platforms.", 7010, issues+"\n"+ssh_config_issues) @@ -2057,7 +2132,6 @@ class Autosubmit: platform.open_submit_script() valid_packages_to_submit = [] # type: List[JobPackageBase] for package in packages_to_submit: - try: # If called from inspect command or -cw if only_wrappers or inspect: @@ -2081,7 +2155,9 @@ class Autosubmit: package.submit(as_conf, job_list.parameters, inspect, hold=hold) save=True if not inspect: - job_list.save() + if str(platform.type).lower() != "slurm": + job_list.update_list(as_conf) + job_list.save() valid_packages_to_submit.append(package) except (IOError, OSError): if package.jobs[0].id != 0: @@ -2091,7 +2167,7 @@ class Autosubmit: if package.jobs[0].id != 0: failed_packages.append(package.jobs[0].id) platform.connected = False - if e.trace.lower().find("bad parameters") != -1 or e.message.lower().find("scheduler is not installed") != -1: + if str(e.trace).lower().find("bad parameters") != -1 or str(e.message).lower().find("scheduler is not installed") != -1: error_msg = "" for package_tmp in valid_packages_to_submit: for job_tmp in package_tmp.jobs: @@ -2100,7 +2176,7 @@ class Autosubmit: for job_tmp in package.jobs: if job_tmp.section not in error_msg: error_msg += job_tmp.section + "&" - if e.trace.lower().find("bad parameters") != -1: + if str(e.trace).lower().find("bad parameters") != -1: error_message+="\ncheck job and queue specified in jobs.conf. 
Sections that could be affected: {0}".format( error_msg[:-1]) else: @@ -2124,26 +2200,49 @@ class Autosubmit: raise except Exception as e: raise - if platform.type == "slurm" and not inspect and not only_wrappers: # return to == + if str(platform.type).lower() in ["slurm", "pjm"] and not inspect and not only_wrappers: try: valid_packages_to_submit = [ package for package in valid_packages_to_submit if package.x11 != True] if len(valid_packages_to_submit) > 0: + submit_time = int(time.time() / 60) try: jobs_id = platform.submit_Script(hold=hold) except AutosubmitError as e: - jobnames = [job.name for job in valid_packages_to_submit[0].jobs] - for jobname in jobnames: - jobid = platform.get_jobid_by_jobname(jobname) - #cancel bad submitted job if jobid is encountered - for id in jobid: - platform.cancel_job(id) + if not e.message: + e.message = "" + try: + for package in valid_packages_to_submit: + try: + elapsed_time_minutes = str(int(round(int(time.time() / 60) - submit_time)+1)) + job_historic = platform.get_jobid_by_jobname(package.jobs[0].name,minutes=elapsed_time_minutes) + except: + job_historic = [] + #Recover jobid from jobname + if len(job_historic) > 0 and isinstance(job_historic, list): + job_id = job_historic[-1] + for job_id_historic in job_historic: + if job_id_historic != job_id: + try: + platform.send_command(platform.cancel_cmd + " {0}".format(job_id_historic)) + except: + pass + for job in package.jobs: + job.hold = hold + job.id = str(job_id) + job.status = Status.SUBMITTED + job.write_submit_time(hold=hold) + except: + pass + job_list.save() + job_list.update_list(as_conf,store_change=True) jobs_id = None platform.connected = False - if type(e.trace) is not None: - has_trace_bad_parameters = e.trace.lower().find("bad parameters") != -1 + if e.trace is not None: + has_trace_bad_parameters = str(e.trace).lower().find("bad parameters") != -1 else: + e.trace = "" has_trace_bad_parameters = False - if has_trace_bad_parameters or e.message.lower().find("invalid partition") != -1 or e.message.lower().find(" invalid qos") != -1 or e.message.lower().find("scheduler is not installed") != -1: + if has_trace_bad_parameters or e.message.lower().find("invalid partition") != -1 or e.message.lower().find("invalid qos") != -1 or e.message.lower().find("scheduler is not installed") != -1 or e.message.lower().find("failed") != -1 or e.message.lower().find("not available") != -1: error_msg = "" for package_tmp in valid_packages_to_submit: for job_tmp in package_tmp.jobs: @@ -2154,7 +2253,9 @@ class Autosubmit: else: error_message+="Check that {1} platform has set the correct scheduler. 
Sections that could be affected: {0}".format( error_msg[:-1], platform.name) - raise AutosubmitCritical(error_message,7014,e.message+"\n"+e.trace) + if e.trace is None: + e.trace = "" + raise AutosubmitCritical(error_message,7014,e.message+"\n"+str(e.trace)) except IOError as e: raise AutosubmitError( "IO issues ", 6016, e.message) @@ -2164,8 +2265,21 @@ class Autosubmit: raise AutosubmitError( "Submission failed, this can be due a failure on the platform", 6015, e.message) if jobs_id is None or len(jobs_id) <= 0: + try: + jobnames = [] + for package in valid_packages_to_submit: + jobnames += [job.name for job in package.jobs] + for jobname in jobnames: + jobid = platform.get_jobid_by_jobname(jobname) + #cancel bad submitted job if jobid is encountered + for id in jobid: + platform.cancel_job(id) + except: + pass + platform.connected = False + raise AutosubmitError( - "Submission failed, this can be due a failure on the platform\n{0}\n{1}".format(e.message,e.trace), 6015) + "Submission failed, this can be due a failure on the platform\n{0}\n{1}".format(str(platform.name),""), 6015) i = 0 if hold: sleep(10) @@ -2204,16 +2318,19 @@ class Autosubmit: job.status = Status.SUBMITTED job.write_submit_time(hold=hold) i += 1 - save = True + if not inspect: + job_list.save() if len(failed_packages) > 0: - for job_id in failed_packages: - package.jobs[0].platform.send_command( - package.jobs[0].platform.cancel_cmd + " {0}".format(job_id)) + try: + for job_id in failed_packages: + platform.send_command( platform.cancel_cmd + " {0}".format(job_id)) + except: + pass raise AutosubmitError( "{0} submission failed, some hold jobs failed to be held".format(platform.name), 6015) except WrongTemplateException as e: raise AutosubmitCritical("Invalid parameter substitution in {0} template".format( - e.job_name), 7014, e.message) + e.job_name), 7014, str(e)) except AutosubmitError as e: raise except AutosubmitCritical as e: @@ -2484,7 +2601,7 @@ class Autosubmit: job_list = Autosubmit.load_job_list(expid, as_conf, notransitive=notransitive) Log.debug("Job list restored from {0} files", pkl_dir) jobs = StatisticsUtils.filter_by_section(job_list.get_job_list(), filter_type) - jobs, period_ini, period_fi = StatisticsUtils.filter_by_time_period(jobs, filter_period) + jobs, period_ini, period_fi = StatisticsUtils.filter_by_time_period(jobs, filter_period) # Package information job_to_package, package_to_jobs, _, _ = JobList.retrieve_packages(BasicConfig, expid, [job.name for job in job_list.get_job_list()]) queue_time_fixes = {} @@ -2621,9 +2738,9 @@ class Autosubmit: job.platform_name = hpcarch job.platform = submitter.platforms[job.platform_name.lower()] platforms_to_test.add(job.platform) + job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) for platform in platforms_to_test: platform.test_connection() - job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) if not force: raise AutosubmitCritical( "Experiment can't be recovered due being {0} active jobs in your experiment, If you want to recover the experiment, please use the flag -f and all active jobs will be cancelled".format( @@ -3179,16 +3296,16 @@ class Autosubmit: # Preparation for section parameters no_load_sections = False no_load_platforms = False - try: - job_list = Autosubmit.load_job_list( - expid, as_conf, notransitive=False) - except Exception as e: - no_load_sections = True + + job_list = Autosubmit.load_job_list( + expid, as_conf, notransitive=False) + try: submitter = 
Autosubmit._get_submitter(as_conf) submitter.load_platforms(as_conf) except Exception as e: no_load_platforms = True + submitter = Autosubmit._get_submitter(as_conf) submitter.load_local_platform(as_conf) try: # Gathering parameters of autosubmit and expdef config files @@ -3550,6 +3667,7 @@ class Autosubmit: parser = SafeConfigParser() parser.optionxform = str parser.read(path) + if parser.has_option('database', 'path'): database_path = parser.get('database', 'path') if parser.has_option('database', 'filename'): @@ -3687,6 +3805,8 @@ class Autosubmit: d.msgbox("Configuration file written successfully", width=50, height=5) os.system('clear') + + except (IOError, OSError) as e: raise AutosubmitCritical( "Can not write config file", 7012, e.message) @@ -3866,6 +3986,17 @@ class Autosubmit: raise @staticmethod + def database_backup(expid): + try: + database_path= os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.db".format(expid)) + backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.sql".format(expid)) + command = "sqlite3 {0} .dump > {1} ".format(database_path, backup_path) + Log.debug("Backing up jobs_data...") + out = subprocess.call(command, shell=True) + Log.debug("Jobs_data database backup completed.") + except BaseException as e: + Log.debug("Jobs_data database backup failed.") + @staticmethod def database_fix(expid): """ Database methods. Performs a sql dump of the database and restores it. @@ -3874,56 +4005,36 @@ class Autosubmit: :type expid: str :return: :rtype: - """ + """ os.umask(0) # Overrides user permissions current_time = int(time.time()) + corrupted_db_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}_corrupted.db".format(expid)) + database_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.db".format(expid)) - database_backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}_{1}.db".format(expid, str(current_time))) - dump_file_name = 'job_data_{0}_{1}.sql'.format(expid, current_time) + database_backup_path = os.path.join(BasicConfig.JOBDATA_DIR, "job_data_{0}.sql".format(expid)) + dump_file_name = 'job_data_{0}.sql'.format(expid, current_time) dump_file_path = os.path.join(BasicConfig.JOBDATA_DIR, dump_file_name) - bash_command = 'sqlite3 {0} .dump > {1}'.format(database_path, dump_file_path) + bash_command = 'cat {1} | sqlite3 {0}'.format(database_path, dump_file_path) try: - if os.path.exists(database_path): + if os.path.exists(database_path): + result = os.popen("mv {0} {1}".format(database_path, corrupted_db_path)).read() + time.sleep(1) + Log.info("Original database moved.") + try: + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + Log.info("Restoring from sql") result = os.popen(bash_command).read() - if result is not None and os.path.exists(dump_file_path): - Log.info("sqldump {0} created".format(dump_file_path)) - Log.info( - "Backing up original database {0}".format(database_path)) - result = os.popen("mv {0} {1}".format(database_path, database_backup_path)).read() - time.sleep(10) - if result is not None and not os.path.exists(database_path): - Log.info("Original database moved.") - Log.info("Restoring from sqldump") - HUtils.create_file_with_full_permissions(database_path) - result = os.popen("cat {0} | sqlite3 {1}".format( - dump_file_path, database_path)).read() - time.sleep(10) - if result is not None and os.path.exists(database_path): - Log.info( - "Database {0} restored.".format(database_path)) - Log.info("Deleting sqldump.") 
- result = os.popen( - "rm {0}".format(dump_file_path)).read() - sleep(5) - if result is not None and not os.path.exists(dump_file_path): - ExperimentHistory(expid).initialize_database() - Log.info("sqldump file deleted.") - Log.result( - "The database {0} has been fixed.".format(database_path)) - else: - raise Exception( - "The sqldump file could not be removed.") - else: - raise Exception( - "It was not possible to restore the sqldump file.") - else: - raise Exception( - "It was not possible to delete the original database.") - else: - raise Exception("The sqldump file couldn't be created.") - else: - raise Exception("The database file doesn't exist.") - except Exception as exp: + exp_history.initialize_database() + + except: + Log.warning("It was not possible to restore the jobs_data.db file... , a new blank db will be created") + result = os.popen("rm {0}".format(database_path)).read() + + exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, + historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.initialize_database() + except Exception as exp: Log.critical(str(exp)) @staticmethod @@ -4000,7 +4111,7 @@ class Autosubmit: Log.warning("Experiment folder renamed to: {0}".format( exp_folder + "_to_delete ")) except Exception as e: - Autosubmit.unarchive(expid, uncompress=False) + Autosubmit.unarchive(expid, uncompressed=False) raise AutosubmitCritical( "Can not remove or rename experiments folder", 7012, str(e)) @@ -4207,17 +4318,20 @@ class Autosubmit: try: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() - exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), as_conf.get_full_config_as_json(), job_list.get_job_list()) + + #exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), as_conf.get_full_config_as_json(), job_list.get_job_list()) + exp_history.process_status_changes(job_list.get_job_list(), + chunk_unit=as_conf.get_chunk_size_unit(), + chunk_size=as_conf.get_chunk_size(), + current_config=as_conf.get_full_config_as_json(),create=True) + Autosubmit.database_backup(expid) except BaseException as e: Log.printlog("Historic database seems corrupted, AS will repair it and resume the run", Log.INFO) - Autosubmit.database_fix(expid) - exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, - historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.initialize_database() - exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), - as_conf.get_full_config_as_json(), - job_list.get_job_list()) + try: + Autosubmit.database_fix(expid) + except: + Log.warning("Couldn't recover the Historical database, AS will continue without it, GUI may be affected") if not noplot: if group_by: status = list() @@ -4324,16 +4438,21 @@ class Autosubmit: """ project_destination = as_conf.get_project_destination() if project_destination is None or len(project_destination) == 0: - raise AutosubmitCritical("Autosubmit couldn't identify the project destination.", 7014) + if project_type.lower() != "none": + raise AutosubmitCritical("Autosubmit couldn't identify the project destination.", 7014) if project_type == "git": - submitter = Autosubmit._get_submitter(as_conf) - submitter.load_platforms(as_conf) + try: + submitter = Autosubmit._get_submitter(as_conf) + submitter.load_platforms(as_conf) hpcarch = 
submitter.platforms[as_conf.get_platform()] except BaseException as e: - raise AutosubmitCritical("Can't set main platform", 7014, e.message) - + try: + hpcarch = submitter.platforms[as_conf.get_platform()] + except: + hpcarch = "local" + Log.warning("Remote clone may be disabled due to: "+e.message) return AutosubmitGit.clone_repository(as_conf, force, hpcarch) elif project_type == "svn": svn_project_url = as_conf.get_svn_project_url() @@ -4433,7 +4552,10 @@ class Autosubmit: if job.status in [Status.SUBMITTED, Status.QUEUING, Status.HELD] and final_status not in [Status.QUEUING, Status.HELD, Status.SUSPENDED]: job.hold = False if job.platform_name and job.platform_name.lower() != "local": - job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) + try: + job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id), ignore_log=True) + except: + pass elif job.status in [Status.QUEUING, Status.RUNNING, Status.SUBMITTED] and final_status == Status.SUSPENDED: if job.platform_name and job.platform_name.lower() != "local": job.platform.send_command("scontrol hold " + "{0}".format(job.id), ignore_log=True) @@ -4978,6 +5100,7 @@ class Autosubmit: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.initialize_database() exp_history.process_status_changes(job_list.get_job_list(), chunk_unit=as_conf.get_chunk_size_unit(), chunk_size=as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) + Autosubmit.database_backup(expid) else: Log.printlog( "Changes NOT saved to the JobList!!!!: use -s option to save", 3000) @@ -5354,10 +5477,12 @@ class Autosubmit: raise AutosubmitCritical('Can not test a RERUN experiment', 7014) content = open(as_conf.experiment_file).read() + if random_select: if hpc is None: platforms_parser = as_conf.get_parser( ConfigParserFactory(), as_conf.platforms_file) + test_platforms = list() for section in platforms_parser.sections(): if platforms_parser.get_option(section, 'TEST_SUITE', 'false').lower() == 'true': diff --git a/autosubmit/config/config_common.py b/autosubmit/config/config_common.py index 7b2a6a12bdff06cf49a1cb297eb435735c1bcfd8..e4ce07a66d02c12dfab2e913ea49533426f109b1 100644 --- a/autosubmit/config/config_common.py +++ b/autosubmit/config/config_common.py @@ -35,6 +35,7 @@ from log.log import Log, AutosubmitError, AutosubmitCritical from autosubmit.config.basicConfig import BasicConfig from collections import defaultdict +from autosubmit.database.db_common import update_experiment_descrip_version class AutosubmitConfig(object): @@ -164,6 +165,41 @@ class AutosubmitConfig(object): """ return self._conf_parser.get_option(wrapper_section_name, 'EXPORT', 'none') + def parse_githooks(self): + """ + Parse githooks section in configuration file + + :return: dictionary with githooks configuration + :rtype: dict + """ + proj_dir = os.path.join( + BasicConfig.LOCAL_ROOT_DIR, self.expid, BasicConfig.LOCAL_PROJ_DIR) + #get project_name + project_name = str(self.get_project_destination()) + + #get githook files from proj_dir + githook_files = [os.path.join(os.path.join(os.path.join(proj_dir,project_name),".githooks"), f) for f in os.listdir(os.path.join(os.path.join(proj_dir,project_name),".githooks")) ] + parameters = self.load_parameters() + + #find all '%(? 
0: + max_depth = max(git_project_submodules_depth) if as_conf.get_fetch_single_branch() != "true": git_single_branch = False else: @@ -177,7 +183,9 @@ class AutosubmitGit: os.mkdir(project_path) Log.debug("The project folder {0} has been created.", project_path) command_0 = "" + command_githook = "" command_1 = "" + if git_remote_project_path != '': if git_remote_project_path[-1] == '/': git_remote_path = os.path.join( @@ -198,44 +206,77 @@ class AutosubmitGit: try: ##command 0 Log.debug('Clone command: {0}', command_0) - + try: + git_version = subprocess.check_output("git --version",shell=True) + git_version = git_version.split(" ")[2].strip("\n") + version_int = "" + for number in git_version.split("."): + version_int += number + git_version = int(version_int) + except: + git_version = 2251 if git_remote_project_path == '': command_0 = "cd {0} ; {1}".format(project_path, command_0) output_0 = subprocess.check_output(command_0, shell=True) else: - command_0 = "cd {0} ; {1}".format(git_remote_path, command_0) + command_0 = "cd {0} ; {1}".format(project_path, command_0) hpcarch.send_command(command_0) ##command 1 - if os.path.exists(os.path.join(git_path, ".githooks")): + + if os.path.exists(os.path.join(git_path, ".githooks")) and git_version > 2136: for root_dir, dirs, files in os.walk(os.path.join(git_path, ".githooks")): for f_dir in dirs: os.chmod(os.path.join(root_dir, f_dir), 0o750) for f_file in files: os.chmod(os.path.join(root_dir, f_file), 0o750) - command_1 += " git config core.hooksPath ./.githooks ; ".format( + command_githook += " git config core.hooksPath ./.githooks ; ".format( git_path) if git_project_commit: - command_1 += "git checkout {0};".format(git_project_commit) + command_1 += "git checkout {0}; ".format(git_project_commit) else: command_1 += "git checkout; " + if git_project_submodules.__len__() <= 0: - command_1 += " git submodule update --init --recursive;" + if max_depth > 0: + Log.info("Depth is incompatible with --recursive, ignoring recursive option") + command_1 += " git submodule update --init --depth {0}; ".format(max_depth) + else: + command_1 += " git submodule update --init --recursive; " else: - command_1 += " git submodule init;".format(project_destination) + command_1 += " git submodule init; ".format(project_destination) + index_submodule = 0 for submodule in git_project_submodules: - command_1 += " git submodule update {0};".format(submodule) + if max_depth > 0: + Log.info("Depth is incompatible with --recursive, ignoring recursive option") + if index_submodule < len(git_project_submodules_depth): + command_1 += " git submodule update --init --depth {0} {1}; ".format( + git_project_submodules_depth[index_submodule], submodule) + else: + command_1 += " git submodule update --init --depth {0} {1}; ".format( + max_depth, submodule) + else: + command_1 += " git submodule update --init --recursive {0}; ".format(submodule) + index_submodule += 1 if git_remote_project_path == '': try: + if len(command_githook) > 0: + command_githook = "cd {0} ; {1}".format(git_path, command_githook) + as_conf.parse_githooks() + subprocess.check_output(command_githook, shell=True) command_1 = "cd {0}; {1} ".format(git_path,command_1) - Log.debug('Githook + Checkout and Submodules: {0}', command_1) + Log.debug('Githook + Checkout and Submodules: {0}', command_githook, command_1) output_1 = subprocess.check_output(command_1, shell=True) except BaseException as e: submodule_failure = True - Log.printlog("Trace: {0}".format(e.message), 6014) + Log.printlog("Trace: 
{0}".format(str(e)), 6014) Log.printlog( - "Submodule {0} has a wrong configuration".format(submodule), 6014) + "Submodule has a wrong configuration.\n{0}".format(command_1), 6014) else: - command_1 = "cd {0}; {1} ".format(git_remote_path, command_1) + if len(command_githook) > 0: + command_githook = "cd {0} ; {1}".format(project_path, command_githook) + as_conf.parse_githooks() + hpcarch.send_command(command_githook) + command_1 = "cd {0}; {1} ".format(project_path, command_1) hpcarch.send_command(command_1) except subprocess.CalledProcessError as e: shutil.rmtree(project_path) diff --git a/autosubmit/helpers/utils.py b/autosubmit/helpers/utils.py index 1a6d8e7631862a6aee055f9f16adafb43528722f..0ce27ab8a783713eb302dd026a7161a5f27f49db 100644 --- a/autosubmit/helpers/utils.py +++ b/autosubmit/helpers/utils.py @@ -7,7 +7,6 @@ from typing import Tuple def check_experiment_ownership(expid, basic_config, raise_error=False, logger=None): #Logger variable is not needed, LOG is global thus it will be read if avaliable - # type: (str, BasicConfig, bool, Log) -> Tuple[bool, bool, str] my_user_ID = os.getuid() current_owner_ID = 0 current_owner_name = "NA" diff --git a/autosubmit/history/data_classes/job_data.py b/autosubmit/history/data_classes/job_data.py index b5249b797873aa61bd351a884d35dbcdb04d33a5..93a88797a3a17aa79ae64efed70a50a169652e5c 100644 --- a/autosubmit/history/data_classes/job_data.py +++ b/autosubmit/history/data_classes/job_data.py @@ -57,7 +57,8 @@ class JobData(object): platform) > 0 else "NA" self.job_id = job_id if job_id else 0 try: - self.extra_data_parsed = loads(extra_data) + if extra_data != "": + self.extra_data_parsed = loads(extra_data) except Exception as exp: self.extra_data_parsed = {} # Fail fast self.extra_data = extra_data diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py index f1e0be68c06aa6e9e1767312ae70b655f2a8e186..e7ae598a1cd4dc9871a48f65c44d95998eec1aaf 100644 --- a/autosubmit/history/experiment_history.py +++ b/autosubmit/history/experiment_history.py @@ -190,19 +190,24 @@ class ExperimentHistory(): except Exception as exp: self._log.log(str(exp), traceback.format_exc()) - def process_status_changes(self, job_list=None, chunk_unit="NA", chunk_size=0, current_config=""): + def process_status_changes(self, job_list=None, chunk_unit="NA", chunk_size=0, current_config="",create=False): """ Detect status differences between job_list and current job_data rows, and update. Creates a new run if necessary. 
""" try: - current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() - update_these_changes = self._get_built_list_of_changes(job_list) - should_create_new_run = self.should_we_create_a_new_run(job_list, len(update_these_changes), current_experiment_run_dc, chunk_unit, chunk_size) - if len(update_these_changes) > 0 and should_create_new_run == False: - self.manager.update_many_job_data_change_status(update_these_changes) - if should_create_new_run: - return self.create_new_experiment_run(chunk_unit, chunk_size, current_config, job_list) - return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list) + try: + current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() + update_these_changes = self._get_built_list_of_changes(job_list) + except: + current_experiment_run_dc = 0 + update_these_changes = [] + #("no runs") + should_create_new_run = self.should_we_create_a_new_run(job_list, len(update_these_changes), current_experiment_run_dc, chunk_unit, chunk_size,create) + if len(update_these_changes) > 0 and should_create_new_run == False: + self.manager.update_many_job_data_change_status(update_these_changes) + if should_create_new_run: + return self.create_new_experiment_run(chunk_unit, chunk_size, current_config, job_list) + return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list) except Exception as exp: - self._log.log(str(exp), traceback.format_exc()) + self._log.log(str(exp), traceback.format_exc()) def _get_built_list_of_changes(self, job_list): """ Return: List of (current timestamp, current datetime str, status, rowstatus, id in job_data). One tuple per change. """ @@ -217,11 +222,14 @@ class ExperimentHistory(): except Exception as exp: self._log.log(str(exp), traceback.format_exc()) - def should_we_create_a_new_run(self, job_list, changes_count, current_experiment_run_dc, new_chunk_unit, new_chunk_size): - if len(job_list) != current_experiment_run_dc.total: - return True - if changes_count > int(self._get_date_member_completed_count(job_list)): - return True + def should_we_create_a_new_run(self, job_list, changes_count, current_experiment_run_dc, new_chunk_unit, new_chunk_size,create=False): + if create: + return True + elif not create and self.expid[0].lower() != "t": + if len(job_list) != current_experiment_run_dc.total: + return True + if changes_count > int(self._get_date_member_completed_count(job_list)): + return True return self._chunk_config_has_changed(current_experiment_run_dc, new_chunk_unit, new_chunk_size) def _chunk_config_has_changed(self, current_exp_run_dc, new_chunk_unit, new_chunk_size): @@ -274,15 +282,16 @@ class ExperimentHistory(): def detect_changes_in_job_list(self, job_list): """ Detect changes in job_list compared to the current contents of job_data table. 
Returns a list of JobData data classes where the status of each item is the new status.""" - job_name_to_job = {job.name: job for job in job_list} + job_name_to_job = {str(job.name): job for job in job_list} current_job_data_dcs = self.manager.get_all_last_job_data_dcs() differences = [] for job_dc in current_job_data_dcs: - if job_dc.job_name in job_name_to_job and job_dc.status != job_name_to_job[job_dc.job_name].status_str: - if not (job_dc.status in ["COMPLETED", "FAILED"] and job_name_to_job[job_dc.job_name].status_str in ["WAITING", "READY"]): - # If the job is not changing from a finalized status to a starting status - job_dc.status = job_name_to_job[job_dc.job_name].status_str - differences.append(job_dc) + if job_dc.job_name in job_name_to_job: + if job_dc.status != job_name_to_job[job_dc.job_name].status_str: + if not (job_dc.status in ["COMPLETED", "FAILED"] and job_name_to_job[job_dc.job_name].status_str in ["WAITING", "READY"]): + # If the job is not changing from a finalized status to a starting status + job_dc.status = job_name_to_job[job_dc.job_name].status_str + differences.append(job_dc) return differences def _get_defined_rowtype(self, code): diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 226b85c376235ee1dca96813523c4f43785cf718..6565b8c3112bde7da2be3faae7940db503287b99 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -95,6 +95,7 @@ class Job(object): self.wallclock = None # type: str self.wchunkinc = None self.tasks = '0' + self.nodes = "" self.threads = '1' self.processors = '1' self.memory = '' @@ -148,6 +149,8 @@ class Job(object): self.export = "none" self.dependencies = [] self.start_time = None + self.ext_header_path = '' + self.ext_tailer_path = '' def __getstate__(self): odict = self.__dict__ @@ -182,7 +185,7 @@ class Job(object): :rtype: set """ return self._parents - + @parents.setter def parents(self, parents): """ @@ -193,13 +196,13 @@ class Job(object): @property def status_str(self): """ - String representation of the current status + String representation of the current status """ return Status.VALUE_TO_KEY.get(self.status, "UNKNOWN") - + @property def children_names_str(self): - """ + """ Comma separated list of children's names """ return ",".join([str(child.name) for child in self._children]) @@ -315,7 +318,7 @@ class Job(object): @property def total_processors(self): """ - Number of processors requested by job. + Number of processors requested by job. Reduces ':' separated format if necessary. """ if ':' in self.processors: @@ -536,8 +539,8 @@ class Job(object): """ Returns the retrials of a job, including the last COMPLETED run. The selection stops, and does not include, when the previous COMPLETED job is located or the list of registers is exhausted. 
- :return: list of list of dates of retrial [submit, start, finish] in datetime format - :rtype: list of list + :return: list of list of dates of retrial [submit, start, finish] in datetime format + :rtype: list of list """ log_name = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS') retrials_list = [] @@ -600,53 +603,103 @@ class Job(object): self._tmp_path, 'LOG_' + str(self.expid), local_log)) except BaseException as e: Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format( - e.message, self.name)) + str(e), self.name)) except AutosubmitError as e: Log.printlog("Trace {0} \nFailed to retrieve log file for job {1}".format( - e.message, self.name), 6001) + str(e), self.name), 6001) except AutosubmitCritical as e: # Critical errors can't be recovered. Failed configuration or autosubmit error Log.printlog("Trace {0} \nFailed to retrieve log file for job {0}".format( - e.message, self.name), 6001) + str(e), self.name), 6001) return + def read_header_tailer_script(self, script_path, as_conf): + """ + Opens and reads a script. If it is not a BASH script it will fail :( + + Will strip away the line with the hash bang (#!) + + :param script_path: relative to the experiment directory path to the script + :type script_path: string + :param as_conf: Autosubmit configuration file + :type as_conf: config + """ + script_name = script_path.rsplit("/")[-1] # pick the name of the script for a more verbose error + script = '' + if script_path == '': + return script + + try: + script_file = open(os.path.join(as_conf.get_project_dir(), script_path), 'r') + except Exception as e: # log + # We stop Autosubmit if we don't find the script + raise AutosubmitCritical("Extended script: failed to fetch {0} \n".format(str(e)), 7014) + + for line in script_file: + if "#!" 
not in line: + script += line + else: + # check if the type of the script matches the one in the extended + if "bash" in line: + if self.type != Type.BASH: + raise AutosubmitCritical("Extended script: script {0} seems BASH but job {1} isn't\n".format(script_name, self.script_name), 7011) + elif "Rscript" in line: + if self.type != Type.R: + raise AutosubmitCritical("Extended script: script {0} seems Rscript but job {1} isn't\n".format(script_name, self.script_name), 7011) + elif "python" in line: + if self.type not in (Type.PYTHON, Type.PYTHON2, Type.PYTHON3): + raise AutosubmitCritical("Extended script: script {0} seems Python but job {1} isn't\n".format(script_name, self.script_name), 7011) + + return script + @threaded - def retrieve_logfiles(self, copy_remote_logs, local_logs, remote_logs, expid, platform_name,fail_count = 0): + def retrieve_logfiles(self, copy_remote_logs, local_logs, remote_logs, expid, platform_name,fail_count = 0,job_id=""): max_logs = 0 sleep(5) stat_file = self.script_name[:-4] + "_STAT_" - try: - as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) - as_conf.reload() - submitter = self._get_submitter(as_conf) - submitter.load_platforms(as_conf) - platform = submitter.platforms[platform_name.lower()] + retries = 2 + count = 0 + success = False + error_message = "" + while (count < retries) or not success: try: - platform.test_connection() - except: + as_conf = AutosubmitConfig(expid, BasicConfig, ConfigParserFactory()) + as_conf.reload() + submitter = self._get_submitter(as_conf) + submitter.load_platforms(as_conf) + platform = submitter.platforms[str(platform_name).lower()] + success = True + except BaseException as e: + error_message = str(e) + sleep(60*5) pass + count=count+1 + if not success: + raise AutosubmitError("Couldn't load the autosubmit platforms, seems that the local platform has some issue\n:{0}".format(error_message),6006) + else: max_logs = int(as_conf.get_retrials()) - fail_count last_log = int(as_conf.get_retrials()) - fail_count - if self.wrapper_type is not None and self.wrapper_type == "vertical": - found = False - retrials = 0 - while retrials < 3 and not found: - sleep(2) - if platform.check_stat_file_by_retrials(stat_file + str(max_logs)): - found = True - retrials = retrials - 1 - for i in range(max_logs-1,-1,-1): - if platform.check_stat_file_by_retrials(stat_file + str(i)): - last_log = i - else: - break - remote_logs = (self.script_name + ".out." + str(last_log), self.script_name + ".err." + str(last_log)) + try: + platform.test_connection() + if self.wrapper_type is not None and self.wrapper_type == "vertical": + found = False + retrials = 0 + while retrials < 3 and not found: + if platform.check_stat_file_by_retrials(stat_file + str(max_logs)): + found = True + retrials = retrials + 1 + for i in range(max_logs-1,-1,-1): + if platform.check_stat_file_by_retrials(stat_file + str(i)): + last_log = i + else: + break + remote_logs = (self.script_name + ".out." + str(last_log), self.script_name + ".err." + str(last_log)) - else: - remote_logs = (self.script_name + ".out."+str(fail_count), self.script_name + ".err." + str(fail_count)) + else: + remote_logs = (self.script_name + ".out."+str(fail_count), self.script_name + ".err." + str(fail_count)) - except Exception as e: - Log.printlog( - "{0} \n Couldn't connect to the remote platform for this {1} job err/out files. 
".format(e.message, self.name), 6001) + except BaseException as e: + Log.printlog( + "{0} \n Couldn't connect to the remote platform for {1} job err/out files. ".format(str(e), self.name), 6001) out_exist = False err_exist = False retries = 3 @@ -717,7 +770,7 @@ class Job(object): platform.get_logs_files(self.expid, l_log) try: for local_log in l_log: - platform.write_jobid(self.id, os.path.join(self._tmp_path, 'LOG_' + str(self.expid), local_log)) + platform.write_jobid(job_id, os.path.join(self._tmp_path, 'LOG_' + str(self.expid), local_log)) except BaseException as e: pass max_logs = max_logs - 1 @@ -734,7 +787,7 @@ class Job(object): # Update the logs with Autosubmit Job Id Brand try: for local_log in local_logs: - platform.write_jobid(self.id, os.path.join( + platform.write_jobid(job_id, os.path.join( self._tmp_path, 'LOG_' + str(self.expid), local_log)) except BaseException as e: Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format( @@ -768,6 +821,22 @@ class Job(object): except BaseException as e: pass return + def parse_time(self,wallclock): + format = "minute" + regex = re.compile(r'(((?P\d+):)((?P\d+)))(:(?P\d+))?') + parts = regex.match(wallclock) + if not parts: + return + parts = parts.groupdict() + if int(parts['hours']) > 0 : + format = "hour" + else: + format = "minute" + time_params = {} + for name, param in parts.items(): + if param: + time_params[name] = int(param) + return datetime.timedelta(**time_params),format # Duplicated for wrappers and jobs to fix in 4.0.0 def is_over_wallclock(self, start_time, wallclock): """ @@ -777,25 +846,13 @@ class Job(object): :return: """ elapsed = datetime.datetime.now() - start_time - wallclock = datetime.datetime.strptime(wallclock, '%H:%M') - total = 0.0 - if wallclock.hour > 0: - total = wallclock.hour - format = "hour" + wallclock,time_format = self.parse_time(wallclock) + if time_format == "hour": + total = wallclock.days * 24 + wallclock.seconds / 60 / 60 else: - format = "minute" - if format == "hour": - if wallclock.minute > 0: - total += wallclock.minute / 60.0 - if wallclock.second > 0: - total += wallclock.second / 60.0 / 60.0 - else: - if wallclock.minute > 0: - total += wallclock.minute - if wallclock.second > 0: - total += wallclock.second / 60.0 + total = wallclock.days * 24 + wallclock.seconds / 60 total = total * 1.30 # in this case we only want to avoid slurm issues so the time is increased by 50% - if format == "hour": + if time_format == "hour": hour = int(total) minute = int((total - int(total)) * 60.0) second = int(((total - int(total)) * 60 - @@ -868,13 +925,13 @@ class Job(object): if previous_status != Status.RUNNING and self.status in [Status.COMPLETED, Status.FAILED, Status.UNKNOWN, Status.RUNNING]: self.write_start_time() - if previous_status == Status.HELD and self.status in [Status.SUBMITTED, Status.QUEUING, Status.RUNNING]: + if previous_status == Status.HELD and self.status in [Status.SUBMITTED, Status.QUEUING, Status.RUNNING]: self.write_submit_time() # Updating logs if self.status in [Status.COMPLETED, Status.FAILED, Status.UNKNOWN]: # New thread, check if file exist expid = copy.deepcopy(self.expid) - platform_name = copy.deepcopy(self.platform_name.lower()) + platform_name = copy.deepcopy(str(self.platform_name).lower()) local_logs = copy.deepcopy(self.local_logs) remote_logs = copy.deepcopy(self.remote_logs) as_conf = AutosubmitConfig( @@ -883,7 +940,7 @@ class Job(object): if as_conf.get_disable_recovery_threads(self.platform.name) == "true": 
self.retrieve_logfiles_unthreaded(copy_remote_logs, local_logs) else: - self.retrieve_logfiles(copy_remote_logs, local_logs, remote_logs, expid, platform_name,fail_count = copy.copy(self.fail_count)) + self.retrieve_logfiles(copy_remote_logs, local_logs, remote_logs, expid, platform_name,fail_count = copy.copy(self.fail_count),job_id=self.id) if self.wrapper_type == "vertical": max_logs = int(as_conf.get_retrials()) for i in range(0,max_logs): @@ -1032,9 +1089,11 @@ class Job(object): self.processors = as_conf.get_processors(self.section) self.threads = as_conf.get_threads(self.section) self.tasks = as_conf.get_tasks(self.section) - self.hyperthreading = as_conf.get_hyperthreading(self.section).lower() + self.nodes = as_conf.get_nodes(self.section) + self.ec_queue = as_conf.get_ec_queue(self) + self.hyperthreading = str(as_conf.get_hyperthreading(self.section)).lower() if self.hyperthreading is 'none': - self.hyperthreading = job_platform.hyperthreading.lower() + self.hyperthreading = str(job_platform.hyperthreading).lower() if self.tasks == '0' and job_platform.processors_per_node: self.tasks = job_platform.processors_per_node @@ -1073,17 +1132,22 @@ class Job(object): parameters['CPUS_PER_TASK'] = self.threads parameters['NUMTASK'] = self.tasks parameters['TASKS'] = self.tasks + parameters['NODES'] = self.nodes parameters['TASKS_PER_NODE'] = self.tasks parameters['WALLCLOCK'] = self.wallclock parameters['TASKTYPE'] = self.section parameters['SCRATCH_FREE_SPACE'] = self.scratch_free_space parameters['CUSTOM_DIRECTIVES'] = self.custom_directives parameters['HYPERTHREADING'] = self.hyperthreading - + # we open the files and offload the whole script as a string + # memory issues if the script is too long? Add a check to avoid problems... + parameters['EXTENDED_HEADER'] = self.read_header_tailer_script(self.ext_header_path, as_conf) + parameters['EXTENDED_TAILER'] = self.read_header_tailer_script(self.ext_tailer_path, as_conf) parameters['CURRENT_ARCH'] = job_platform.name parameters['CURRENT_HOST'] = job_platform.host parameters['CURRENT_QUEUE'] = self.queue + parameters['CURRENT_EC_QUEUE'] = self.ec_queue parameters['CURRENT_USER'] = job_platform.user parameters['CURRENT_PROJ'] = job_platform.project parameters['CURRENT_BUDG'] = job_platform.budget @@ -1169,26 +1233,26 @@ class Job(object): """ parameters = self.parameters try: # issue in tests with project_type variable while using threads - if as_conf.get_project_type().lower() != "none": + if str(as_conf.get_project_type()).lower() != "none": template_file = open(os.path.join( as_conf.get_project_dir(), self.file), 'r') template = '' if as_conf.get_remote_dependencies(): if self.type == Type.BASH: - template = 'sleep 5' + "\n" + template = 'sleep 360' + "\n" elif self.type == Type.PYTHON: - template = 'time.sleep(30)' + "\n" + template = 'time.sleep(5)' + "\n" elif self.type == Type.R: - template = 'Sys.sleep(30)' + "\n" + template = 'Sys.sleep(5)' + "\n" template += template_file.read() template_file.close() else: if self.type == Type.BASH: - template = 'sleep 35' - elif self.type == Type.PYTHON: - template = 'time.sleep(35)' + template = 'sleep 360' + elif self.type == Type.PYTHON or self.type == Type.PYTHON2 or self.type == Type.PYTHON3: + template = 'time.sleep(5)' elif self.type == Type.R: - template = 'Sys.sleep(35)' + template = 'Sys.sleep(5)' else: template = '' except: @@ -1243,7 +1307,8 @@ class Job(object): 'QOSMaxNodePerJobLimit', 'DependencyNeverSatisfied', 'QOSMaxMemoryPerJob', 'QOSMaxMemoryPerNode', 
'QOSMaxMemoryMinutesPerJob', 'QOSMaxNodeMinutesPerJob', 'InactiveLimit', 'JobLaunchFailure', 'NonZeroExitCode', 'PartitionNodeLimit', - 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold']: + 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold', + 'QOSTimeLimit','QOSResourceLimit','QOSJobLimit','InvalidQOS','InvalidAccount']: return True return False except: @@ -1383,7 +1448,7 @@ class Job(object): exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.write_submit_time(self.name, submit=data_time[1], status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, - platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), + platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), children=self.children_names_str) def write_start_time(self, enabled = False): @@ -1412,8 +1477,8 @@ class Job(object): # Writing database exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.write_start_time(self.name, start=start_time, status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, - wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, - platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), + wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, + platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), children=self.children_names_str) return True @@ -1456,7 +1521,7 @@ class Job(object): wrapper_code=get_job_package_code(self.expid, self.name), children=self.children_names_str) # Launch second as threaded function only for slurm - if job_data_dc and type(self.platform) is not str and self.platform.type == "slurm": + if job_data_dc and type(self.platform) is not str and self.platform.type == "slurm": thread_write_finish = Thread(target=ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR).write_platform_data_after_finish, args=(job_data_dc, self.platform)) thread_write_finish.name = "JOB_data_{}".format(self.name) thread_write_finish.start() @@ -1472,7 +1537,7 @@ class Job(object): Writes all data to TOTAL_STATS file :param total_stats: data gathered by the wrapper :type completed: str - """ + """ if first_retrial: self.write_submit_time(enabled=True) path = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS') @@ -1488,12 +1553,12 @@ class Job(object): exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.write_submit_time(self.name, submit=total_stats[0], status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, - platform=self.platform_name, job_id=self.id, 
wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), + platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), children=self.children_names_str) exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) exp_history.write_start_time(self.name, start=total_stats[0], status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, - wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, - platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), + wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, + platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), children=self.children_names_str) exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) @@ -1502,7 +1567,7 @@ class Job(object): platform=self.platform_name, job_id=self.id, out_file=out, err_file=err, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), children=self.children_names_str) # Launch second as threaded function only for slurm - if job_data_dc and type(self.platform) is not str and self.platform.type == "slurm": + if job_data_dc and type(self.platform) is not str and self.platform.type == "slurm": thread_write_finish = Thread(target=ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR).write_platform_data_after_finish, args=(job_data_dc, self.platform)) thread_write_finish.name = "JOB_data_{}".format(self.name) thread_write_finish.start() @@ -1625,7 +1690,8 @@ class WrapperJob(Job): 'QOSMaxNodePerJobLimit', 'DependencyNeverSatisfied', 'QOSMaxMemoryPerJob', 'QOSMaxMemoryPerNode', 'QOSMaxMemoryMinutesPerJob', 'QOSMaxNodeMinutesPerJob', 'InactiveLimit', 'JobLaunchFailure', 'NonZeroExitCode', 'PartitionNodeLimit', - 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold']: + 'PartitionTimeLimit', 'SystemFailure', 'TimeLimit', 'QOSUsageThreshold', + 'QOSTimeLimit','QOSResourceLimit','QOSJobLimit','InvalidQOS','InvalidAccount']: return True return False except: @@ -1714,7 +1780,7 @@ class WrapperJob(Job): self.name, reason), 6009) # while running jobs? 
self._check_running_jobs() - self.update_failed_jobs(canceled_wrapper=True) + self.update_failed_jobs() self.cancel_failed_wrapper_job() return @@ -1726,7 +1792,7 @@ class WrapperJob(Job): for job in self.job_list: job.hold = self.hold job.new_status = Status.QUEUING - job.update_status(self.as_config.get_copy_remote_logs() == 'true') + job.update_status(self.as_config.get_copy_remote_logs() == 'true') Log.info("Job {0} is QUEUING {1}", self.name, reason) else: self.status = Status.HELD @@ -1744,8 +1810,8 @@ class WrapperJob(Job): job.hold = self.hold job.status = self.status if self.status == Status.WAITING: - for job in self.job_list: - job.packed = False + for job2 in self.job_list: + job2.packed = False def _check_inner_job_wallclock(self, job): start_time = self.running_jobs_start[job] diff --git a/autosubmit/job/job_common.py b/autosubmit/job/job_common.py index a4b20ecc4287e86a43c71d4beae437c5ccc43e86..bfbe2cbac97749123c298cb5b3f71a18d9092513 100644 --- a/autosubmit/job/job_common.py +++ b/autosubmit/job/job_common.py @@ -110,19 +110,30 @@ class StatisticsSnippetBash: ################### # Autosubmit header ################### + locale_to_set=$(locale -a | grep ^C.) - if [ -z "$var" ] ; then + if [ -z "$locale_to_set" ] ; then # locale installed... export LC_ALL=$locale_to_set else # locale not installed... - export LC_ALL=C + locale_to_set=$(locale -a | grep ^en_GB.utf8) + if [ -z "$locale_to_set" ] ; then + export LC_ALL=$locale_to_set + else + export LC_ALL=C + fi fi set -xuve job_name_ptrn='%CURRENT_LOGDIR%/%JOBNAME%' echo $(date +%s) > ${job_name_ptrn}_STAT - + + ################### + # Extended header + ################### + %EXTENDED_HEADER% + ################### # Autosubmit job ################### @@ -132,7 +143,11 @@ class StatisticsSnippetBash: @staticmethod def as_tailer(): return textwrap.dedent("""\ - + + ################### + # Extended tailer + ################### + %EXTENDED_TAILER% ################### # Autosubmit tailer ################### @@ -171,9 +186,15 @@ class StatisticsSnippetPython: try: try: locale.setlocale(locale.LC_ALL,'C.utf8') - except: - locale.setlocale(locale.LC_ALL, 'C.UTF-8') - except: + except Exception as e: + try: + locale.setlocale(locale.LC_ALL, 'C.UTF-8') + except Exception as e: + try: + locale.setlocale(locale.LC_ALL, 'en_GB') + except Exception as e: + locale.setlocale(locale.LC_ALL, 'es_ES') + except Exception as e: locale.setlocale(locale.LC_ALL, 'C') job_name_ptrn = '%CURRENT_LOGDIR%/%JOBNAME%' stat_file = open(job_name_ptrn + '_STAT', 'w') @@ -190,7 +211,6 @@ class StatisticsSnippetPython: # expand tailer to use python3 def as_tailer(self): return textwrap.dedent("""\ - ################### # Autosubmit tailer ################### @@ -225,7 +245,7 @@ class StatisticsSnippetR: oldw <- getOption("warn") options( warn = -1 ) leave = F - langs <- c("C.utf8","C.UTF-8","C") + langs <- c("C.utf8","C.UTF-8","C","en_GB","es_ES") i = 1 e="" while (nchar(e) == 0 || leave) diff --git a/autosubmit/job/job_dict.py b/autosubmit/job/job_dict.py index 0b16d29af98b506bed7b98947297a6e946c711dc..f2494ee33699503aebfa9002305a7fcfa862d531 100644 --- a/autosubmit/job/job_dict.py +++ b/autosubmit/job/job_dict.py @@ -126,7 +126,7 @@ class DicJobs: except BaseException as e: raise AutosubmitCritical( "Wrong format for {1} parameter in section {0}".format(section,called_from), 7011, - e.message) + str(e)) pass return parsed_list def read_section(self, section, priority, default_job_type, jobs_data=dict()): @@ -152,11 +152,19 @@ class DicJobs: elif running == 
'date': self._create_jobs_startdate(section, priority, frequency, default_job_type, jobs_data,splits) elif running == 'member': - self._create_jobs_member(section, priority, frequency, default_job_type, jobs_data,splits,self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS")) + self._create_jobs_member(section, priority, frequency, default_job_type, jobs_data,splits, \ + self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS"), \ + self.parse_relation(section,True,self.get_option(section, "INCLUDED_MEMBERS", []),"INCLUDED_MEMBERS")) + elif running == 'chunk': synchronize = self.get_option(section, "SYNCHRONIZE", None) delay = int(self.get_option(section, "DELAY", -1)) - self._create_jobs_chunk(section, priority, frequency, default_job_type, synchronize, delay, splits, jobs_data,excluded_chunks=self.parse_relation(section,False,self.get_option(section, "EXCLUDED_CHUNKS", []),"EXCLUDED_CHUNKS"),excluded_members=self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS")) + self._create_jobs_chunk(section, priority, frequency, default_job_type, synchronize, delay, splits, jobs_data, \ + excluded_chunks=self.parse_relation(section,False,self.get_option(section, "EXCLUDED_CHUNKS", []),"EXCLUDED_CHUNKS"), \ + excluded_members=self.parse_relation(section,True,self.get_option(section, "EXCLUDED_MEMBERS", []),"EXCLUDED_MEMBERS"), \ + included_chunks=self.parse_relation(section,False,self.get_option(section, "INCLUDED_CHUNKS", []),"INCLUDED_CHUNKS"), \ + included_members=self.parse_relation(section,True,self.get_option(section, "INCLUDED_MEMBERS", []),"INCLUDED_MEMBERS")) + pass def _create_jobs_once(self, section, priority, default_job_type, jobs_data=dict(),splits=0): @@ -218,7 +226,7 @@ class DicJobs: - def _create_jobs_member(self, section, priority, frequency, default_job_type, jobs_data=dict(),splits=-1,excluded_members=[]): + def _create_jobs_member(self, section, priority, frequency, default_job_type, jobs_data=dict(),splits=-1,excluded_members=[],included_members=[]): """ Create jobs to be run once per member @@ -242,11 +250,18 @@ class DicJobs: count = 0 if splits > 0: for member in self._member_list: - if self._member_list.index(member) not in excluded_members: - tmp_dic[section][date][member] = [] + if len(included_members) == 0: + if self._member_list.index(member) not in excluded_members: + tmp_dic[section][date][member] = [] + else: + if self._member_list.index(member) in included_members: + tmp_dic[section][date][member] = [] for member in self._member_list: if self._member_list.index(member) in excluded_members: continue + if len(included_members) > 0: + if self._member_list.index(member) not in included_members: + continue count += 1 if count % frequency == 0 or count == len(self._member_list): if splits <= 0: @@ -259,7 +274,7 @@ class DicJobs: - def _create_jobs_chunk(self, section, priority, frequency, default_job_type, synchronize=None, delay=0, splits=0, jobs_data=dict(),excluded_chunks=[],excluded_members=[]): + def _create_jobs_chunk(self, section, priority, frequency, default_job_type, synchronize=None, delay=0, splits=0, jobs_data=dict(),excluded_chunks=[],excluded_members=[],included_chunks=[],included_members=[]): """ Create jobs to be run once per chunk @@ -282,6 +297,9 @@ class DicJobs: for chunk in self._chunk_list: if chunk in excluded_chunks: continue + if len(included_chunks) > 0: + if chunk not in included_chunks: + continue count += 
1 if delay == -1 or delay < chunk: if count % frequency == 0 or count == len(self._chunk_list): @@ -311,6 +329,9 @@ class DicJobs: for date in self._date_list: self._dic[section][date] = dict() for member in self._member_list: + if len(included_members) > 0: + if self._member_list.index(member) not in included_members: + continue if self._member_list.index(member) in excluded_members: continue self._dic[section][date][member] = dict() @@ -318,6 +339,9 @@ class DicJobs: for chunk in self._chunk_list: if chunk in excluded_chunks: continue + if len(included_chunks) > 0: + if chunk not in included_chunks: + continue count += 1 if delay == -1 or delay < chunk: if count % frequency == 0 or count == len(self._chunk_list): @@ -378,9 +402,10 @@ class DicJobs: for d in self._date_list: self._get_date(jobs, dic, d, member, chunk) try: - if type(jobs[0]) is list: - jobs_flattened = [job for jobs_to_flatten in jobs for job in jobs_to_flatten] - jobs = jobs_flattened + if len(jobs) > 0: + if type(jobs[0]) is list: + jobs_flattened = [job for jobs_to_flatten in jobs for job in jobs_to_flatten] + jobs = jobs_flattened except BaseException as e: pass return jobs @@ -470,6 +495,7 @@ class DicJobs: job.platform_name = job.platform_name job.file = self.get_option(section, "FILE", None) job.queue = self.get_option(section, "QUEUE", None) + job.ec_queue = self.get_option(section, "EC_QUEUE", "hpc") job.check = str(self.get_option(section, "CHECK", 'True')).lower() job.export = str(self.get_option(section, "EXPORT", None)) job.processors = str(self.get_option(section, "PROCESSORS", 1)) @@ -493,6 +519,9 @@ class DicJobs: job.running = self.get_option(section, 'RUNNING', 'once').lower() job.x11 = bool(self.get_option(section, 'X11', False )) + job.ext_tailer_path = self.get_option(section, 'EXTENDED_TAILER_PATH', '') + job.ext_header_path = self.get_option(section, 'EXTENDED_HEADER_PATH', '') + if self.get_option(section, "SKIPPABLE", "False").lower() == "true": job.skippable = True else: diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 3d55bb0404a916eb2f7831e416a8956b9d41da5e..f2fbe86d215878668cb8921fe0d540e54f122870 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -215,6 +215,15 @@ class JobList(object): new, notransitive, update_structure=update_structure) for job in self._job_list: job.parameters = parameters + job_data = jobs_data.get(job.name,"none") + try: + if job_data != "none": + job.wrapper_type = job_data[12] + else: + job.wrapper_type = "none" + except BaseException as e: + job.wrapper_type = "none" + # Checking for member constraints if len(run_only_members) > 0: # Found @@ -240,7 +249,7 @@ class JobList(object): else: self._ordered_jobs_by_date_member[wrapper_section] = {} except BaseException as e: - raise AutosubmitCritical("Some section jobs of the wrapper:{0} are not in the current job_list defined in jobs.conf".format(wrapper_section),7014,e.message) + raise AutosubmitCritical("Some section jobs of the wrapper:{0} are not in the current job_list defined in jobs.conf".format(wrapper_section),7014,str(e)) pass @@ -376,6 +385,8 @@ class JobList(object): # Get current job dependency relations. Used for select chunk option. 
This is the job in where select chunks option is defined if len(dependency.select_chunks_orig) > 0: # find chunk relation other_parents = dic_jobs.get_jobs(dependency.section, date, member, None) + jobs_by_section = [p for p in other_parents if p.section == dependency.section] + chunk_relation_indx = 0 while chunk_relation_indx < len(dependency.select_chunks_orig): if job.running in ["once"] or len(dependency.select_chunks_orig[chunk_relation_indx]) == 0 or job.chunk in dependency.select_chunks_orig[chunk_relation_indx]: @@ -396,36 +407,40 @@ class JobList(object): for parent in parents_jobs: # Generic for all dependencies if dependency.delay == -1 or chunk > dependency.delay: - if isinstance(parent, list): - if job.split is not None: - parent = filter( - lambda _parent: _parent.split == job.split, parent)[0] - else: - if dependency.splits is not None: - parent = filter( - lambda _parent: _parent.split in dependency.splits, parent) - #Select chunk + select member - if parent.running in ["once"] or ( len(dependency.select_members_orig) <= 0 and len(dependency.select_chunks_orig) <= 0): - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - elif len(dependency.select_members_orig) > 0: - for relation_indx in member_relations_to_add: - if member_list.index(parent.member) in dependency.select_members_dest[relation_indx] or len(dependency.select_members_dest[relation_indx]) <= 0: - if parent not in visited_parents: - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - other_parents.remove(parent) - visited_parents.add(parent) - elif len(dependency.select_chunks_orig) > 0: + if parent.split is not None and dependency.splits is not None: + if parent.split not in dependency.splits: + continue + #Select chunk + select member + if parent.running in ["once"] or ( len(dependency.select_members_orig) <= 0 and len(dependency.select_chunks_orig) <= 0): + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + elif len(dependency.select_members_orig) > 0: + for relation_indx in member_relations_to_add: + if member_list.index(parent.member) in dependency.select_members_dest[relation_indx] or len(dependency.select_members_dest[relation_indx]) <= 0: + if parent not in visited_parents: + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + other_parents.remove(parent) + visited_parents.add(parent) + elif len(dependency.select_chunks_orig) > 0: + for relation_indx in chunk_relations_to_add: + if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( + dependency.select_chunks_dest[relation_indx]) == 0: + if parent not in visited_parents: + job.add_parent(parent) + JobList._add_edge(graph, job, parent) + other_parents.remove(parent) + visited_parents.add(parent) + # If job doesn't have any parent after a first search, search in all dependency.section. This is to avoid +1 being added only to the last one. 
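The INCLUDED_MEMBERS / INCLUDED_CHUNKS additions in job_dict.py earlier in this patch all follow the same rule: an empty include list means "everything not excluded", a non-empty one acts as an allow-list. A compact sketch of that rule, with hypothetical argument names:

    def is_selected(index, excluded, included):
        """True when an item survives the exclude/include filters used by
        _create_jobs_member and _create_jobs_chunk above."""
        if index in excluded:
            return False
        if included and index not in included:
            return False
        return True

    # e.g. member indices 0..3 with EXCLUDED_MEMBERS=[1] and INCLUDED_MEMBERS=[]
    # keep 0, 2 and 3; with INCLUDED_MEMBERS=[2, 3] only 2 and 3 survive.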
+ if len(job.parents) <= 0: for relation_indx in chunk_relations_to_add: - if parent.chunk in dependency.select_chunks_dest[relation_indx] or len( - dependency.select_chunks_dest[relation_indx]) == 0: - if parent not in visited_parents: - job.add_parent(parent) - JobList._add_edge(graph, job, parent) - other_parents.remove(parent) - visited_parents.add(parent) - + for parent2 in jobs_by_section: + if parent2.chunk in dependency.select_chunks_dest[relation_indx] or len( + dependency.select_chunks_dest[relation_indx]) == 0: + if parent2 not in visited_parents: + job.add_parent(parent2) + JobList._add_edge(graph, job, parent2) + visited_parents.add(parent2) JobList.handle_frequency_interval_dependencies(chunk, chunk_list, date, date_list, dic_jobs, job, member, member_list, dependency.section, graph, other_parents) @@ -1399,11 +1414,11 @@ class JobList(object): self._persistence_file, self._job_list if self.run_members is None or job_list is None else job_list) pass except BaseException as e: - raise AutosubmitError(e.message,6040,"Failure while saving the job_list") + raise AutosubmitError(str(e),6040,"Failure while saving the job_list") except AutosubmitError as e: raise except BaseException as e: - raise AutosubmitError(e.message,6040,"Unknown failure while saving the job_list") + raise AutosubmitError(str(e),6040,"Unknown failure while saving the job_list") def backup_save(self): @@ -2022,7 +2037,7 @@ class JobList(object): # root exists if root is not None: - result += self._recursion_print(root, 0) + result += self._recursion_print(root, 0,[]) else: result += "\nCannot find root." diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py index 54a6268c36981c275fbcd744f34f51f46708eb16..57d20f2750ec4e47d364113507c57f0ed29fb793 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -17,17 +17,17 @@ # You should have received a copy of the GNU General Public License # along with Autosubmit. If not, see . 
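The job_packager.py hunk that follows stops counting raw list entries and instead deduplicates running and queuing jobs by their scheduler id, so the inner jobs of a wrapper consume a single slot. A minimal sketch of the total-jobs part of that accounting, assuming `jobs_list` exposes the getters used in the diff (the function name is hypothetical):

    def submission_budget(platform, jobs_list):
        # Count distinct scheduler ids, not list entries: a wrapper of N
        # inner jobs shares one id and therefore uses one slot.
        queued_by_id = {job.id: job for job in jobs_list.get_queuing(platform)}
        running_by_id = {job.id: job for job in jobs_list.get_running(platform)}
        budget = platform.total_jobs - len(queued_by_id) - len(running_by_id)
        return budget if budget > 0 else 0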
-from log.log import Log, AutosubmitCritical, AutosubmitError -from autosubmit.job.job_common import Status, Type +import operator from bscearth.utils.date import sum_str_hours -from autosubmit.job.job_packages import JobPackageSimple, JobPackageVertical, JobPackageHorizontal, \ - JobPackageSimpleWrapped, JobPackageHorizontalVertical, JobPackageVerticalHorizontal, JobPackageBase -from operator import attrgetter from math import ceil -import operator -from collections import defaultdict +from operator import attrgetter from typing import List +from autosubmit.job.job_common import Status, Type +from autosubmit.job.job_packages import JobPackageSimple, JobPackageVertical, JobPackageHorizontal, \ + JobPackageSimpleWrapped, JobPackageHorizontalVertical, JobPackageVerticalHorizontal, JobPackageBase +from log.log import Log, AutosubmitCritical + class JobPackager(object): """ @@ -57,7 +57,12 @@ class JobPackager(object): # Submitted + Queuing Jobs for specific Platform queuing_jobs = jobs_list.get_queuing(platform) # We now consider the running jobs count - running_jobs_count = len(jobs_list.get_running(platform)) + running_jobs = jobs_list.get_running(platform) + running_by_id = dict() + for running_job in running_jobs: + running_by_id[running_job.id] = running_job + running_jobs_len = len(running_by_id.keys()) + queued_by_id = dict() for queued_job in queuing_jobs: queued_by_id[queued_job.id] = queued_job @@ -76,42 +81,32 @@ class JobPackager(object): # .total_jobs Maximum number of jobs at the same time self._max_jobs_to_submit = platform.total_jobs - queuing_jobs_len # Substracting running jobs - self._max_jobs_to_submit = self._max_jobs_to_submit - running_jobs_count + self._max_jobs_to_submit = self._max_jobs_to_submit - running_jobs_len self._max_jobs_to_submit = self._max_jobs_to_submit if self._max_jobs_to_submit > 0 else 0 - self.max_jobs = min(self._max_wait_jobs_to_submit, - self._max_jobs_to_submit) + self.max_jobs = min(self._max_wait_jobs_to_submit,self._max_jobs_to_submit) self.wrapper_type["wrapper"] = self._as_config.get_wrapper_type() self.wrapper_policy["wrapper"] = self._as_config.get_wrapper_policy() - self.wrapper_method["wrapper"] = self._as_config.get_wrapper_method().lower() + self.wrapper_method["wrapper"] = str(self._as_config.get_wrapper_method()).lower() self.jobs_in_wrapper["wrapper"] = self._as_config.get_wrapper_jobs() self.extensible_wallclock["wrapper"] = self._as_config.get_extensible_wallclock() if self._as_config.get_wrapper_type() == "multi": for wrapper_section in self._as_config.get_wrapper_multi(): self.wrapper_type[wrapper_section] = self._as_config.get_wrapper_type(wrapper_section) self.wrapper_policy[wrapper_section] = self._as_config.get_wrapper_policy(wrapper_section) - self.wrapper_method[wrapper_section] = self._as_config.get_wrapper_method(wrapper_section).lower() + self.wrapper_method[wrapper_section] = str(self._as_config.get_wrapper_method(wrapper_section)).lower() self.jobs_in_wrapper[wrapper_section] = self._as_config.get_wrapper_jobs(wrapper_section) self.extensible_wallclock[wrapper_section] = int(self._as_config.get_extensible_wallclock(wrapper_section)) self.wrapper_info = [self.wrapper_type,self.wrapper_policy,self.wrapper_method,self.jobs_in_wrapper,self.extensible_wallclock] # to pass to job_packages - - - # True or False - - Log.debug( - "Number of jobs available: {0}", self._max_wait_jobs_to_submit) + Log.debug("Number of jobs available: {0}", self._max_wait_jobs_to_submit) if self.hold: - Log.debug("Number of jobs prepared: {0}", 
len( - jobs_list.get_prepared(platform))) + Log.debug("Number of jobs prepared: {0}", len(jobs_list.get_prepared(platform))) if len(jobs_list.get_prepared(platform)) > 0: - Log.debug("Jobs ready for {0}: {1}", self._platform.name, len( - jobs_list.get_prepared(platform))) + Log.debug("Jobs ready for {0}: {1}", self._platform.name, len(jobs_list.get_prepared(platform))) else: - Log.debug("Number of jobs ready: {0}", len( - jobs_list.get_ready(platform, hold=False))) + Log.debug("Number of jobs ready: {0}", len(jobs_list.get_ready(platform, hold=False))) if len(jobs_list.get_ready(platform)) > 0: - Log.debug("Jobs ready for {0}: {1}", self._platform.name, len( - jobs_list.get_ready(platform))) + Log.debug("Jobs ready for {0}: {1}", self._platform.name, len(jobs_list.get_ready(platform))) self._maxTotalProcessors = 0 def compute_weight(self, job_list): @@ -164,7 +159,7 @@ class JobPackager(object): jobs_ready = list() if len(self._jobs_list.jobs_to_run_first) > 0: jobs_ready = [job for job in self._jobs_list.jobs_to_run_first if - ( self._platform is None or job.platform.name.lower() == self._platform.name.lower() ) and + ( self._platform is None or str(job.platform.name).lower() == str(self._platform.name).lower() ) and job.status == Status.READY] if len(jobs_ready) == 0: if self.hold: @@ -210,8 +205,7 @@ class JobPackager(object): # Sort by Priority, highest first list_of_available = sorted( available_sorted, key=lambda k: k.priority, reverse=True) - num_jobs_to_submit = min(self._max_wait_jobs_to_submit, len( - jobs_ready), self._max_jobs_to_submit) + num_jobs_to_submit = min(self._max_wait_jobs_to_submit, len(jobs_ready), self._max_jobs_to_submit) # Take the first num_jobs_to_submit from the list of available jobs_to_submit_tmp = list_of_available[0:num_jobs_to_submit] #jobs_to_submit = [ @@ -232,13 +226,17 @@ class JobPackager(object): for wrapper_section in self.jobs_in_wrapper: if "&" in self.jobs_in_wrapper[wrapper_section]: char = "&" - else: - char = " " - for section_inside_wrapper in self.jobs_in_wrapper[wrapper_section].split(char): - if section == section_inside_wrapper: + if section == self.jobs_in_wrapper[wrapper_section]: wrapper_defined = True self.current_wrapper_section = wrapper_section break + else: + char = " " + for section_inside_wrapper in self.jobs_in_wrapper[wrapper_section].split(char): + if section == section_inside_wrapper: + wrapper_defined = True + self.current_wrapper_section = wrapper_section + break if wrapper_defined and self._platform.allow_wrappers and self.wrapper_type[self.current_wrapper_section] in ['horizontal', 'vertical','vertical-horizontal', 'horizontal-vertical'] : # Trying to find the value in jobs_parser, if not, default to an autosubmit_.conf value (Looks first in [wrapper] section) wrapper_limits = dict() @@ -248,6 +246,10 @@ class JobPackager(object): wrapper_limits["max_h"] = self._as_config.get_max_wrapped_jobs_horizontal(self.current_wrapper_section) if wrapper_limits["max"] < wrapper_limits["max_v"] * wrapper_limits["max_h"]: wrapper_limits["max"] = wrapper_limits["max_v"] * wrapper_limits["max_h"] + if wrapper_limits["max_v"] == -1: + wrapper_limits["max_v"] = wrapper_limits["max"] + if wrapper_limits["max_h"] == -1: + wrapper_limits["max_h"] = wrapper_limits["max"] if '&' not in section: if self._as_config.jobs_parser.has_option(section, 'DEPENDENCIES'): dependencies_keys = self._as_config.jobs_parser.get( @@ -435,10 +437,10 @@ class JobPackager(object): wrapper_limits["min_v"], wrapper_limits["min"], len(active_jobs)), 6013) else: - 
message = "Wrapper couldn't be formed under {0} POLICY due minimum limit not being reached: [wrappeable:{4} < defined_min:{5}] [wrappeable_h:{1} < defined_min_h:{2}]|[wrappeable_v:{3} < defined_min_v:{4}] ".format( + message = "Wrapper couldn't be formed under {0} POLICY due minimum limit not being reached: [wrappeable:{5} <= defined_min:{6}] [wrappeable_h:{1} <= defined_min_h:{2}]|[wrappeable_v:{3} <= defined_min_v:{4}] ".format( self.wrapper_policy[self.current_wrapper_section], min_h, - wrapper_limits["min_h"], min_v, wrapper_limits["min_v"], - wrapper_limits["min"], len(active_jobs)) + wrapper_limits["min_h"], min_v, wrapper_limits["min_v"], len(p.jobs), + wrapper_limits["min"]) if hard_deadlock: message += "\nCheck your configuration: The next wrappeable job can't be wrapped until some of inner jobs of current packages finishes which is imposible" if min_v > 1: @@ -479,13 +481,12 @@ class JobPackager(object): if len(active_jobs) > 0: if show_log: Log.printlog( - "Wrapper policy is set to MIXED and there are not enough jobs to form a wrapper.[wrappeable:{4} < defined_min:{5}] [wrappeable_h:{0} < defined_min_h:{1}]|[wrappeable_v:{2} < defined_min_v:{3}] waiting until the wrapper can be formed.".format( + "Wrapper policy is set to MIXED and there are not enough jobs to form a wrapper.[wrappeable:{4} <= defined_min:{5}] [wrappeable_h:{0} <= defined_min_h:{1}]|[wrappeable_v:{2} <= defined_min_v:{3}] waiting until the wrapper can be formed.".format( min_h, wrapper_limits["min_h"], min_v, wrapper_limits["min_v"],wrapper_limits["min"],len(active_jobs)), 6013) else: - message = "Wrapper couldn't be formed under {0} POLICY due minimum limit not being reached: [wrappeable:{4} < defined_min:{5}] [wrappeable_h:{1} < defined_min_h:{2}]|[wrappeable_v:{3} < defined_min_v:{4}] ".format( - self.wrapper_policy[self.current_wrapper_section], min_h, - wrapper_limits["min_h"], min_v, wrapper_limits["min_v"],wrapper_limits["min"],len(active_jobs)) + message = "Wrapper couldn't be formed under {0} POLICY due minimum limit not being reached: [wrappeable:{5} <= defined_min:{6}] [wrappeable_h:{1} <= defined_min_h:{2}]|[wrappeable_v:{3} <= defined_min_v:{4}] ".format( + self.wrapper_policy[self.current_wrapper_section], min_h,wrapper_limits["min_h"],min_v, wrapper_limits["min_v"], len(p.jobs),wrapper_limits["min"]) if hard_deadlock: message += "\nCheck your configuration: The next wrappeable job can't be wrapped until some of inner jobs of current packages finishes which is imposible" if min_v > 1: @@ -552,7 +553,7 @@ class JobPackager(object): def _build_horizontal_packages(self, section_list, wrapper_limits, section): packages = [] horizontal_packager = JobPackagerHorizontal(section_list, self._platform.max_processors, wrapper_limits, - self.max_jobs, self._platform.processors_per_node, self.wrapper_method[self.current_wrapper_section]) + wrapper_limits["max"], self._platform.processors_per_node, self.wrapper_method[self.current_wrapper_section]) package_jobs = horizontal_packager.build_horizontal_package() @@ -585,11 +586,11 @@ class JobPackager(object): """ packages = [] for job in section_list: - if self.max_jobs > 0: + if wrapper_limits["max"] > 0: if job.packed is False: job.packed = True dict_jobs = self._jobs_list.get_ordered_jobs_by_date_member(self.current_wrapper_section) - job_vertical_packager = JobPackagerVerticalMixed(dict_jobs, job, [job], job.wallclock, self.max_jobs, wrapper_limits, self._platform.max_wallclock) + job_vertical_packager = JobPackagerVerticalMixed(dict_jobs, job, [job], 
job.wallclock, wrapper_limits["max"], wrapper_limits, self._platform.max_wallclock) jobs_list = job_vertical_packager.build_vertical_package(job) packages.append(JobPackageVertical(jobs_list, configuration=self._as_config,wrapper_section=self.current_wrapper_section,wrapper_info=wrapper_info)) @@ -605,7 +606,7 @@ class JobPackager(object): ## READY JOBS ## ## Create the horizontal ## horizontal_packager = JobPackagerHorizontal(jobs_list, self._platform.max_processors, wrapper_limits, - self.max_jobs, self._platform.processors_per_node) + wrapper_limits["max"], self._platform.processors_per_node) if self.wrapper_type[self.current_wrapper_section] == 'vertical-horizontal': return self._build_vertical_horizontal_package(horizontal_packager, jobs_resources) @@ -654,9 +655,10 @@ class JobPackager(object): horizontal_packager.wrapper_limits["max_by_section"][section] = horizontal_packager.wrapper_limits["max_by_section"][section] - 1 horizontal_packager.wrapper_limits["max"] = horizontal_packager.wrapper_limits["max"] - actual_wrapped_jobs for job in horizontal_package: - job_list = JobPackagerVerticalSimple([job], job.wallclock, self.max_jobs, + #jobs_list, total_wallclock, max_jobs, wrapper_limits, max_wallclock, wrapper_info + job_list = JobPackagerVertical([job], job.wallclock, horizontal_packager.wrapper_limits["max"], horizontal_packager.wrapper_limits, - self._platform.max_wallclock).build_vertical_package(job) + self._platform.max_wallclock,self.wrapper_info).build_vertical_package(job) current_package.append(job_list) @@ -706,7 +708,7 @@ class JobPackagerVertical(object): :rtype: List() of Job Object \n """ # self.jobs_list starts as only 1 member, but wrapped jobs are added in the recursion - if len(self.jobs_list) >= self.max_jobs or len(self.jobs_list) >= self.wrapper_limits["max_v"] or len(self.jobs_list) >= self.wrapper_limits["max_by_section"][job.section] or len(self.jobs_list) >= self.wrapper_limits["max"]: + if len(self.jobs_list) >= self.wrapper_limits["max_v"] or len(self.jobs_list) >= self.wrapper_limits["max_by_section"][job.section] or len(self.jobs_list) >= self.wrapper_limits["max"]: return self.jobs_list child = self.get_wrappable_child(job) # If not None, it is wrappable @@ -897,7 +899,7 @@ class JobPackagerHorizontal(object): for section in jobs_by_section: current_package_by_section[section] = 0 for job in jobs_by_section[section]: - if self.max_jobs > 0 and len(current_package) < self.wrapper_limits["max_h"] and len(current_package) < self.wrapper_limits["max"] and current_package_by_section[section] < self.wrapper_limits["max_by_section"][section]: + if len(current_package) < self.wrapper_limits["max_h"] and len(current_package) < self.wrapper_limits["max"] and current_package_by_section[section] < self.wrapper_limits["max_by_section"][section]: if int(job.tasks) != 0 and int(job.tasks) != int(self.processors_node) and \ int(job.tasks) < job.total_processors: nodes = int( diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 52afa70cc54f543914bcdf005886a1ec1653bde7..6174ffc693ea62ebe20008eed2b62bdb0eff670e 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -25,16 +25,16 @@ except ImportError: from ConfigParser import SafeConfigParser import os +import random +import time from datetime import timedelta -import time -import random from autosubmit.job.job_common import Status -from log.log import Log,AutosubmitCritical,AutosubmitError +from log.log import Log, AutosubmitCritical + 
Log.get_logger("Autosubmit") -from autosubmit.job.job_exceptions import WrongTemplateException from autosubmit.job.job import Job -from bscearth.utils.date import sum_str_hours,date2str +from bscearth.utils.date import sum_str_hours from threading import Thread, Lock from typing import List import multiprocessing @@ -98,13 +98,12 @@ class JobPackageBase(object): @threaded def check_scripts(self,jobs,configuration, parameters,only_generate,hold): for job in jobs: - if job.check.lower() == Job.CHECK_ON_SUBMISSION.lower(): + if str(job.check).lower() == str(Job.CHECK_ON_SUBMISSION).lower(): if only_generate: - exit = True break if not os.path.exists(os.path.join(configuration.get_project_dir(), job.file)): lock.acquire() - if configuration.get_project_type().lower() != "none": + if str(configuration.get_project_type()).lower() != "none": raise AutosubmitCritical( "Template [ {0} ] using CHECK=On_submission has some empty variable {0}".format( job.name), 7014) @@ -114,9 +113,9 @@ class JobPackageBase(object): Log.warning("On submission script has some empty variables") else: Log.result("Script {0} OK", job.name) - lock.acquire() - job.update_parameters(configuration, parameters) - lock.release() + # lock.acquire() + # job.update_parameters(configuration, parameters) + # lock.release() # looking for directives on jobs self._custom_directives = self._custom_directives | set(job.custom_directives) @threaded @@ -148,32 +147,49 @@ class JobPackageBase(object): thread_number = thread_number * 5 chunksize = int((len(self.jobs) + thread_number - 1) / thread_number) try: - if len(self.jobs) < thread_number: - for job in self.jobs: - if job.check.lower() == Job.CHECK_ON_SUBMISSION.lower(): - if only_generate: - exit=True - break + # get one job of each section jobs by section + if only_generate: + if hasattr(configuration, 'current_wrapper_section'): + sections = configuration.get_wrapper_jobs(self.current_wrapper_section) + if "&" in sections: + sections.split("&") + elif " " in sections: + sections.split(" ") + else: + sections = [sections] + else: + sections = [self.jobs[0].section] + for section in sections: + if str(configuration._jobs_parser.get_option(section, "CHECK", 'True')).lower() == str(Job.CHECK_ON_SUBMISSION).lower(): + exit = True + if not exit: + if len(self.jobs) < thread_number: + for job in self.jobs: if not os.path.exists(os.path.join(configuration.get_project_dir(), job.file)): - if configuration.get_project_type().lower() != "none": + if str(configuration.get_project_type()).lower() != "none": raise AutosubmitCritical("Template [ {0} ] using CHECK=On_submission has some empty variable {0}".format(job.name),7014) if not job.check_script(configuration, parameters,show_logs=job.check_warnings): Log.warning("Script {0} check failed",job.name) Log.warning("On submission script has some empty variables") else: Log.result("Script {0} OK",job.name) - job.update_parameters(configuration, parameters) - # looking for directives on jobs - self._custom_directives = self._custom_directives | set(job.custom_directives) + # called inside check_script + #job.update_parameters(configuration, parameters) + # looking for directives on jobs + self._custom_directives = self._custom_directives | set(job.custom_directives) + else: + Lhandle = list() + for i in xrange(0, len(self.jobs), chunksize): + Lhandle.append(self.check_scripts(self.jobs[i:i + chunksize], configuration, parameters, only_generate, hold)) + for dataThread in Lhandle: + dataThread.join() + except BaseException as e: + original = e + if 
not exit: + raise AutosubmitCritical( + "Error on {1}, template [{0}] still does not exists in running time(check=on_submission actived)\n{2} ".format(self.jobs[0].file,self.jobs[0].name,e), 7014) else: - Lhandle = list() - for i in xrange(0, len(self.jobs), chunksize): - Lhandle.append(self.check_scripts(self.jobs[i:i + chunksize], configuration, parameters, only_generate, hold)) - for dataThread in Lhandle: - dataThread.join() - except BaseException as e: #should be IOERROR - raise AutosubmitCritical( - "Error on {1}, template [{0}] still does not exists in running time(check=on_submission actived) ".format(job.file,job.name), 7014) + raise AutosubmitCritical(original,7014) Log.debug("Creating Scripts") if not exit: if len(self.jobs) < thread_number: @@ -427,7 +443,7 @@ class JobPackageThread(JobPackageBase): def _send_files(self): Log.debug("Check remote dir") - self.platform.check_remote_log_dir() + #self.platform.check_remote_log_dir() compress_type = "w" output_filepath = '{0}.tar'.format("wrapper_scripts") if callable(getattr(self.platform, 'remove_multiple_files')): @@ -638,7 +654,7 @@ class JobPackageVertical(JobPackageThread): num_processors=self._num_processors, jobs_scripts=self._jobs_scripts, dependency=self._job_dependency, jobs_resources=self._jobs_resources, expid=self._expid, rootdir=self.platform.root_dir, - directives=self._custom_directives,threads=self._threads,method=self.method.lower(),retrials=self.inner_retrials, wallclock_by_level=wallclock_by_level) + directives=self._custom_directives,threads=self._threads,method=str(self.method).lower(),retrials=self.inner_retrials, wallclock_by_level=wallclock_by_level) class JobPackageHorizontal(JobPackageThread): @@ -663,12 +679,15 @@ class JobPackageHorizontal(JobPackageThread): self._jobs_resources = jobs_resources def _common_script_content(self): + fail_count = 0 + if len(self.jobs) > 0: + fail_count = self.jobs[0].fail_count return self._wrapper_factory.get_wrapper(self._wrapper_factory.horizontal_wrapper, name=self._name, queue=self._queue, project=self._project, wallclock=self._wallclock, num_processors=self._num_processors, jobs_scripts=self._jobs_scripts, dependency=self._job_dependency, jobs_resources=self._jobs_resources, expid=self._expid, rootdir=self.platform.root_dir, - directives=self._custom_directives,threads=self._threads,method=self.method.lower()) + directives=self._custom_directives,threads=self._threads,method=str(self.method).lower(),fail_count=fail_count) class JobPackageHybrid(JobPackageThread): """ @@ -708,21 +727,27 @@ class JobPackageHybrid(JobPackageThread): class JobPackageVerticalHorizontal(JobPackageHybrid): def _common_script_content(self): + fail_count = 0 + if len(self.jobs) > 0: + fail_count = self.jobs[0].fail_count return self._wrapper_factory.get_wrapper(self._wrapper_factory.hybrid_wrapper_vertical_horizontal, name=self._name, queue=self._queue, project=self._project, wallclock=self._wallclock, num_processors=self._num_processors, jobs_scripts=self._jobs_scripts, dependency=self._job_dependency, jobs_resources=self._jobs_resources, expid=self._expid, - rootdir=self.platform.root_dir, directives=self._custom_directives,threads=self._threads,method=self.method.lower()) + rootdir=self.platform.root_dir, directives=self._custom_directives,threads=self._threads,method=str(self.method).lower(),fail_count=fail_count) class JobPackageHorizontalVertical(JobPackageHybrid): def _common_script_content(self): + fail_count = 0 + if len(self.jobs) > 0: + fail_count = self.jobs[0].fail_count return 
self._wrapper_factory.get_wrapper(self._wrapper_factory.hybrid_wrapper_horizontal_vertical, name=self._name, queue=self._queue, project=self._project, wallclock=self._wallclock, num_processors=self._num_processors, jobs_scripts=self._jobs_scripts, dependency=self._job_dependency, jobs_resources=self._jobs_resources, expid=self._expid, - rootdir=self.platform.root_dir, directives=self._custom_directives,threads=self._threads,method=self.method.lower()) + rootdir=self.platform.root_dir, directives=self._custom_directives,threads=self._threads,method=str(self.method).lower(),fail_count=fail_count) diff --git a/autosubmit/monitor/diagram.py b/autosubmit/monitor/diagram.py index 8e87531670a58e7e94e1e59cc077592cdd600362..b1f0f67446a94ebb4def35376b75d78a7dcff05f 100644 --- a/autosubmit/monitor/diagram.py +++ b/autosubmit/monitor/diagram.py @@ -66,14 +66,18 @@ def create_bar_diagram(experiment_id, jobs_list, general_stats, output_file, per exp_stats.calculate_statistics() exp_stats.calculate_summary() exp_stats.make_old_format() - failed_jobs_dict = exp_stats.build_failed_jobs_only_list() + failed_jobs_dict = exp_stats.build_failed_jobs_only_list() + # Stats variables definition + normal_plots_count = int(np.ceil(len(exp_stats.jobs_stat) / MAX_JOBS_PER_PLOT)) + failed_jobs_plots_count = int(np.ceil(len(failed_jobs_dict) / MAX_JOBS_PER_PLOT)) except Exception as exp: + if not isinstance(normal_plots_count,int): + normal_plots_count = 0 + if not isinstance(failed_jobs_plots_count,int): + failed_jobs_plots_count = 0 print(exp) print(traceback.format_exc()) - # Stats variables definition - normal_plots_count = int(np.ceil(len(exp_stats.jobs_stat) / MAX_JOBS_PER_PLOT)) - failed_jobs_plots_count = int(np.ceil(len(failed_jobs_dict) / MAX_JOBS_PER_PLOT)) total_plots_count = normal_plots_count + failed_jobs_plots_count # num_plots = norma # ind = np.arange(int(MAX_JOBS_PER_PLOT)) diff --git a/autosubmit/monitor/monitor.py b/autosubmit/monitor/monitor.py index 55c60156a671586685940f989957c4188e650b72..bbfbb10187662f0be1215a1f3381903eb1ce0e65 100644 --- a/autosubmit/monitor/monitor.py +++ b/autosubmit/monitor/monitor.py @@ -153,7 +153,6 @@ class Monitor: for job in joblist: if job.has_parents(): continue - if not groups or job.name not in groups['jobs'] or (job.name in groups['jobs'] and len(groups['jobs'][job.name]) == 1): node_job = pydotplus.Node(job.name, shape='box', style="filled", fillcolor=self.color_status(job.status)) @@ -299,6 +298,7 @@ class Monitor: :param job_list_object: Object that has the main txt generation method :type job_list_object: JobList object """ + error_msg = "" try: Log.info('Plotting...') now = time.localtime() @@ -347,13 +347,14 @@ class Monitor: raise except BaseException as e: try: - e.message += "\n"+e.value - if "GraphViz" in e.message: - e.message= "Graphviz is not installed. Autosubmit need this system package in order to plot the workflow." + if "GraphViz" in str(e): + error_msg="Graphviz is not installed. Autosubmit need this system package in order to plot the workflow." + else: + error_msg = str(e) except: pass - Log.printlog("{0}\nSpecified output doesn't have an available viewer installed or graphviz is not installed. The output was only writted in txt".format(e.message),7014) + Log.printlog("{0}\nSpecified output doesn't have an available viewer installed. 
The output was only written in txt".format(error_msg),7014) def generate_output_txt(self, expid, joblist, path, classictxt=False, job_list_object=None): diff --git a/autosubmit/notifications/mail_notifier.py b/autosubmit/notifications/mail_notifier.py index 53048138fd985a8a715c2b12019797765f158d77..ade31960111e42e5ff2e822d3c727b6488e5a914 100644 --- a/autosubmit/notifications/mail_notifier.py +++ b/autosubmit/notifications/mail_notifier.py @@ -31,6 +31,8 @@ class MailNotifier: message = MIMEText(message_text) message['From'] = email.utils.formataddr(('Autosubmit', self.config.MAIL_FROM)) message['Subject'] = '[Autosubmit] Warning a remote platform is malfunctioning' + message['Date'] = email.utils.formatdate(localtime=True) + for mail in mail_to: message['To'] = email.utils.formataddr((mail, mail)) try: @@ -42,6 +44,7 @@ class MailNotifier: message = MIMEText(message_text) message['From'] = email.utils.formataddr(('Autosubmit', self.config.MAIL_FROM)) message['Subject'] = '[Autosubmit] The job {0} status has changed to {1}'.format(job_name, str(status)) + message['Date'] = email.utils.formatdate(localtime=True) for mail in mail_to: message['To'] = email.utils.formataddr((mail, mail)) try: diff --git a/autosubmit/platforms/ecplatform.py b/autosubmit/platforms/ecplatform.py index e5de0c73307530da5609538b8e7a9ff8f62e786e..58848b9cc3ecdb449203eda79a6fd24948b46cac 100644 --- a/autosubmit/platforms/ecplatform.py +++ b/autosubmit/platforms/ecplatform.py @@ -26,7 +26,7 @@ from autosubmit.platforms.headers.ec_cca_header import EcCcaHeader from autosubmit.platforms.headers.slurm_header import SlurmHeader from autosubmit.platforms.wrappers.wrapper_factory import EcWrapperFactory from time import sleep - +import locale class EcPlatform(ParamikoPlatform): """ Class to manage queues with ecaccess @@ -39,6 +39,8 @@ class EcPlatform(ParamikoPlatform): def __init__(self, expid, name, config, scheduler): ParamikoPlatform.__init__(self, expid, name, config) + #version=scheduler + self.ec_queue = "hpc" if scheduler == 'pbs': self._header = EcCcaHeader() elif scheduler == 'loadleveler': @@ -57,6 +59,21 @@ class EcPlatform(ParamikoPlatform): self._allow_arrays = False self._allow_wrappers = False # TODO self._allow_python_jobs = False + self.root_dir = "" + self.remote_log_dir = "" + self.cancel_cmd = "" + self._checkjob_cmd = "" + self._checkhost_cmd = "" + self._submit_cmd = "" + self._submit_command_name = "" + self.put_cmd = "" + self.get_cmd = "" + self.del_cmd = "" + self.mkdir_cmd = "" + self.check_remote_permissions_cmd = "" + self.check_remote_permissions_remove_cmd = "" + + self.update_cmds() def update_cmds(self): @@ -65,10 +82,11 @@ class EcPlatform(ParamikoPlatform): """ self.root_dir = os.path.join(self.scratch, self.project, self.user, self.expid) self.remote_log_dir = os.path.join(self.root_dir, "LOG_" + self.expid) - self.cancel_cmd = "eceaccess-job-delete" + self.cancel_cmd = "ecaccess-job-delete" self._checkjob_cmd = "ecaccess-job-list " self._checkhost_cmd = "ecaccess-certificate-list" - self._submit_cmd = ("ecaccess-job-submit -distant -queueName " + self.host + " " + self.host + ":" + + self._checkvalidcert_cmd = "ecaccess-gateway-connected" + self._submit_cmd = ("ecaccess-job-submit -distant -queueName " + self.ec_queue + " " + self.host + ":" + self.remote_log_dir + "/") self._submit_command_name = "ecaccess-job-submit" self.put_cmd = "ecaccess-file-put" @@ -77,8 +95,8 @@ class EcPlatform(ParamikoPlatform): self.mkdir_cmd = ("ecaccess-file-mkdir " + self.host + ":" + self.scratch + "/" + 
self.project + "/" + self.user + "/" + self.expid + "; " + "ecaccess-file-mkdir " + self.host + ":" + self.remote_log_dir) - self.check_remote_permissions_cmd = "ecaccess-file-mkdir " + os.path.join(self.scratch,self.project,self.user,"_permission_checker_azxbyc") - self.check_remote_permissions_remove_cmd = "ecaccess-file-rmdir " + os.path.join(self.scratch,self.project,self.user,"_permission_checker_azxbyc") + self.check_remote_permissions_cmd = "ecaccess-file-mkdir " + self.host+":"+os.path.join(self.scratch,self.project,self.user,"_permission_checker_azxbyc") + self.check_remote_permissions_remove_cmd = "ecaccess-file-rmdir " + self.host+":"+os.path.join(self.scratch,self.project,self.user,"_permission_checker_azxbyc") def get_checkhost_cmd(self): return self._checkhost_cmd @@ -88,6 +106,9 @@ class EcPlatform(ParamikoPlatform): def get_mkdir_cmd(self): return self.mkdir_cmd + def set_submit_cmd(self,ec_queue="hpc"): + self._submit_cmd = ("ecaccess-job-submit -distant -queueName " + ec_queue + " " + self.host + ":" + + self.remote_log_dir + "/") def parse_job_output(self, output): job_state = output.split('\n') @@ -113,6 +134,7 @@ class EcPlatform(ParamikoPlatform): return self._checkjob_cmd + str(job_id) def get_submit_cmd(self, job_script, job, hold=False, export=""): + self.set_submit_cmd(job.ec_queue) if export == "none" or export == "None" or export is None or export == "": export = "" else: @@ -126,7 +148,16 @@ class EcPlatform(ParamikoPlatform): :return: True :rtype: bool """ - self.connected = True + output = subprocess.check_output(self._checkvalidcert_cmd, shell=True).decode(locale.getlocale()[1]) + if not output: + output = "" + try: + if output.lower().find("yes") != -1: + self.connected = True + else: + self.connected = False + except: + self.connected = False def restore_connection(self): """ In this case, it does nothing because connection is established for each command @@ -134,7 +165,16 @@ class EcPlatform(ParamikoPlatform): :return: True :rtype: bool """ - self.connected = True + output = subprocess.check_output(self._checkvalidcert_cmd, shell=True).decode(locale.getlocale()[1]) + if not output: + output = "" + try: + if output.lower().find("yes") != -1: + self.connected = True + else: + self.connected = False + except: + self.connected = False def test_connection(self): """ In this case, it does nothing because connection is established for each command @@ -142,33 +182,51 @@ class EcPlatform(ParamikoPlatform): :return: True :rtype: bool """ - self.connected = True + output = subprocess.check_output(self._checkvalidcert_cmd, shell=True).decode(locale.getlocale()[1]) + if not output: + output = "" + try: + if output.lower().find("yes") != -1: + self.connected = True + return "OK" + else: + self.connected = False + return "Invalid certificate" + except: + self.connected = False + return "Invalid certificate" def check_remote_permissions(self): try: try: - output = subprocess.check_output(self.check_remote_permissions_remove_cmd, shell=True) - except: + subprocess.check_output(self.check_remote_permissions_remove_cmd, shell=False) + except Exception as e: pass - output = subprocess.check_output(self.check_remote_permissions_cmd, shell=True) - pass - output = subprocess.check_output(self.check_remote_permissions_remove_cmd, shell=True) + subprocess.check_output(self.check_remote_permissions_cmd, shell=True) + subprocess.check_output(self.check_remote_permissions_remove_cmd, shell=True) + self.check_remote_log_dir() + return True - except: + except Exception as e: return 
False def send_command(self, command, ignore_log=False, x11 = False): try: output = subprocess.check_output(command, shell=True) except subprocess.CalledProcessError as e: + if command.find("ecaccess-job-submit") != -1: + raise AutosubmitError("bad parameters. Error submitting job.") if not ignore_log: raise AutosubmitError('Could not execute command {0} on {1}'.format(e.cmd, self.host),7500,e.message) return False - self._ssh_output = output + if output.startswith(self.bashrc_output): + self._ssh_output = output[len(self.bashrc_output):] + else: + self._ssh_output = output return True def send_file(self, filename, check=True): - self.check_remote_log_dir() + #self.check_remote_log_dir() self.delete_file(filename) command = '{0} {1} {3}:{2}'.format(self.put_cmd, os.path.join(self.tmp_path, filename), os.path.join(self.get_files_path(), filename), self.host) diff --git a/autosubmit/platforms/headers/pjm_header.py b/autosubmit/platforms/headers/pjm_header.py new file mode 100644 index 0000000000000000000000000000000000000000..886ccdc95bb7c303f95846905f7826042169fba1 --- /dev/null +++ b/autosubmit/platforms/headers/pjm_header.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 + +# Copyright 2023 Earth Sciences Department, BSC-CNS + +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . + +import textwrap + + +class PJMHeader(object): + """Class to handle the PJM headers of a job""" + + # noinspection PyMethodMayBeStatic,PyUnusedLocal + def get_queue_directive(self, job): + """ + Returns queue directive for the specified job + + :param job: job to create queue directive for + :type job: Job + :return: queue directive + :rtype: str + """ + # There is no queue, so directive is empty + if job.parameters['CURRENT_QUEUE'] == '': + return "" + else: + return "PJM -L rscgrp={0}".format(job.parameters['CURRENT_QUEUE']) + + + # noinspection PyMethodMayBeStatic,PyUnusedLocal + def get_account_directive(self, job): + """ + Returns account directive for the specified job + + :param job: job to create account directive for + :type job: Job + :return: account directive + :rtype: str + """ + # wallet,account group_name. 
source: nkl.cc.u-tokyo.ac.jp + if job.parameters['CURRENT_PROJ'] != '': + return "PJM -g {0}".format(job.parameters['CURRENT_PROJ']) + return "" + + def get_nodes_directive(self, job): + """ + Returns nodes directive for the specified job + :param job: job to create nodes directive for + :type job: Job + :return: nodes directive + :rtype: str + """ + # There is no account, so directive is empty + nodes = job.parameters.get('NODES',"") + if nodes != '': + return "PJM -L node={0}".format(nodes) + return "" + # noinspection PyMethodMayBeStatic,PyUnusedLocal + def get_memory_directive(self, job): + """ + Returns memory directive for the specified job + + :param job: job to create memory directive for + :type job: Job + :return: memory directive + :rtype: str + """ + if job.parameters['MEMORY'] != '': + return "PJM --node-mem={0}".format(job.parameters['MEMORY']) + return "" + + # noinspection PyMethodMayBeStatic,PyUnusedLocal + def get_memory_per_task_directive(self, job): + """ + Returns memory per task directive for the specified job + + :param job: job to create memory per task directive for + :type job: Job + :return: memory per task directive + :rtype: str + """ + if job.parameters['MEMORY_PER_TASK'] != '': + return "PJM --core-mem={0}".format(job.parameters['MEMORY_PER_TASK']) + return "" + + # noinspection PyMethodMayBeStatic,PyUnusedLocal + def get_custom_directives(self, job): + """ + Returns custom directives for the specified job + + :param job: job to create custom directive for + :type job: Job + :return: custom directives + :rtype: str + """ + # There is no custom directives, so directive is empty + if job.parameters['CUSTOM_DIRECTIVES'] != '': + return '\n'.join(str(s) for s in job.parameters['CUSTOM_DIRECTIVES']) + return "" + + + + def get_tasks_per_node(self, job): + """ + Returns memory per task directive for the specified job + + :param job: job to create tasks per node directive for + :type job: Job + :return: tasks per node directive + :rtype: str + """ + if int(job.parameters['TASKS']) > 1: + return "max-proc-per-node={0}".format(job.parameters['TASKS']) + return "" + + SERIAL = textwrap.dedent("""\ +############################################################################### +# %TASKTYPE% %EXPID% EXPERIMENT +############################################################################### +# +#PJM -N %JOBNAME% +#PJM -L elapse=%WALLCLOCK%:00 +#%QUEUE_DIRECTIVE% +#%ACCOUNT_DIRECTIVE% +#%MEMORY_DIRECTIVE% +%CUSTOM_DIRECTIVES% +#PJM -o %CURRENT_SCRATCH_DIR%/%CURRENT_PROJ%/%CURRENT_USER%/%EXPID%/LOG_%EXPID%/%OUT_LOG_DIRECTIVE% +#PJM -e %CURRENT_SCRATCH_DIR%/%CURRENT_PROJ%/%CURRENT_USER%/%EXPID%/LOG_%EXPID%/%ERR_LOG_DIRECTIVE% +#%X11% +# +############################################################################### + """) + + PARALLEL = textwrap.dedent("""\ +############################################################################### +# %TASKTYPE% %EXPID% EXPERIMENT +############################################################################### +# +#PJM -N %JOBNAME% +#%NODES_DIRECTIVE% +#PJM --mpi "proc=%NUMPROC%" +#PJM --mpi "%TASKS_PER_NODE_DIRECTIVE%" +#PJM -L elapse=%WALLCLOCK%:00 +#%QUEUE_DIRECTIVE% +#%ACCOUNT_DIRECTIVE% +#%MEMORY_DIRECTIVE% +#%MEMORY_PER_TASK_DIRECTIVE% +#PJM -o %CURRENT_SCRATCH_DIR%/%CURRENT_PROJ%/%CURRENT_USER%/%EXPID%/LOG_%EXPID%/%OUT_LOG_DIRECTIVE% +#PJM -e %CURRENT_SCRATCH_DIR%/%CURRENT_PROJ%/%CURRENT_USER%/%EXPID%/LOG_%EXPID%/%ERR_LOG_DIRECTIVE% +%CUSTOM_DIRECTIVES% +# 
+############################################################################### + """) diff --git a/autosubmit/platforms/headers/slurm_header.py b/autosubmit/platforms/headers/slurm_header.py index c1eb50a1857673d34fb5f34e54ee2adf415c2b12..411f11bef51baef2124f3e44073c0a037e191b77 100644 --- a/autosubmit/platforms/headers/slurm_header.py +++ b/autosubmit/platforms/headers/slurm_header.py @@ -54,6 +54,19 @@ class SlurmHeader(object): return "SBATCH -A {0}".format(job.parameters['CURRENT_PROJ']) return "" + def get_nodes_directive(self, job): + """ + Returns nodes directive for the specified job + :param job: job to create nodes directive for + :type job: Job + :return: nodes directive + :rtype: str + """ + # There is no account, so directive is empty + nodes = job.parameters.get('NODES',"") + if nodes != '': + return "SBATCH -N {0}".format(nodes) + return "" # noinspection PyMethodMayBeStatic,PyUnusedLocal def get_memory_directive(self, job): """ @@ -128,9 +141,9 @@ class SlurmHeader(object): #%QUEUE_DIRECTIVE% #%ACCOUNT_DIRECTIVE% #%MEMORY_DIRECTIVE% - #%THREADS_PER_TASK_DIRECTIVE% #%TASKS_PER_NODE_DIRECTIVE% +#%NODES_DIRECTIVE% #SBATCH -n %NUMPROC% #SBATCH -t %WALLCLOCK%:00 #SBATCH -J %JOBNAME% @@ -152,6 +165,7 @@ class SlurmHeader(object): #%MEMORY_DIRECTIVE% #%MEMORY_PER_TASK_DIRECTIVE% #%THREADS_PER_TASK_DIRECTIVE% +#%NODES_DIRECTIVE% #SBATCH -n %NUMPROC% #%TASKS_PER_NODE_DIRECTIVE% #SBATCH -t %WALLCLOCK%:00 diff --git a/autosubmit/platforms/locplatform.py b/autosubmit/platforms/locplatform.py index 3fe62f5cc80f3ef7101e240c68760103dd84768e..0bee9fb70936d8c931e89bc3c1930a74039e7eb9 100644 --- a/autosubmit/platforms/locplatform.py +++ b/autosubmit/platforms/locplatform.py @@ -83,7 +83,7 @@ class LocalPlatform(ParamikoPlatform): def get_submit_cmd(self, job_script, job, hold=False, export=""): wallclock = self.parse_time(job.wallclock) - seconds = int(wallclock.days * 86400 + wallclock.seconds + 60) + seconds = int(wallclock.days * 86400 + wallclock.seconds * 60) if export == "none" or export == "None" or export is None or export == "": export = "" else: @@ -107,12 +107,15 @@ class LocalPlatform(ParamikoPlatform): if not ignore_log: Log.error('Could not execute command {0} on {1}'.format(e.cmd, self.host)) return False - Log.debug("Command '{0}': {1}", command, output) - self._ssh_output = output + if output.startswith(self.bashrc_output): + self._ssh_output = output[len(self.bashrc_output):] + else: + self._ssh_output = output + Log.debug("Command '{0}': {1}", command, self._ssh_output) + return True def send_file(self, filename): - self.check_remote_log_dir() self.delete_file(filename,del_cmd=True) command = '{0} {1} {2}'.format(self.put_cmd, os.path.join(self.tmp_path, filename), os.path.join(self.tmp_path, 'LOG_' + self.expid, filename)) @@ -147,6 +150,7 @@ class LocalPlatform(ParamikoPlatform): return True def check_remote_permissions(self): + self.check_remote_log_dir() return True # Moves .err .out @@ -167,8 +171,8 @@ class LocalPlatform(ParamikoPlatform): while not file_exist and retries < max_retries: try: file_exist = os.path.isfile(os.path.join(self.get_files_path(),src)) - if not file_exist: # File doesn't exist, retry in sleeptime - Log.debug("{2} File still no exists.. waiting {0}s for a new retry ( retries left: {1})", sleeptime, + if not file_exist: # File doesn't exist, retry in sleep-time + Log.debug("{2} File does not exist.. 
waiting {0}s for a new retry (retries left: {1})", sleeptime, max_retries - retries, remote_path) if not wrapper_failed: sleep(sleeptime) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 43adfd5c6c0cc183073c26835ed1522816e3ab09..0f8aa39bace3b13cf1ba56a95d4ac632a0fe7878 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -35,6 +35,7 @@ class ParamikoPlatform(Platform): """ Platform.__init__(self, expid, name, config) + self._ssh_output_err = "" self.connected = False self._default_queue = None @@ -106,19 +107,22 @@ class ParamikoPlatform(Platform): except BaseException as e: message = e.message if message.find("t accept remote connections") == -1: - transport = self._ssh.get_transport() - transport.send_ignore() + try: + transport = self._ssh.get_transport() + transport.send_ignore() + except: + message = "Timeout connection" return message except EOFError as e: self.connected = False raise AutosubmitError("[{0}] not alive. Host: {1}".format( - self.name, self.host), 6002, e.message) + self.name, self.host), 6002, str(e)) except (AutosubmitError,AutosubmitCritical,IOError): self.connected = False raise except BaseException as e: self.connected = False - raise AutosubmitCritical(message,7051) + raise AutosubmitCritical(str(e),7051) #raise AutosubmitError("[{0}] connection failed for host: {1}".format(self.name, self.host), 6002, e.message) def restore_connection(self): @@ -128,15 +132,13 @@ class ParamikoPlatform(Platform): retry = 0 try: self.connect() - except SSHException as e: - raise except Exception as e: if ',' in self.host: Log.printlog("Connection Failed to {0}, will test another host".format( self.host.split(',')[0]), 6002) else: raise AutosubmitCritical( - "First connection to {0} is failed, check host configuration or try another login node ".format(self.host), 7050,e.message) + "First connection to {0} is failed, check host configuration or try another login node ".format(self.host), 7050,str(e)) while self.connected is False and retry < retries: try: self.connect(True) @@ -149,14 +151,14 @@ class ParamikoPlatform(Platform): raise AutosubmitCritical( 'Experiment cant no continue without unexpected behaviour, Stopping Autosubmit', 7050, trace) - except AutosubmitCritical: + except AutosubmitCritical as e: raise except SSHException as e: raise except Exception as e: raise AutosubmitCritical( - 'Cant connect to this platform due an unknown error', 7050, e.message) - + 'Cant connect to this platform due an unknown error', 7050, str(e)) + def threaded(fn): def wrapper(*args, **kwargs): thread = Thread(target=fn, args=args, kwargs=kwargs) @@ -194,39 +196,50 @@ class ParamikoPlatform(Platform): 0] if 'identityfile' in self._host_config: self._host_config_id = self._host_config['identityfile'] - + #pkey = paramiko.Ed25519Key.from_private_key_file(self._host_config_id[0]) + port = int(self._host_config.get('port',22)) if 'proxycommand' in self._host_config: self._proxy = paramiko.ProxyCommand( self._host_config['proxycommand']) - self._ssh.connect(self._host_config['hostname'], 22, username=self.user, - key_filename=self._host_config_id, sock=self._proxy, timeout=120 , banner_timeout=120) + try: + self._ssh.connect(self._host_config['hostname'], port, username=self.user, + key_filename=self._host_config_id, sock=self._proxy, timeout=120 , banner_timeout=120) + except Exception as e: + self._ssh.connect(self._host_config['hostname'], port, username=self.user, + 
key_filename=self._host_config_id, sock=self._proxy, timeout=120, + banner_timeout=120,disabled_algorithms={'pubkeys': ['rsa-sha2-256', 'rsa-sha2-512']}) else: - self._ssh.connect(self._host_config['hostname'], 22, username=self.user, - key_filename=self._host_config_id, timeout=120 , banner_timeout=120) + try: + self._ssh.connect(self._host_config['hostname'], port, username=self.user, + key_filename=self._host_config_id, timeout=60 , banner_timeout=60) + except Exception as e: + self._ssh.connect(self._host_config['hostname'], port, username=self.user, + key_filename=self._host_config_id, timeout=60 , banner_timeout=60,disabled_algorithms={'pubkeys': ['rsa-sha2-256', 'rsa-sha2-512']}) self.transport = self._ssh.get_transport() - #self.transport = paramiko.Transport((self._host_config['hostname'], 22)) - #self.transport.connect(username=self.user) - window_size = pow(4, 12) # about ~16MB chunks - max_packet_size = pow(4, 12) - #self._ftpChannel = self._ssh.open_sftp() - self._ftpChannel = paramiko.SFTPClient.from_transport(self.transport,window_size=window_size,max_packet_size=max_packet_size) + self.transport.banner_timeout = 60 + self.transport.set_keepalive(120) + + self._ftpChannel = paramiko.SFTPClient.from_transport(self.transport,window_size=pow(4, 12) ,max_packet_size=pow(4, 12) ) + self._ftpChannel.get_channel().settimeout(120) self.connected = True except SSHException as e: raise except IOError as e: - if "refused" in e.strerror.lower(): + if "refused" in str(e.strerror).lower(): raise SSHException(" {0} doesn't accept remote connections. Check if there is an typo in the hostname".format(self.host)) - elif "name or service not known" in e.strerror.lower(): + elif "name or service not known" in str(e.strerror).lower(): raise SSHException(" {0} doesn't accept remote connections. Check if there is an typo in the hostname".format(self.host)) else: - raise AutosubmitError("File can't be located due an slow connection", 6016, e.message) + raise AutosubmitError("File can't be located due an slow or timeout connection", 6016, str(e)) except BaseException as e: + if "Garbage packet received" in str(e): + Log.error("Couldn't connect to ftp channel due to the stdout given by the {0}:~/.bashrc\nCheck {0}:~/.bashrc for commands that could give output or error".format(self.host)) self.connected = False - if "Authentication failed." in e.message: + if "Authentication failed." 
in str(e): raise AutosubmitCritical("Authentication Failed, please check the platform.conf of {0}".format( - self._host_config['hostname']), 7050, e.message) + self._host_config['hostname']), 7050, str(e)) if not reconnect and "," in self._host_config['hostname']: - self.restore_connection(reconnect=True) + self.restore_connection() else: raise AutosubmitError( "Couldn't establish a connection to the specified host, wrong configuration?", 6003, e.message) @@ -273,7 +286,7 @@ class ParamikoPlatform(Platform): """ if check: - self.check_remote_log_dir() + #self.check_remote_log_dir() self.delete_file(filename) try: local_path = os.path.join(os.path.join(self.tmp_path, filename)) @@ -283,8 +296,8 @@ class ParamikoPlatform(Platform): self._ftpChannel.chmod(remote_path, os.stat(local_path).st_mode) return True except IOError as e: - raise AutosubmitError('Can not send file {0} to {1}'.format(os.path.join( - self.tmp_path, filename)), os.path.join(self.get_files_path(), filename), 6004, e.message) + + raise AutosubmitError('Can not send file {0} to {1}'.format(os.path.join(self.tmp_path, filename), os.path.join(self.get_files_path(), filename)), 6004, str(e)) except BaseException as e: raise AutosubmitError( 'Send file failed. Connection seems to no be active', 6004) @@ -358,7 +371,7 @@ class ParamikoPlatform(Platform): except BaseException as e: Log.error('Could not remove file {0} due a wrong configuration'.format( os.path.join(self.get_files_path(), filename))) - if e.message.lower().find("garbage") != -1: + if str(e).lower().find("garbage") != -1: raise AutosubmitCritical( "Wrong User or invalid .ssh/config. Or invalid user in platform.conf or public key not set ", 7051, e.message) @@ -452,17 +465,20 @@ class ParamikoPlatform(Platform): """ raise NotImplementedError - def check_job(self, job, default_status=Status.COMPLETED, retries=5, submit_hold_check=False): + def check_job(self, job, default_status=Status.COMPLETED, retries=5, submit_hold_check=False, is_wrapper=False): """ Checks job running status :param retries: retries :param job: job + :type job: autosubmit.job.job.Job + :param default_status: default status if job is not found :type job: class(job) :param default_status: status to assign if it can be retrieved from the platform :type default_status: autosubmit.job.job_common.Status :return: current job status :rtype: autosubmit.job.job_common.Status + """ job_id = job.id job_status = Status.UNKNOWN @@ -491,19 +507,26 @@ class ParamikoPlatform(Platform): job_status = Status.COMPLETED elif job_status in self.job_status['RUNNING']: job_status = Status.RUNNING - if job.status != Status.RUNNING: - job.start_time = datetime.datetime.now() # URi: start time - if job.start_time is not None and str(job.wrapper_type).lower() == "none": - wallclock = job.wallclock - if job.wallclock == "00:00": - wallclock == job.platform.max_wallclock - if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": - if job.is_over_wallclock(job.start_time,wallclock): - try: - job.platform.get_completed_files(job.name) - job_status = job.check_completion(over_wallclock=True) - except: - job_status = Status.FAILED + if not is_wrapper: + if job.status != Status.RUNNING: + job.start_time = datetime.datetime.now() # URi: start time + if job.start_time is not None and str(job.wrapper_type).lower() == "none": + wallclock = job.wallclock + if job.wallclock == "00:00": + wallclock = job.platform.max_wallclock + if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": + if job.is_over_wallclock(job.start_time,wallclock): +
try: + job.platform.get_completed_files(job.name) + job_status = job.check_completion(over_wallclock=True) + except: + job_status = Status.FAILED + if job_status == Status.FAILED: + try: + job.platform.send_command( + job.platform.cancel_cmd + " " + str(job.id)) + except: + pass elif job_status in self.job_status['QUEUING'] and job.hold is False: job_status = Status.QUEUING elif job_status in self.job_status['QUEUING'] and job.hold is True: @@ -528,8 +551,28 @@ class ParamikoPlatform(Platform): if job not in ssh_output: return False return True + def parse_joblist(self, job_list): + """ + Convert a list of job_list to job_list_cmd + :param job_list: list of jobs + :type job_list: list + :param ssh_output: ssh output + :type ssh_output: str + :return: job status + :rtype: str + """ + job_list_cmd = "" + for job in job_list: + if job.id is None: + job_str = "0" + else: + job_str = str(job.id) + job_list_cmd += job_str+"," + if job_list_cmd[-1] == ",": + job_list_cmd=job_list_cmd[:-1] - def check_Alljobs(self, job_list, job_list_cmd, remote_logs, retries=5): + return job_list_cmd + def check_Alljobs(self, job_list, as_conf, retries=5): """ Checks jobs running status @@ -541,20 +584,35 @@ class ParamikoPlatform(Platform): :return: current job status :rtype: autosubmit.job.job_common.Status """ - if job_list_cmd[-1] == ",": - job_list_cmd=job_list_cmd[:-1] + job_status = Status.UNKNOWN + remote_logs = as_conf.get_copy_remote_logs() + job_list_cmd = self.parse_joblist(job_list) cmd = self.get_checkAlljobs_cmd(job_list_cmd) sleep_time = 5 sleep(sleep_time) - self.send_command(cmd) - while not self._check_jobid_in_queue(self.get_ssh_output(), job_list_cmd) and retries > 0: + slurm_error = False + e_msg = "" + try: self.send_command(cmd) - Log.debug('Retrying check job command: {0}', cmd) - Log.debug('retries left {0}', retries) - Log.debug('Will be retrying in {0} seconds', sleep_time) - retries -= 1 - sleep(sleep_time) - sleep_time = sleep_time + 5 + except AutosubmitError as e: + e_msg = e.trace+" "+e.message + slurm_error = True + if not slurm_error: + while not self._check_jobid_in_queue(self.get_ssh_output(), job_list_cmd) and retries > 0: + try: + self.send_command(cmd) + except AutosubmitError as e: + e_msg = e.trace + " " + e.message + slurm_error = True + break + Log.debug('Retrying check job command: {0}', cmd) + Log.debug('retries left {0}', retries) + Log.debug('Will be retrying in {0} seconds', sleep_time) + retries -= 1 + sleep(sleep_time) + sleep_time = sleep_time + 5 + if slurm_error: + raise AutosubmitError("Remote pooling failed with error:{0}\n Resetting platforms connections...".format(e_msg)) job_list_status = self.get_ssh_output() if retries >= 0: Log.debug('Successful check job command') @@ -575,6 +633,27 @@ class ParamikoPlatform(Platform): sleep(sleep_time) sleep_time = sleep_time + 5 # URi: define status list in HPC Queue Class + + if job.status != Status.RUNNING: + job.start_time = datetime.datetime.now() # URi: start time + if job.start_time is not None and str(job.wrapper_type).lower() == "none": + wallclock = job.wallclock + if job.wallclock == "00:00": + wallclock == job.platform.max_wallclock + if wallclock != "00:00" and wallclock != "00:00:00" and wallclock != "": + if job.is_over_wallclock(job.start_time,wallclock): + try: + Log.debug("Job {0} is over wallclock, checking completion".format(job.id)) + job.platform.get_completed_files(job.name) + job_status = job.check_completion(over_wallclock=True) + if job_status is Status.FAILED: + try: + if 
job.platform.cancel_cmd is not None: + job.platform.send_command(job.platform.cancel_cmd + " " + str(job.id)) + except: + pass + except: + job_status = Status.FAILED if job_status in self.job_status['COMPLETED']: job_status = Status.COMPLETED elif job_status in self.job_status['RUNNING']: @@ -591,43 +670,12 @@ class ParamikoPlatform(Platform): elif retries == 0: job_status = Status.COMPLETED job.update_status(remote_logs) - else: job_status = Status.UNKNOWN Log.error( 'check_job() The job id ({0}) status is {1}.', job_id, job_status) job.new_status = job_status - reason = str() - if self.type == 'slurm' and len(in_queue_jobs) > 0: - cmd = self.get_queue_status_cmd(list_queue_jobid) - self.send_command(cmd) - queue_status = self._ssh_output - for job in in_queue_jobs: - reason = self.parse_queue_reason(queue_status, job.id) - if job.queuing_reason_cancel(reason): - Log.error( - "Job {0} will be cancelled and set to FAILED as it was queuing due to {1}", job.name, reason) - self.send_command( - self.platform.cancel_cmd + " {0}".format(job.id)) - job.new_status = Status.FAILED - job.update_status(remote_logs) - return - elif reason == '(JobHeldUser)': - job.new_status = Status.HELD - if not job.hold: - # SHOULD BE MORE CLASS (GET_scontrol realease but not sure if this can be implemented on others PLATFORMS - self.send_command("scontrol release {0}".format(job.id)) - job.new_status = Status.QUEUING # If it was HELD and was released, it should be QUEUING next. - else: - pass - # This shouldn't happen anymore TODO delete - elif reason == '(JobHeldAdmin)': - Log.debug( - "Job {0} Failed to be HELD, canceling... ", job.name) - job.new_status = Status.WAITING - job.platform.send_command( - job.platform.cancel_cmd + " {0}".format(job.id)) - + self.get_queue_status(in_queue_jobs,list_queue_jobid,as_conf) else: for job in job_list: job_status = Status.UNKNOWN @@ -636,7 +684,9 @@ class ParamikoPlatform(Platform): raise AutosubmitError("Some Jobs are in Unknown status", 6008) # job.new_status=job_status - def get_jobid_by_jobname(self,job_name,retries=2): + + + def get_jobid_by_jobname(self,job_name,retries=2,minutes="5"): """ Get job id by job name :param retries: retries @@ -644,25 +694,24 @@ class ParamikoPlatform(Platform): :return: job id """ #sleep(5) - cmd = self.get_jobid_by_jobname_cmd(job_name) - self.send_command(cmd) - job_id_name = self.get_ssh_output() - while len(job_id_name) <= 0 and retries > 0: + cmd = self.get_jobid_by_jobname_cmd(job_name,minutes) + if cmd != "" and cmd is not None: self.send_command(cmd) - job_id_name = self.get_ssh_output() - retries -= 1 - sleep(2) - if retries >= 0: - #get id last line - job_ids_names = job_id_name.split('\n')[1:-1] - #get all ids by jobname - job_ids = [job_id.split(',')[0] for job_id in job_ids_names] - return job_ids - - - - - + job_id_name = "" + while len(job_id_name) <= 0 and retries > 0: + job_id_name = self.get_ssh_output() + if len(job_id_name) <= 0: + self.send_command(cmd) + else: + break + retries -= 1 + if retries >= 0: + #get id last line + job_ids_names = job_id_name.split('\n') + #get all ids by jobname + job_ids = [job_id.split(' ')[0] for job_id in job_ids_names if job_id != ""] + return job_ids + return [] def get_checkjob_cmd(self, job_id): @@ -695,7 +744,22 @@ class ParamikoPlatform(Platform): while session.recv_stderr_ready(): sys.stderr.write(session.recv_stderr(4096)) - def x11_handler(self, channel, (src_addr, src_port)): + def get_jobid_by_jobname_cmd(self, job_name): + """ + Returns command to get job id by job name on 
remote platforms + :param job_name: + :return: str + """ + return NotImplementedError + + def get_queue_status_cmd(self, job_name): + """ + Returns command to get queue status on remote platforms + :return: str + """ + return NotImplementedError + + def x11_handler(self, channel, xxx_todo_changeme): '''handler for incoming x11 connections for each x11 incoming connection, - get a connection to the local display @@ -812,7 +876,6 @@ class ParamikoPlatform(Platform): :return: True if executed, False if failed :rtype: bool """ - if "rsync" in command or "find" in command or "convertLink" in command: timeout = None # infinite timeout on migrate command elif "rm" in command: @@ -859,8 +922,6 @@ class ParamikoPlatform(Platform): if not x11: stdout.close() stderr.close() - - self._ssh_output = "" self._ssh_output_err = "" for s in stdout_chunks: @@ -868,23 +929,24 @@ class ParamikoPlatform(Platform): self._ssh_output += s for errorLineCase in stderr_readlines: self._ssh_output_err += errorLineCase - - for errorLineCase in stderr_readlines: - errorLine = errorLineCase.lower() - if "not active" in errorLine: - raise AutosubmitError( - 'SSH Session not active, will restart the platforms', 6005) - if errorLine.find("command not found") != -1: - raise AutosubmitCritical("scheduler is not installed.",7052,self._ssh_output_err) - elif errorLine.find("refused") != -1 or errorLine.find("slurm_persist_conn_open_without_init") != -1 or errorLine.find("slurmdbd") != -1 or errorLine.find("submission failed") != -1 or errorLine.find("git clone") != -1 or errorLine.find("sbatch: error: ") != -1 or errorLine.find("not submitted") != -1 or errorLine.find("invalid") != -1: - if (self._submit_command_name == "sbatch" and (errorLine.find("policy") != -1 or errorLine.find("invalid") != -1) ) or (self._submit_command_name == "sbatch" and errorLine.find("argument") != -1) or (self._submit_command_name == "bsub" and errorLine.find("job not submitted") != -1) or self._submit_command_name == "ecaccess-job-submit" or self._submit_command_name == "qsub ": - raise AutosubmitError(errorLine, 7014, "Bad Parameters.") - raise AutosubmitError('Command {0} in {1} warning: {2}'.format(command, self.host,self._ssh_output_err, 6005)) + # if self._bashrc_output matchs the start of self.ssh_output, then strip it from self.ssh_output + if self._ssh_output.startswith(self.bashrc_output): + self._ssh_output = self._ssh_output[len(self.bashrc_output):] + if self._ssh_output_err.startswith(self.bashrc_err): + self._ssh_output_err = self._ssh_output_err[len(self.bashrc_err):] + if "not active" in self._ssh_output_err: + raise AutosubmitError( + 'SSH Session not active, will restart the platforms', 6005) + if self._ssh_output_err.find("command not found") != -1: + raise AutosubmitCritical("scheduler is not installed.",7052,self._ssh_output_err) + elif self._ssh_output_err.find("refused") != -1 or self._ssh_output_err.find("slurm_persist_conn_open_without_init") != -1 or self._ssh_output_err.find("slurmdbd") != -1 or self._ssh_output_err.find("submission failed") != -1 or self._ssh_output_err.find("git clone") != -1 or self._ssh_output_err.find("sbatch: error: ") != -1 or self._ssh_output_err.find("not submitted") != -1 or self._ssh_output_err.find("invalid") != -1: + if (self._submit_command_name == "sbatch" and (self._ssh_output_err.find("policy") != -1 or self._ssh_output_err.find("invalid") != -1) ) or (self._submit_command_name == "sbatch" and self._ssh_output_err.find("argument") != -1) or (self._submit_command_name == "bsub" and 
self._ssh_output_err.find("job not submitted") != -1) or self._submit_command_name == "ecaccess-job-submit" or self._submit_command_name == "qsub ": + raise AutosubmitError(self._ssh_output_err, 7014, "Bad Parameters.") + raise AutosubmitError('Command {0} in {1} warning: {2}'.format(command, self.host,self._ssh_output_err, 6005)) if not ignore_log: - if len(stderr_readlines) > 0: - Log.printlog('Command {0} in {1} warning: {2}'.format( - command, self.host, '\n'.join(stderr_readlines)), 6006) + if self._ssh_output_err != '': + Log.printlog('Command {0} in {1} warning: {2}'.format(command, self.host, self._ssh_output_err), 6006) else: pass #Log.debug('Command {0} in {1} successful with out message: {2}', command, self.host, self._ssh_output) @@ -897,7 +959,7 @@ class ParamikoPlatform(Platform): except AutosubmitError as e: raise except IOError as e: - raise AutosubmitError(e.message,6016) + raise AutosubmitError("IO issues, something seems wrong with {0}".format(self.name),6016,e.message) except BaseException as e: raise AutosubmitError('Command {0} in {1} warning: {2}'.format( command, self.host, '\n'.join(stderr_readlines)), 6005, e.message) @@ -955,7 +1017,7 @@ class ParamikoPlatform(Platform): """ raise NotImplementedError - def parse_queue_reason(self, output): + def parse_queue_reason(self, output, job_id): raise NotImplementedError def get_ssh_output(self): @@ -1081,6 +1143,9 @@ class ParamikoPlatform(Platform): if hasattr(self.header, 'get_account_directive'): header = header.replace( '%ACCOUNT_DIRECTIVE%', self.header.get_account_directive(job)) + if hasattr(self.header, 'get_nodes_directive'): + header = header.replace( + '%NODES_DIRECTIVE%', self.header.get_nodes_directive(job)) if hasattr(self.header, 'get_memory_directive'): header = header.replace( '%MEMORY_DIRECTIVE%', self.header.get_memory_directive(job)) @@ -1118,7 +1183,7 @@ class ParamikoPlatform(Platform): def check_tmp_exists(self): try: if self.send_command("ls {0}".format(self.temp_dir)): - if "no such file or directory" in self.get_ssh_output_err().lower(): + if "no such file or directory" in str(self.get_ssh_output_err()).lower(): return False else: return True @@ -1136,15 +1201,18 @@ class ParamikoPlatform(Platform): pass self._ftpChannel.mkdir(path) self._ftpChannel.rmdir(path) + self.check_remote_log_dir() return True except: return False - + + + + def check_remote_log_dir(self): """ Creates log dir on remote host """ - if self.type == "slurm": try: # Test if remote_path exists diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index c597274f77e3bca06f345eec497cac4d6bda9722..d4df1c036832b486720fcb665c76988d01bbba7d 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -20,7 +20,8 @@ import os -from log.log import Log + +from log.log import Log,AutosubmitCritical,AutosubmitError from autosubmit.config.basicConfig import BasicConfig from autosubmit.config.config_common import AutosubmitConfig from submitter import Submitter @@ -30,6 +31,7 @@ from autosubmit.platforms.pbsplatform import PBSPlatform from autosubmit.platforms.sgeplatform import SgePlatform from autosubmit.platforms.ecplatform import EcPlatform from autosubmit.platforms.slurmplatform import SlurmPlatform +from autosubmit.platforms.pjmplatform import PJMPlatform from autosubmit.platforms.locplatform import LocalPlatform from autosubmit.platforms.paramiko_platform import ParamikoPlatformException @@ -72,7 +74,7 @@ class ParamikoSubmitter(Submitter): 
:return: platforms used by the experiment :rtype: dict """ - + raise_message="" platforms_used = list() hpcarch = asconf.get_platform() platforms_used.append(hpcarch) @@ -133,6 +135,9 @@ class ParamikoSubmitter(Submitter): elif platform_type == 'slurm': remote_platform = SlurmPlatform( asconf.expid, section.lower(), BasicConfig) + elif platform_type == 'pjm': + remote_platform = PJMPlatform( + asconf.expid, section, BasicConfig) else: raise Exception( "Queue type not specified on platform {0}".format(section)) @@ -159,8 +164,10 @@ class ParamikoSubmitter(Submitter): asconf.get_max_processors()) remote_platform.max_waiting_jobs = int(parser.get_option(section, 'MAX_WAITING_JOBS', asconf.get_max_waiting_jobs())) - remote_platform.total_jobs = int(parser.get_option(section, 'TOTAL_JOBS', + totaljobs = int(parser.get_option(section, 'TOTALJOBS', asconf.get_total_jobs())) + total_jobs = int(parser.get_option(section, 'TOTAL_JOBS', asconf.get_total_jobs())) + remote_platform.total_jobs = min(min(totaljobs, total_jobs),asconf.get_total_jobs()) remote_platform.hyperthreading = parser.get_option(section, 'HYPERTHREADING', 'false').lower() remote_platform.project = parser.get_option( @@ -178,28 +185,40 @@ class ParamikoSubmitter(Submitter): section, 'TEMP_DIR', None) remote_platform._default_queue = parser.get_option( section, 'QUEUE', None) + + remote_platform.ec_queue = parser.get_option(section,'EC_QUEUE', "hpc") remote_platform._serial_queue = parser.get_option( section, 'SERIAL_QUEUE', None) remote_platform.processors_per_node = parser.get_option(section, 'PROCESSORS_PER_NODE', None) remote_platform.custom_directives = parser.get_option(section, 'CUSTOM_DIRECTIVES', None) - Log.debug("Custom directives from platform.conf: {0}".format( - remote_platform.custom_directives)) + + if remote_platform.custom_directives is not None and remote_platform.custom_directives != '' and remote_platform.custom_directives != 'None': + Log.debug("Custom directives from platform.conf: {0}".format( + remote_platform.custom_directives)) remote_platform.scratch_free_space = parser.get_option(section, 'SCRATCH_FREE_SPACE', None) - remote_platform.root_dir = os.path.join(remote_platform.scratch, remote_platform.project, - remote_platform.user, remote_platform.expid) + try: + remote_platform.root_dir = os.path.join(remote_platform.scratch, remote_platform.project, + remote_platform.user, remote_platform.expid) + remote_platform.update_cmds() + platforms[section.lower()] = remote_platform + + except: + raise_message = "Error in platform.conf: SCRATCH_DIR, PROJECT, USER, EXPID must be defined for platform {0}".format(section) # Executes update_cmds() from corresponding Platform Object - remote_platform.update_cmds() # Save platform into result dictionary - platforms[section.lower()] = remote_platform for section in parser.sections(): # if this section is included in platforms if parser.has_option(section, 'SERIAL_PLATFORM'): platforms[section.lower()].serial_platform = platforms[parser.get_option(section, 'SERIAL_PLATFORM', - None).lower()] + None)] + if platforms[section.lower()].serial_platform is not None: + platforms[section.lower()].serial_platform = platforms[section.lower()].serial_platform.lower() self.platforms = platforms + if raise_message != "": + raise AutosubmitError(raise_message) diff --git a/autosubmit/platforms/pbsplatform.py b/autosubmit/platforms/pbsplatform.py index 961bb453477f4dfdd2b9b637e592ce1174bf25ae..0898563953f466dc14a5154d461c4f5f2148016b 100644 --- a/autosubmit/platforms/pbsplatform.py +++ 
b/autosubmit/platforms/pbsplatform.py @@ -18,6 +18,7 @@ # along with Autosubmit. If not, see . import os +from log.log import Log, AutosubmitCritical, AutosubmitError from autosubmit.platforms.paramiko_platform import ParamikoPlatform from log.log import Log @@ -49,7 +50,7 @@ class PBSPlatform(ParamikoPlatform): self._header = Pbs12Header() else: Log.error('PBS version {0} not supported'.format(version)) - raise HPCPlatformException('PBS version {0} not supported'.format(version)) + raise AutosubmitError('PBS version {0} not supported'.format(version)) self.job_status = dict() self.job_status['COMPLETED'] = ['F', 'E', 'c', 'C'] diff --git a/autosubmit/platforms/pjmplatform.py b/autosubmit/platforms/pjmplatform.py new file mode 100644 index 0000000000000000000000000000000000000000..6ba39d4f83e0556cc481a905bcf94b935db86007 --- /dev/null +++ b/autosubmit/platforms/pjmplatform.py @@ -0,0 +1,505 @@ +#!/usr/bin/env python3 + +# Copyright 2017-2020 Earth Sciences Department, BSC-CNS + +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . +import locale +import os +from time import sleep +from typing import List, Union + +from autosubmit.job.job_common import Status +from autosubmit.job.job_exceptions import WrongTemplateException +from autosubmit.platforms.paramiko_platform import ParamikoPlatform +from autosubmit.platforms.headers.pjm_header import PJMHeader +from autosubmit.platforms.wrappers.wrapper_factory import PJMWrapperFactory +from log.log import AutosubmitCritical, AutosubmitError, Log + +class PJMPlatform(ParamikoPlatform): + """ + Class to manage jobs to host using PJM scheduler + + :param expid: experiment's identifier + :type expid: str + """ + + + def __init__(self, expid, name, config): + ParamikoPlatform.__init__(self, expid, name, config) + self.mkdir_cmd = None + self.get_cmd = None + self.put_cmd = None + self._submit_hold_cmd = None + self._submit_command_name = None + self._submit_cmd = None + self._checkhost_cmd = None + self.cancel_cmd = None + self._header = PJMHeader() + self._wrapper = PJMWrapperFactory(self) + #https://software.fujitsu.com/jp/manual/manualfiles/m220008/j2ul2452/02enz007/j2ul-2452-02enz0.pdf page 16 + self.job_status = dict() + self.job_status['COMPLETED'] = ['EXT'] + self.job_status['RUNNING'] = ['RNO','RNE','RUN'] + self.job_status['QUEUING'] = ['ACC','QUE', 'RNA', 'RNP','HLD'] # TODO NOT SURE ABOUT HOLD HLD + self.job_status['FAILED'] = ['ERR','CCL','RJT'] + self._pathdir = "\$HOME/LOG_" + self.expid + self._allow_arrays = False + self._allow_wrappers = True # NOT SURE IF WE NEED WRAPPERS + self.update_cmds() + self.config = config + exp_id_path = os.path.join(config.LOCAL_ROOT_DIR, self.expid) + tmp_path = os.path.join(exp_id_path, "tmp") + self._submit_script_path = os.path.join( + tmp_path, config.LOCAL_ASLOG_DIR, "submit_" + self.name + ".sh") + self._submit_script_file = open(self._submit_script_path, 'wb').close() + + def submit_error(self,output): + """ + Check if 
the output of the submit command contains an error message. + :param output: output of the submit cmd + :return: boolean + """ + return not all(part.lower() in output.lower() for part in ["pjsub", "[INFO] PJM 0000"]) + + + + def process_batch_ready_jobs(self,valid_packages_to_submit,failed_packages,error_message="",hold=False): + """ + Retrieve multiple jobs identifiers. + :param valid_packages_to_submit: + :param failed_packages: + :param error_message: + :param hold: + :return: + """ + try: + valid_packages_to_submit = [ package for package in valid_packages_to_submit ] + if len(valid_packages_to_submit) > 0: + try: + jobs_id = self.submit_Script(hold=hold) + except AutosubmitError as e: + jobnames = [job.name for job in valid_packages_to_submit[0].jobs] + for jobname in jobnames: + jobid = self.get_jobid_by_jobname(jobname) + #cancel bad submitted job if jobid is encountered + for id_ in jobid: + self.cancel_job(id_) + jobs_id = None + self.connected = False + if e.trace is not None: + has_trace_bad_parameters = self.submit_error(e.trace) + else: + e.trace = "" + has_trace_bad_parameters = False + if e.message is not None: + has_message_bad_parameters = self.submit_error(e.message) + else: + e.message = "" + has_message_bad_parameters = False + if has_trace_bad_parameters or has_message_bad_parameters or e.message.lower().find("invalid partition") != -1 or e.message.lower().find("invalid qos") != -1 or e.message.lower().find("scheduler is not installed") != -1 or e.message.lower().find("failed") != -1 or e.message.lower().find("not available") != -1: + error_msg = "" + for package_tmp in valid_packages_to_submit: + for job_tmp in package_tmp.jobs: + if job_tmp.section not in error_msg: + error_msg += job_tmp.section + "&" + if has_trace_bad_parameters: + error_message+="Check job and queue specified in jobs.conf. Sections that could be affected: {0}".format(error_msg[:-1]) + else: + error_message+="\ncheck that {1} platform has set the correct scheduler. 
Sections that could be affected: {0}".format( + error_msg[:-1], self.name) + + if e.trace is None: + e.trace = "" + raise AutosubmitCritical(error_message,7014,e.message+"\n"+str(e.trace)) + except IOError as e: + raise AutosubmitError( + "IO issues ", 6016, str(e)) + except BaseException as e: + if str(e).find("scheduler") != -1: + raise AutosubmitCritical("Are you sure that [{0}] scheduler is the correct type for platform [{1}]?\n Please, double check that {0} is loaded for {1} before Autosubmit launches any job.".format(self.type.upper(),self.name.upper()), 7070, str(e)) + raise AutosubmitError( + "Submission failed, this can be due to a failure on the platform", 6015, str(e)) + if jobs_id is None or len(jobs_id) <= 0: + raise AutosubmitError( + "Submission failed, this can be due to a failure on the platform", 6015,"Jobs_id {0}".format(jobs_id)) + i = 0 + if hold: + sleep(10) + + for package in valid_packages_to_submit: + for job in package.jobs: + job.hold = hold + job.id = str(jobs_id[i]) + job.status = Status.SUBMITTED + job.write_submit_time(hold=hold) + i += 1 + save = True + except WrongTemplateException as e: + raise AutosubmitCritical("Invalid parameter substitution in {0} template".format( + e.job_name), 7014, str(e)) + except AutosubmitError as e: + raise + except AutosubmitCritical as e: + raise + except Exception as e: + raise AutosubmitError("{0} submission failed".format(self.name), 6015, str(e)) + return save,valid_packages_to_submit + + def open_submit_script(self): + self._submit_script_file = open(self._submit_script_path, 'wb').close() + self._submit_script_file = open(self._submit_script_path, 'ab') + + def get_submit_script(self): + self._submit_script_file.close() + os.chmod(self._submit_script_path, 0o750) + return os.path.join(self.config.LOCAL_ASLOG_DIR, os.path.basename(self._submit_script_path)) + + def submit_job(self, job, script_name, hold=False, export="none"): + """ + Submit a job from a given job object. + + :param export: + :param job: job object + :type job: autosubmit.job.job.Job + :param script_name: job script's name + :type script_name: str + :param hold: send job hold + :type hold: boolean + :return: job id for the submitted job + :rtype: int + """ + self.get_submit_cmd(script_name, job, hold=hold, export=export) + return None + + + def submit_Script(self, hold=False): + # type: (bool) -> Union[List[str], str] + """ + Sends a submit script file, executes it on the platform and retrieves the job IDs of all jobs at once. 
+ + :param hold: if True, the job will be held + :type hold: bool + :return: job id for submitted jobs + :rtype: list(str) + """ + try: + self.send_file(self.get_submit_script(), False) + cmd = os.path.join(self.get_files_path(), + os.path.basename(self._submit_script_path)) + try: + self.send_command(cmd) + except AutosubmitError as e: + raise + except AutosubmitCritical as e: + raise + except Exception as e: + raise + jobs_id = self.get_submitted_job_id(self.get_ssh_output()) + return jobs_id + except IOError as e: + raise AutosubmitError("Submit script is not found, retry again in next AS iteration", 6008, str(e)) + except AutosubmitError as e: + raise + except AutosubmitCritical as e: + raise + except Exception as e: + raise AutosubmitError("Submit script is not found, retry again in next AS iteration", 6008, str(e)) + def check_remote_log_dir(self): + """ + Creates log dir on remote host + """ + + try: + # Test if remote_path exists + self._ftpChannel.chdir(self.remote_log_dir) + except IOError as e: + try: + if self.send_command(self.get_mkdir_cmd()): + Log.debug('{0} has been created on {1} .', + self.remote_log_dir, self.host) + else: + raise AutosubmitError("SFTP session not active ", 6007, "Could not create the DIR {0} on HPC {1}".format( + self.remote_log_dir, self.host)) + except BaseException as e: + raise AutosubmitError( + "SFTP session not active ", 6007, str(e)) + + def update_cmds(self): + """ + Updates commands for platforms + """ + self.root_dir = os.path.join( + self.scratch, self.project, self.user, self.expid) + self.remote_log_dir = os.path.join(self.root_dir, "LOG_" + self.expid) + self.cancel_cmd = "pjdel " + self._checkhost_cmd = "echo 1" + self._submit_cmd = 'cd {0} ; pjsub '.format(self.remote_log_dir) + self._submit_command_name = "pjsub" + self._submit_hold_cmd = 'cd {0} ; pjsub '.format(self.remote_log_dir) + self.put_cmd = "scp" + self.get_cmd = "scp" + self.mkdir_cmd = "mkdir -p " + self.remote_log_dir + + def hold_job(self, job): + try: + cmd = "pjrls {0} ; sleep 2 ; pjhold -R ASHOLD {0}".format(job.id) + self.send_command(cmd) + job_status = self.check_job(job, submit_hold_check=True) + if job_status == Status.RUNNING: + self.send_command("{0} {1}".format(self.cancel_cmd,job.id)) + return False + elif job_status == Status.FAILED: + return False + cmd = self.get_queue_status_cmd(job.id) + self.send_command(cmd) + except BaseException as e: + try: + self.send_command("{0} {1}".format(self.cancel_cmd,job.id)) + raise AutosubmitError( + "Can't hold jobid:{0}, canceling job".format(job.id), 6000, str(e)) + except BaseException as e: + raise AutosubmitError( + "Can't cancel the jobid: {0}".format(job.id), 6000, str(e)) + except AutosubmitError as e: + raise + + def get_checkhost_cmd(self): + return self._checkhost_cmd + + def get_mkdir_cmd(self): + return self.mkdir_cmd + + def get_remote_log_dir(self): + return self.remote_log_dir + + def parse_job_output(self, output): + return output.strip().split()[0].strip() + + def parse_job_finish_data(self, output, packed): + return 0, 0, 0, 0, 0, 0, dict(), False + + def queuing_reason_cancel(self, reason): + try: + if len(reason.split('(', 1)) > 1: + reason = reason.split('(', 1)[1].split(')')[0] + if 'Invalid' in reason or reason in ['ANOTHER JOB STARTED','DELAY','DEADLINE SCHEDULE STARTED','ELAPSE LIMIT EXCEEDED','FILE IO ERROR','GATE CHECK','IMPOSSIBLE SCHED','INSUFF CPU','INSUFF MEMORY','INSUFF NODE','INSUFF','INTERNAL ERROR','INVALID HOSTFILE','LIMIT OVER MEMORY','LOST 
COMM','NO CURRENT DIR','NOT EXIST','RSCGRP NOT EXIST','RSCGRP STOP','RSCUNIT','USER','EXCEED','WAIT SCHED']: + return True + return False + except Exception as e: + return False + def get_queue_status(self, in_queue_jobs, list_queue_jobid, as_conf): + if not in_queue_jobs: + return + cmd = self.get_queue_status_cmd(list_queue_jobid) + self.send_command(cmd) + queue_status = self._ssh_output + for job in in_queue_jobs: + reason = self.parse_queue_reason(queue_status, job.id) + if job.queuing_reason_cancel(reason): + Log.printlog("Job {0} will be cancelled and set to FAILED as it was queuing due to {1}".format(job.name,reason),6000) + self.send_command(self.cancel_cmd + " {0}".format(job.id)) + job.new_status = Status.FAILED + job.update_status(as_conf) + elif reason.find('ASHOLD') != -1: + job.new_status = Status.HELD + if not job.hold: + self.send_command("{0} {1}".format(self.cancel_cmd,job.id)) + job.new_status = Status.QUEUING # If it was HELD and was released, it should be QUEUING next. + def parse_Alljobs_output(self, output, job_id): + status = "" + try: + status = [x.split()[1] for x in output.splitlines() + if x.split()[0] == str(job_id)] + except BaseException as e: + pass + if len(status) == 0: + return status + return status[0] + + def parse_joblist(self, job_list): + """ + Convert a list of job_list to job_list_cmd + :param job_list: list of jobs + :type job_list: list + :return: job status + :rtype: str + """ + job_list_cmd = "" + for job, job_prev_status in job_list: + if job.id is None: + job_str = "0" + else: + job_str = str(job.id) + job_list_cmd += job_str + "+" + if job_list_cmd[-1] == "+": + job_list_cmd = job_list_cmd[:-1] + + return job_list_cmd + def _check_jobid_in_queue(self, ssh_output, job_list_cmd): + for job in job_list_cmd.split('+'): + if job not in ssh_output: + return False + return True + + def get_submitted_job_id(self, outputlines): + try: + jobs_id = [] + for output in outputlines.splitlines(): + if not self.submit_error(output): + jobs_id.append(int(output.split()[5])) + + return jobs_id + except IndexError: + raise AutosubmitCritical( + "Submission failed. 
There are issues on your config file", 7014) + + def get_submit_cmd(self, job_script, job, hold=False, export=""): + if (export is None or str(export).lower() == "none") or len(export) == 0: + export = "" + else: + export += " ; " + + + try: + lang = locale.getlocale()[1] + if lang is None: + lang = locale.getdefaultlocale()[1] + if lang is None: + lang = 'UTF-8' + if not hold: + self._submit_script_file.write((export + self._submit_cmd + job_script + "\n").encode(lang)) + else: + self._submit_script_file.write((export + self._submit_hold_cmd + job_script + "\n").encode(lang)) + except: + pass + + def get_checkAlljobs_cmd(self, jobs_id): + # jobs_id = "jobid1+jobid2+jobid3" + # -H == sacct + return "pjstat -H -v --choose jid,st,ermsg --filter \"jid={0}\" > as_checkalljobs.txt ; pjstat -v --choose jid,st,ermsg --filter \"jid={0}\" >> as_checkalljobs.txt ; cat as_checkalljobs.txt ; rm as_checkalljobs.txt".format(jobs_id) + def get_checkjob_cmd(self, jobs_id): + # jobs_id = "jobid1+jobid2+jobid3" + # -H == sacct + return self.get_checkAlljobs_cmd(jobs_id) + + def get_queue_status_cmd(self, job_id): + return self.get_checkAlljobs_cmd(job_id) + + def get_jobid_by_jobname_cmd(self, job_name): + return 'pjstat -v --choose jid,st,ermsg --filter \"jnam={0}\"'.format(job_name) + + + def cancel_job(self, job_id): + return '{0} {1}'.format(self.cancel_cmd,job_id) + + #def get_job_energy_cmd(self, job_id): + # return 'sacct -n --jobs {0} -o JobId%25,State,NCPUS,NNodes,Submit,Start,End,ConsumedEnergy,MaxRSS%25,AveRSS%25'.format(job_id) + + def parse_queue_reason(self, output, job_id): + # split() is used to remove the trailing whitespace but also \t and multiple spaces + # split(" ") is not enough + reason = [x.split()[2] for x in output.splitlines() + if x.split()[0] == str(job_id)] + # In case of duplicates.. 
we take the first one + if len(reason) > 0: + return reason[0] + return reason + + @staticmethod + def wrapper_header(filename, queue, project, wallclock, num_procs, dependency, directives, threads, method="asthreads", partition=""): + if method == 'srun': + language = "#!/bin/bash" + return \ + language + """ +############################################################################### +# {0} +############################################################################### +# +#PJM -N {0} +{1} +{8} +#PJM -g {2} +#PJM -o {0}.out +#PJM -e {0}.err +#PJM -elapse {3}:00 +#PJM --mpi "proc=%NUMPROC%" +#PJM --mpi "max-proc-per-node={7}" +{5} +{6} + +# +############################################################################### + """.format(filename, queue, project, wallclock, num_procs, dependency, + '\n'.ljust(13).join(str(s) for s in directives), threads,partition) + else: + language = "#!/usr/bin/env python3" + return \ + language + """ +############################################################################### +# {0} +############################################################################### +# +#PJM -N {0} +{1} +{8} +#PJM -g {2} +#PJM -o {0}.out +#PJM -e {0}.err +#PJM -elapse {3}:00 +#PJM --mpi "proc=%NUMPROC%" +#PJM --mpi "max-proc-per-node={7}" +{5} +{6} +# +############################################################################### + """.format(filename, queue, project, wallclock, num_procs, dependency, + '\n'.ljust(13).join(str(s) for s in directives), threads,partition) + + @staticmethod + def allocated_nodes(): + return """os.system("scontrol show hostnames $SLURM_JOB_NODELIST > {0}".format(node_id))""" + + def check_file_exists(self, filename,wrapper_failed=False): + file_exist = False + sleeptime = 5 + retries = 0 + max_retries = 3 + while not file_exist and retries < max_retries: + try: + # This return IOError if path doesn't exist + self._ftpChannel.stat(os.path.join( + self.get_files_path(), filename)) + file_exist = True + except IOError as e: # File doesn't exist, retry in sleeptime + Log.debug("{2} File does not exist.. 
waiting {0}s for a new retry (retries left: {1})", sleeptime, + max_retries - retries, os.path.join(self.get_files_path(), filename)) + if not wrapper_failed: + sleep(sleeptime) + sleeptime = sleeptime + 5 + retries = retries + 1 + else: + retries = 9999 + except BaseException as e: # Unrecoverable error + if str(e).lower().find("garbage") != -1: + if not wrapper_failed: + sleep(sleeptime) + sleeptime = sleeptime + 5 + retries = retries + 1 + else: + Log.printlog("remote logs {0} couldn't be recovered".format(filename), 6001) + file_exist = False # won't exist + retries = 999 # no more retries + return file_exist diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index c2ccf3575af60222f7fe0dcee31e733f735b193f..09f2d565da0dd98c2f64a30294fdd69703be5ae9 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -26,6 +26,7 @@ class Platform(object): self._serial_queue = None self._default_queue = None self.processors_per_node = None + self.ec_queue = "hpc" self.scratch_free_space = None self.custom_directives = None self.host = '' @@ -48,6 +49,8 @@ class Platform(object): self._allow_arrays = False self._allow_wrappers = False self._allow_python_jobs = True + self.bashrc_output = "" + self.bashrc_err = "" @property def serial_platform(self): @@ -124,6 +127,8 @@ class Platform(object): parameters['{0}ARCH'.format(prefix)] = self.name parameters['{0}HOST'.format(prefix)] = self.host parameters['{0}QUEUE'.format(prefix)] = self.queue + parameters['{0}EC_QUEUE'.format(prefix)] = self.ec_queue + parameters['{0}USER'.format(prefix)] = self.user parameters['{0}PROJ'.format(prefix)] = self.project parameters['{0}BUDG'.format(prefix)] = self.budget @@ -155,7 +160,8 @@ class Platform(object): :type dest: str """ raise NotImplementedError - + def get_jobid_by_jobname_cmd(self, job_name,minutes="5"): + return "" def get_file(self, filename, must_exist=True, relative_path='', ignore_log=False, wrapper_failed=False): """ Copies a file from the current platform to experiment's tmp folder @@ -245,6 +251,17 @@ class Platform(object): return False else: return False + def get_bashrc_output(self): + """ + Checks remote bashrc output/err to strip out any unwanted output + """ + try: + self.send_command("sleep 1") + self.bashrc_output = self.get_ssh_output() + self.bashrc_err = self.get_ssh_output_err() + return True + except: + return False def remove_stat_file(self, job_name): """ @@ -384,7 +401,7 @@ class Platform(object): """ raise NotImplementedError - def check_job(self, jobid, default_status=Status.COMPLETED, retries=5): + def check_job(self, job, default_status=Status.COMPLETED, retries=5, submit_hold_check=False, is_wrapper=False): """ Checks job running status diff --git a/autosubmit/platforms/psplatform.py b/autosubmit/platforms/psplatform.py index aee3e4eb79f4517cc30f65cdc6a2febf2c6f8173..e2c3ede88475a482995ce7b13fe9a1ce8aa46e5f 100644 --- a/autosubmit/platforms/psplatform.py +++ b/autosubmit/platforms/psplatform.py @@ -76,7 +76,7 @@ class PsPlatform(ParamikoPlatform): def get_submit_cmd(self, job_script, job, hold=False, export=""): wallclock = self.parse_time(job.wallclock) - seconds = int(wallclock.days * 86400 + wallclock.seconds + 60) + seconds = int(wallclock.days * 86400 + wallclock.seconds * 60) if export == "none" or export == "None" or export is None or export == "": export = "" else: diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index 
cd96b21cccedef584ab9541dec985b90e9965c51..67c073a75aad7142af1dacb8f95725c5b934a6fd 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -362,8 +362,8 @@ class SlurmPlatform(ParamikoPlatform): return export + self._submit_hold_cmd + job_script else: if not hold: - self._submit_script_file.write( - export + self._submit_cmd + job_script + "\n") + write_this = export + self._submit_cmd + job_script +"\n" + self._submit_script_file.write(write_this) else: self._submit_script_file.write( export + self._submit_hold_cmd + job_script + "\n") @@ -377,8 +377,8 @@ class SlurmPlatform(ParamikoPlatform): def get_queue_status_cmd(self, job_id): return 'squeue -j {0} -o %A,%R'.format(job_id) - def get_jobid_by_jobname_cmd(self, job_name): - return 'squeue -o %A,%.50j -n {0}'.format(job_name) + def get_jobid_by_jobname_cmd(self, job_name,minutes="5"): + return "sacct --name {0} -o JobID -n -X --starttime=$(date -d '{1} minutes ago' +'%Y-%m-%dT%H:%M:%S')".format(job_name,minutes) def cancel_job(self, job_id): @@ -394,6 +394,28 @@ class SlurmPlatform(ParamikoPlatform): return reason[0] return reason + def get_queue_status(self, in_queue_jobs, list_queue_jobid, as_conf): + if not in_queue_jobs: + return + cmd = self.get_queue_status_cmd(list_queue_jobid) + self.send_command(cmd) + queue_status = self._ssh_output + for job in in_queue_jobs: + reason = self.parse_queue_reason(queue_status, job.id) + if job.queuing_reason_cancel(reason): # this should be a platform method to be implemented + Log.error( + "Job {0} will be cancelled and set to FAILED as it was queuing due to {1}", job.name, reason) + self.send_command( + self.cancel_cmd + " {0}".format(job.id)) + job.new_status = Status.FAILED + job.update_status(as_conf) + elif reason == '(JobHeldUser)': + if not job.hold: + # should be self.release_cmd or something like that but it is not implemented + self.send_command("scontrol release {0}".format(job.id)) + job.new_status = Status.QUEUING # If it was HELD and was released, it should be QUEUING next. + else: + job.new_status = Status.HELD @staticmethod def wrapper_header(filename, queue, project, wallclock, num_procs, dependency, directives, threads, method="asthreads"): if method == 'srun': @@ -443,7 +465,7 @@ class SlurmPlatform(ParamikoPlatform): @staticmethod def allocated_nodes(): - return """os.system("scontrol show hostnames $SLURM_JOB_NODELIST > node_list")""" + return """os.system("scontrol show hostnames $SLURM_JOB_NODELIST > {0}".format(node_id))""" def check_file_exists(self, filename,wrapper_failed=False): file_exist = False @@ -456,8 +478,8 @@ class SlurmPlatform(ParamikoPlatform): self._ftpChannel.stat(os.path.join( self.get_files_path(), filename)) file_exist = True - except IOError: # File doesn't exist, retry in sleeptime - Log.debug("{2} File still no exists.. waiting {0}s for a new retry ( retries left: {1})", sleeptime, + except IOError as e: # File doesn't exist, retry in sleeptime + Log.debug("{2} File does not exist.. 
waiting {0}s for a new retry (retries left: {1})", sleeptime, max_retries - retries, os.path.join(self.get_files_path(), filename)) if not wrapper_failed: sleep(sleeptime) @@ -466,7 +488,7 @@ class SlurmPlatform(ParamikoPlatform): else: retries = 9999 except BaseException as e: # Unrecoverable error - if e.message.lower().find("garbage") != -1: + if str(e).lower().find("garbage") != -1: if not wrapper_failed: sleep(sleeptime) sleeptime = sleeptime + 5 diff --git a/autosubmit/platforms/wrappers/wrapper_builder.py b/autosubmit/platforms/wrappers/wrapper_builder.py index 92ac25662b46268a7137a0ddde6b874321599dce..836e86d270fac1f6c10d64c0b70e7c84243c5ea7 100644 --- a/autosubmit/platforms/wrappers/wrapper_builder.py +++ b/autosubmit/platforms/wrappers/wrapper_builder.py @@ -44,8 +44,14 @@ class WrapperDirector: return wrapper_script class WrapperBuilder(object): def __init__(self, **kwargs): + # Vertical wrapper if "retrials" in kwargs.keys(): self.retrials = kwargs['retrials'] + # rest of wrappers + if "fail_count" in kwargs.keys(): + self.fail_count = kwargs['fail_count'] + else: + self.fail_count = 0 self.header_directive = kwargs['header_directive'] self.job_scripts = kwargs['jobs_scripts'] self.threads = kwargs['threads'] @@ -109,7 +115,7 @@ class PythonWrapperBuilder(WrapperBuilder): sample_list = list(sample_str) random.shuffle(sample_list) final_string = ''.join(sample_list) - return final_string+"_FAILED" + return final_string def build_imports(self): return textwrap.dedent(""" @@ -136,7 +142,8 @@ class PythonWrapperBuilder(WrapperBuilder): return getattr(self.stream, attr) sys.stdout = Unbuffered(sys.stdout) - wrapper_id = "{1}" + wrapper_id = "{1}_FAILED" + node_id = "node_list_{1}" # Defining scripts to be run scripts= {0} """).format(str(self.job_scripts), self.get_random_alphanumeric_string(5,5),'\n'.ljust(13)) @@ -148,16 +155,19 @@ class PythonWrapperBuilder(WrapperBuilder): Thread.__init__(self) self.template = template self.id_run = id_run + self.fail_count = {0} def run(self): jobname = self.template.replace('.cmd', '') #os.system("echo $(date +%s) > "+jobname+"_STAT") - out = str(self.template) + ".out." + str(self.retrials) - err = str(self.template) + ".err." + str(self.retrials) + out = str(self.template) + ".out." + str(self.fail_count) + err = str(self.template) + ".err." 
+ str(self.fail_count) print(out+"\\n") command = "./" + str(self.template) + " " + str(self.id_run) + " " + os.getcwd() (self.status) = getstatusoutput(command + " > " + out + " 2> " + err) - """).format('\n'.ljust(13)) + """).format(self.fail_count,'\n'.ljust(13)) + + # hybrids def build_joblist_thread(self): @@ -173,7 +183,7 @@ class PythonWrapperBuilder(WrapperBuilder): {0} os.system("mkdir -p machinefiles") - with open('node_list', 'r') as file: + with open('{{0}}'.format(node_id), 'r') as file: all_nodes = file.read() all_nodes = all_nodes.split("_NEWLINE_") @@ -678,7 +688,7 @@ class SrunWrapperBuilder(WrapperBuilder): {0} os.system("mkdir -p machinefiles") - with open('node_list', 'r') as file: + with open('{{0}}'.format(node_id), 'r') as file: all_nodes = file.read() all_nodes = all_nodes.split("_NEWLINE_") diff --git a/autosubmit/platforms/wrappers/wrapper_factory.py b/autosubmit/platforms/wrappers/wrapper_factory.py index d0690791a6be2a0a77bc05063f6bdc25bb8838ee..707c2502ab09c6e51aa0f7d2c7fa8a1a3820b38e 100644 --- a/autosubmit/platforms/wrappers/wrapper_factory.py +++ b/autosubmit/platforms/wrappers/wrapper_factory.py @@ -141,3 +141,41 @@ class EcWrapperFactory(WrapperFactory): def dependency_directive(self, dependency): return '#PBS -v depend=afterok:{0}'.format(dependency) + +class PJMWrapperFactory(WrapperFactory): + + def vertical_wrapper(self, **kwargs): + return PythonVerticalWrapperBuilder(**kwargs) + + def horizontal_wrapper(self, **kwargs): + + if kwargs["method"] == 'srun': + return SrunHorizontalWrapperBuilder(**kwargs) + else: + return PythonHorizontalWrapperBuilder(**kwargs) + + def hybrid_wrapper_horizontal_vertical(self, **kwargs): + return PythonHorizontalVerticalWrapperBuilder(**kwargs) + + def hybrid_wrapper_vertical_horizontal(self, **kwargs): + if kwargs["method"] == 'srun': + return SrunVerticalHorizontalWrapperBuilder(**kwargs) + else: + return PythonVerticalHorizontalWrapperBuilder(**kwargs) + + def header_directives(self, **kwargs): + return self.platform.wrapper_header(kwargs['name'], kwargs['queue'], kwargs['project'], kwargs['wallclock'], + kwargs['num_processors'], kwargs['dependency'], kwargs['directives'],kwargs['threads'],kwargs['method'],kwargs['partition']) + + def allocated_nodes(self): + return self.platform.allocated_nodes() + + #def dependency_directive(self, dependency): + # # There is no option for afterok in the PJM scheduler, but I think it is not needed. 
+ # return '#PJM --dependency=afterok:{0}'.format(dependency) + + def queue_directive(self, queue): + return '#PJM -L rscgrp={0}'.format(queue) + + def partition_directive(self, partition): + return '#PJM -g {0}'.format(partition) diff --git a/autosubmit/statistics/utils.py b/autosubmit/statistics/utils.py index 46574018751893f30e50a18c131245166bb552ad..765994c9ea51cce38bdd229ad7d1bf2e047d6a92 100644 --- a/autosubmit/statistics/utils.py +++ b/autosubmit/statistics/utils.py @@ -38,8 +38,7 @@ def timedelta2hours(deltatime): def parse_number_processors(processors_str): """ Defaults to 1 in case of error """ - # type: (str) -> int - if ':' in processors_str: + if ':' in processors_str: components = processors_str.split(":") processors = int(sum( [math.ceil(float(x) / 36.0) * 36.0 for x in components])) diff --git a/bin/autosubmit b/bin/autosubmit index 39ba333321118ec241af6e30cc18253323d923ad..8170a19fd3a7759020038be007c340bd5b91a2bb 100755 --- a/bin/autosubmit +++ b/bin/autosubmit @@ -21,6 +21,7 @@ """Script for handling experiment monitoring""" import os import sys +import traceback scriptdir = os.path.abspath(os.path.dirname(sys.argv[0])) sys.path.append(scriptdir) @@ -52,18 +53,19 @@ def main(): if e.trace is not None: Log.error("Trace: {0}", e.trace) Log.critical("{1} [eCode={0}]", e.code, e.message) - Log.info("More info at https://autosubmit.readthedocs.io/en/latest/faq.html") + Log.info("More info at https://autosubmit.readthedocs.io/en/v3.15.0/faq.html") os._exit(1) except Exception as e: if os.path.exists(os.path.join(Log.file_path, "autosubmit.lock")): os.remove(os.path.join(Log.file_path, "autosubmit.lock")) - Log.error("Trace: {0}", str(e.message)) - if "temporarily unavailable" in str(e.message): + Log.error("Trace: {0}", str(e)) + if "temporarily unavailable" in str(e): Log.critical( - "Another instance of autosubmit is running on this experiment. If this is not the case, delete autosubmit.lock", 7000) + "{0}\nAnother instance of autosubmit is running on this experiment. If this is not the case, delete autosubmit.lock".format(str(e)), 7000) else: Log.critical( - "Unhandled error: If you see this message, please report it in Autosubmit's GitLab project") + "{1}{0}\nUnhandled error: If you see this message, please report it in Autosubmit's GitLab project".format(str(e),traceback.format_exc() +), 7000) os._exit(1) diff --git a/docs/build/html/_sources/usage/new_platform.rst.txt b/docs/build/html/_sources/usage/new_platform.rst.txt index 173dafae45e937f818845ae19a69bfa879f841c3..97177806182c76ff7fa060a580c7ed62c2940360 100644 --- a/docs/build/html/_sources/usage/new_platform.rst.txt +++ b/docs/build/html/_sources/usage/new_platform.rst.txt @@ -53,9 +53,9 @@ There are some other parameters that you may need to specify: * TEST_SUITE: if true, autosubmit test command can use this queue as a main queue. Defaults to false -* MAX_WAITING_JOBS: maximum number of jobs to be waiting in this platform. +* MAX_WAITING_JOBS: Maximum number of jobs to be queuing or submitted in this platform. -* TOTAL_JOBS: maximum number of jobs to be running at the same time in this platform. +* TOTAL_JOBS: Maximum number of jobs to be queuing, running or submitted at the same time in this platform. * CUSTOM_DIRECTIVES: Custom directives for the resource manager of this platform.
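As an aside on the ``parse_number_processors`` hunk above: each colon-separated component of the processors string is rounded up to a full 36-core node before the components are summed. A minimal standalone sketch of that behaviour follows; the plain-integer fallback to 1 is assumed from the docstring, since that branch is not shown in the hunk.

.. code-block:: python

    import math

    NODE_SIZE = 36  # cores per node assumed by the helper above

    def parse_number_processors(processors_str):
        """ Defaults to 1 in case of error (fallback branch assumed) """
        if ':' in processors_str:
            # Round every component up to a whole node, then sum the totals.
            components = processors_str.split(":")
            return int(sum(math.ceil(float(x) / NODE_SIZE) * NODE_SIZE
                           for x in components))
        try:
            return int(processors_str)
        except ValueError:
            return 1

    # '20:40:4' -> 36 + 72 + 36 = 144 processors
    print(parse_number_processors('20:40:4'))

Rounding to whole nodes presumably makes the statistics charge a job for the nodes it blocks rather than only the cores it requests.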
diff --git a/docs/source/devguide/variables.rst b/docs/source/devguide/variables.rst index ede2468539e333bf553ea9dce0d83464e198f441..52b0389d4405494dbb35c30ad94e8eaa6f4d7bca 100644 --- a/docs/source/devguide/variables.rst +++ b/docs/source/devguide/variables.rst @@ -30,12 +30,14 @@ This variables are relatives to the current job. - **Chunk_END_MONTH**: chunk's end month - **Chunk_END_DAY**: chunk's end day - **Chunk_END_HOUR**: chunk's end hour +- **STARTDATES**: List of startdates - **PREV**: days since startdate at the chunk's start - **Chunk_FIRST**: True if the current chunk is the first, false otherwise. - **Chunk_LAST**: True if the current chunk is the last, false otherwise. - **NUMPROC**: Number of processors that the job will use. - **NUMTHREADS**: Number of threads that the job will use. - **NUMTASKS**: Number of tasks that the job will use. +- **NODES**: Number of nodes that the job will use. - **HYPERTHREADING**: Detects if hyperthreading is enabled or not. - **WALLCLOCK**: Number of processors that the job will use. - **SCRATCH_FREE_SPACE**: Percentage of free space required on the ``scratch``. diff --git a/docs/source/installation/index.rst b/docs/source/installation/index.rst index 6573f8723cd789594842c6bfebfcd71d76c50e35..157f28ecc61f61768311c3ec90f7cc524e989616 100644 --- a/docs/source/installation/index.rst +++ b/docs/source/installation/index.rst @@ -1,14 +1,17 @@ -######################### -How to Install Autosubmit -######################### +############ +Installation +############ + +How to install +============== The Autosubmit code is maintained in *PyPi*, the main source for python packages. -- Pre-requisites: bash, python2, sqlite3, git-scm > 1.8.2, subversion, dialog, curl, python-tk, python2-dev, graphviz >= 2.41, pip2 +- Pre-requisites: bash, python2, sqlite3, git-scm > 1.8.2, subversion, dialog, curl, python-tk(tkinter in centOS), python2-dev, graphviz >= 2.41, pip2 .. important:: (SYSTEM) Graphviz version must be >= 2.38 except 2.40(not working). You can check the version using dot -v. -- Python dependencies: argparse, python-dateutil, pyparsing, numpy, pydotplus, matplotlib, paramiko, python2-pythondialog, portalocker, requests, typing +- Python dependencies: argparse, python-dateutil, pyparsing, numpy, pydotplus, matplotlib, paramiko, python2-pythondialog, portalocker, requests, typing, six >= 1.10 .. important:: dot -v command should contain "dot",pdf,png,svg,xlib in device section. @@ -35,76 +38,8 @@ or download, unpack and: .. hint:: To see the changelog, use ``autosubmit changelog`` -Examples -======== - -Sequence of instructions to install Autosubmit and its dependencies in Ubuntu. ------------------------------------------------------------------------------- - -.. code-block:: bash - - - # Update repositories - apt update - - # Avoid interactive stuff - export DEBIAN_FRONTEND=noninteractive - - # Dependencies - apt install wget curl python2 python-tk python2-dev graphviz -y -q - - # Additional dependencies related with pycrypto - apt install build-essential libssl-dev libffi-dev -y -q - - # Download get pip script and launch it - wget https://bootstrap.pypa.io/pip/2.7/get-pip.py - python2 get-pip.py - - # Install autosubmit using pip - pip2 install autosubmit - - # Check that we can execute autosubmit commands - autosubmit -h - - # Configure - autosubmit configure - - # Install - autosubmit install - - # Get expid - autosubmit expid -H TEST -d "Test exp." 
- - # Create with -np - # Since it was a new install the expid will be a000 - autosubmit create a000 -np - -Sequence of instructions to install Autosubmit and its dependencies with conda. -------------------------------------------------------------------------------- - -.. code-block:: bash - - # Download conda - wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh./Miniconda3-py39_4.12.0-Linux-x86_64.sh - # Launch it - ./Miniconda3-py39_4.12.0-Linux-x86_64.sh - # Download git - apt install git -y -q - # Download autosubmit - git clone https://earth.bsc.es/gitlab/es/autosubmit.git -b v3.14.0 - cd autosubmit - # Create conda environment - conda env update -f environment.yml -n autosubmit python=2 - # Activate env - source activate autosubmit - # Test autosubmit - autosubmit -v - # Configure autosubmitrc and install database as indicated in this doc - - -################ How to configure -################ +================ After installation, you have to configure database and path for Autosubmit. In order to use the default settings, just create a directory called `autosubmit` in your home directory before running the configure command. @@ -136,9 +71,11 @@ For installing the database for Autosubmit on the configured folder, when no dat autosubmit install -.. important:: Be careful ! autosubmit install will create a blank database. +.. danger:: Be careful ! autosubmit install will create a blank database. + +Lastly, if autosubmit configure doesn't work for you or you need to configure additional info create: -Lastly, if autosubmit configure doesn't work for you or you need to configure additional info create or modify /etc/autosubmitrc file or ~/.autosubmitrc with the information as follows: +Create or modify /etc/autosubmitrc file or ~/.autosubmitrc with the information as follows: .. code-block:: ini @@ -182,4 +119,74 @@ From 3.14+ onwards, autosubmit commands can be tailored to run on specific machi * If no commands are defined, all commands are authorized. * If no machines are defined, all machines are authorized. -Now you are ready to use Autosubmit ! \ No newline at end of file +Now you are ready to use Autosubmit ! + + +Examples +======== + +Sequence of instructions to install Autosubmit and its dependencies in Ubuntu. +------------------------------------------------------------------------------ + +.. code-block:: bash + + + # Update repositories + apt update + + # Avoid interactive stuff + export DEBIAN_FRONTEND=noninteractive + + # Dependencies + apt install wget curl python2 python-tk python2-dev graphviz -y -q + + # Additional dependencies related with pycrypto + apt install build-essential libssl-dev libffi-dev -y -q + + # Download get pip script and launch it + wget https://bootstrap.pypa.io/pip/2.7/get-pip.py + python2 get-pip.py + + # Install autosubmit using pip + pip2 install autosubmit + + # Check that we can execute autosubmit commands + autosubmit -h + + # Configure + autosubmit configure + + # Install + autosubmit install + + # Get expid + autosubmit expid -H local -d "Test exp." + + # Create with -np + # Since it was a new install the expid will be a000 + autosubmit create a000 -np + +Sequence of instructions to install Autosubmit and its dependencies with conda. +------------------------------------------------------------------------------- + +.. 
code-block:: bash + + # Download conda + wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh + # Launch it + chmod +x ./Miniconda3-py39_4.12.0-Linux-x86_64.sh ; ./Miniconda3-py39_4.12.0-Linux-x86_64.sh + # Download git (if it is not already installed) + apt install git -y -q + # Download autosubmit + git clone https://earth.bsc.es/gitlab/es/autosubmit.git -b v3.14.0b + cd autosubmit + # Create conda environment + conda env update -f environment.yml -n autosubmit python=2 + # Activate env + conda activate autosubmit + # Test autosubmit + autosubmit -v + # Configure autosubmitrc and install the database as indicated in the installation instructions above this section + +.. hint:: + After installing conda, you may need to close the terminal and re-open it so the installation takes effect. \ No newline at end of file diff --git a/docs/source/qstartguide/index.rst b/docs/source/qstartguide/index.rst index 6ec8898bdf0ed311e223be49a2cc59bdb85d9f86..b65ec0539b7d913132ab13e749e1df7129a9bcfe 100644 --- a/docs/source/qstartguide/index.rst +++ b/docs/source/qstartguide/index.rst @@ -165,10 +165,13 @@ Now open platforms.conf. Note: This will be an example for marenostrum4 [marenostrum4] # Queue type. Options: ps, SGE, LSF, SLURM, PBS, eceaccess - TYPE = slurm # scheduler type + # scheduler type + TYPE = slurm HOST = mn1.bsc.es,mn2.bsc.es,mn3.bsc.es - PROJECT = bsc32 # <- your project - USER = bsc32070 # <- your user + # <- your project ( usually is the user name without the last 3 digits, however check your hpc) + PROJECT = bsc32 + # <- your user name + USER = bsc32070 SCRATCH_DIR = /gpfs/scratch ADD_PROJECT_TO_HOST = False # use 72:00 if you are using a PRACE account, 48:00 for the bsc account diff --git a/docs/source/troubleshooting/error-codes.rst b/docs/source/troubleshooting/error-codes.rst index c92ba38ada9432a57ca1e66e867831decfcba0bf..ed91549971f7c63a1ab658900476c5663cbf24f7 100644 --- a/docs/source/troubleshooting/error-codes.rst +++ b/docs/source/troubleshooting/error-codes.rst @@ -155,7 +155,9 @@ Minor errors - Error codes [6000+] +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ | 6013 | Configuration issues | Check log output for more info | +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ -| 6014 | Git Can't clone repository submodule | Check submodule url, perform a refresh | +| 6014 | Git Can't clone repository submodule | Check submodule url, perform a refresh | +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ | 6015 | Submission failed | Automatically, if there aren't bigger issues | +------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ +| 6016 | Temporal connection issues | Automatically, if there aren't bigger issues | ++------+------------------------------------------------------+------------------------------------------------------------------------------------------------+ diff --git a/docs/source/userguide/configure/develop_a_project.rst b/docs/source/userguide/configure/develop_a_project.rst index 39960413a3bf43c4f74346c46f032c169b579725..0dda37b3cc52cdee7be26c5c1e649ca0e1cb73ac 100644 --- 
a/docs/source/userguide/configure/develop_a_project.rst +++ b/docs/source/userguide/configure/develop_a_project.rst @@ -1,6 +1,7 @@ .. _develproject: +==================== Developing a project ==================== @@ -8,14 +9,14 @@ This section contains some examples on how to develop a new project. All files, with the exception of user-defined scripts, are located in the ``/conf`` directory. -Configuration files are written in ``ini`` format. Autosubmit supports user-defined scripts are written in ``bash``, ``python``, and ``R``. +Configuration files are written in ``ini`` format. On the other hand, the user-defined scripts are written in ``bash``, ``python`` or ``R`` format. -To configure the experiment, edit ``autosubmit_cxxx.conf``, ``expdef_cxxx.conf``, ``jobs_cxxx.conf`` , ``platforms_cxxx.conf``, and ``proj_cxxx.conf``` in the ``conf`` folder of the experiment. +To configure the experiment, edit ``autosubmit_cxxx.conf``, ``expdef_cxxx.conf``, ``jobs_cxxx.conf`` , ``platforms_cxxx.conf`` and ``proj_cxxx.conf``` in the ``conf`` folder of the experiment. Expdef configuration --------------------- +==================== - ``vi /cxxx/conf/expdef_cxxx.conf`` + vi /cxxx/conf/expdef_cxxx.conf .. code-block:: ini @@ -92,9 +93,9 @@ Expdef configuration FILE_JOBS_CONF = templates/common/jobs.conf Autosubmit configuration ------------------------- +======================== - ``vi /cxxx/conf/autosubmit_cxxx.conf`` + vi /cxxx/conf/autosubmit_cxxx.conf .. code-block:: ini @@ -128,9 +129,9 @@ Autosubmit configuration # [wrappers] Jobs configuration ------------------- +================== - ``vi /cxxx/conf/jobs_cxxx.conf`` + vi /cxxx/conf/jobs_cxxx.conf .. code-block:: ini @@ -234,9 +235,9 @@ Jobs configuration RUNNING = member Platform configuration ----------------------- +====================== - ``vi /cxxx/conf/platforms_cxxx.conf`` + vi /cxxx/conf/platforms_cxxx.conf .. code-block:: ini @@ -291,9 +292,9 @@ Platform configuration TEST_SUITE = True Proj configuration ------------------- +================== -After filling the experiment configuration and executing ``autosubmit create cxxx -np``, a copy of the model is stored in ``proj``. +After filling the experiment configuration and running ``autosubmit create cxxx -np``, the user can go into ``proj``, which has a copy of the model. The experiment project contains the scripts specified in ``jobs_cxxx.conf`` and a copy of model source code and data specified in ``expdef_xxxx.conf``. @@ -511,7 +512,7 @@ Example: PISCES_timestep = 3600 Proj configuration:: Full example -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +--------------------------------- This section contains a full example of a valid proj file with a valid user script. @@ -560,7 +561,7 @@ Final script, which is generated by `autosubmit run` or ``autosubmit inspect`` (...) Detailed platform configuration -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------- In this section, we describe the platform configuration using `-QOS` and also `PARTITION` @@ -673,7 +674,7 @@ In this section, we describe the platform configuration using `-QOS` and also `P The custom directives can be used for multiple parameters at the same time using the follow syntax. - `vi /conf/platform_cxxx.conf` + vi /conf/platform_cxxx.conf ..
code-block:: ini @@ -699,3 +700,46 @@ The custom directives can be used for multiple parameters at the same time using MAX_PROCESSORS = 80 # test [40] / small [40] // large [40] PROCESSORS_PER_NODE = 40 + +Controlling the number of active concurrent tasks in an experiment +---------------------------------------------------------------------- + +In some cases, you may want to control the number of concurrent tasks/jobs that can be active in an experiment. + +To set the maximum number of concurrent tasks/jobs, you can use the ``TOTAL_JOBS`` and ``MAX_WAITING_JOBS`` variables in the ``conf/autosubmit_cxxx.conf`` file. + + vi /conf/autosubmit_cxxx.conf + +.. code-block:: ini + + # Controls the maximum number of submitted,waiting and running tasks + TOTAL_JOBS = 10 + # Controls the maximum number of submitted and waiting tasks + MAX_WAITING_JOBS = 10 + +To control the number of jobs included in a wrapper, you can use the `MAX_WRAPPED_JOBS` and `MIN_WRAPPED_JOBS` variables in the ``conf/autosubmit_cxxx.conf`` file. + +Note that a wrapped job is counted as a single job regardless of the number of tasks it contains. Therefore, `TOTAL_JOBS` and `MAX_WAITING_JOBS` won't have an impact inside a wrapper. + + vi /conf/autosubmit_cxxx.conf + +.. code-block:: ini + + [wrapper] + TYPE = + MIN_WRAPPED = 2 # Minimum number of jobs that will be wrapped together at any given time. + MIN_WRAPPED_H = 2 # Same as above but only for the horizontal packages. + MIN_WRAPPED_V = 2 # Same as above but only for the vertical packages. + MAX_WRAPPED = 99999 # Maximum number of jobs that will be wrapped together at any given time. + MAX_WRAPPED_H = 99999 # Same as above but only for the horizontal packages. + MAX_WRAPPED_V = 99999 # Same as above but only for the vertical packages. + +- **MAX_WRAPPED** can be defined in ``jobs_cxxx.conf`` in order to limit the number of jobs wrapped for the corresponding job section + - If not defined, it considers the **MAX_WRAPPED** defined under [wrapper] in ``autosubmit_cxxx.conf`` + - If **MAX_WRAPPED** is not defined, then the max_wallclock of the platform will be the final factor. +- **MIN_WRAPPED** can be defined in ``autosubmit_cxxx.conf`` in order to limit the minimum number of jobs that a wrapper can contain + - If not defined, it considers that **MIN_WRAPPED** is 2. + - If **POLICY** is flexible and it is not possible to wrap **MIN_WRAPPED** or more tasks, these tasks will be submitted as individual jobs while the condition is not satisfied. + - If **POLICY** is mixed and there are failed jobs inside a wrapper, these jobs will be submitted as individual jobs. + - If **POLICY** is strict and it is not possible to wrap **MIN_WRAPPED** or more tasks, these tasks will not be submitted until there are enough tasks to build a package. + - strict and mixed policies can cause **deadlocks**. diff --git a/docs/source/userguide/configure/index.rst b/docs/source/userguide/configure/index.rst index 2a2b4f0a8cd096101c630a6ead9ef393c6dda139..cadaf925e87847a078b95c71f785a5142b3fe4ce 100644 --- a/docs/source/userguide/configure/index.rst +++ b/docs/source/userguide/configure/index.rst @@ -138,6 +138,9 @@ To do this use: * TASKS: tasks number to be submitted to the HPC. If not specified, defaults to 1. +* NODES: nodes number to be submitted to the HPC. If not specified, the directive is not added. + + * HYPERTHREADING: Enables Hyper-threading, this will double the max amount of threads. defaults to false. ( Not available on slurm platforms ) * QUEUE: queue to add the job to.
If not specified, uses PLATFORM default. @@ -173,6 +176,10 @@ There are also other, less used features that you can use: * QUEUE: queue to add the job to. If not specified, uses PLATFORM default. +* EXTENDED_HEADER_PATH: path to a script to be appended at the beginning of the .cmd script that Autosubmit generates. Only supports job type BASH. + +* EXTENDED_TAILER_PATH: path to a script to be appended at the end of the .cmd script that Autosubmit generates. Only supports job type BASH. + How to configure email notifications ------------------------------------ diff --git a/docs/source/userguide/configure/fig/dashed.png b/docs/source/userguide/defining workflows/fig/dashed.png similarity index 100% rename from docs/source/userguide/configure/fig/dashed.png rename to docs/source/userguide/defining workflows/fig/dashed.png diff --git a/docs/source/userguide/configure/fig/date-synchronize.png b/docs/source/userguide/defining workflows/fig/date-synchronize.png similarity index 100% rename from docs/source/userguide/configure/fig/date-synchronize.png rename to docs/source/userguide/defining workflows/fig/date-synchronize.png diff --git a/docs/source/userguide/configure/fig/dependencies_previous.png b/docs/source/userguide/defining workflows/fig/dependencies_previous.png similarity index 100% rename from docs/source/userguide/configure/fig/dependencies_previous.png rename to docs/source/userguide/defining workflows/fig/dependencies_previous.png diff --git a/docs/source/userguide/configure/fig/dependencies_running.png b/docs/source/userguide/defining workflows/fig/dependencies_running.png similarity index 100% rename from docs/source/userguide/configure/fig/dependencies_running.png rename to docs/source/userguide/defining workflows/fig/dependencies_running.png diff --git a/docs/source/userguide/configure/fig/experiment_delay_doc.png b/docs/source/userguide/defining workflows/fig/experiment_delay_doc.png similarity index 100% rename from docs/source/userguide/configure/fig/experiment_delay_doc.png rename to docs/source/userguide/defining workflows/fig/experiment_delay_doc.png diff --git a/docs/source/userguide/configure/fig/frequency.png b/docs/source/userguide/defining workflows/fig/frequency.png similarity index 100% rename from docs/source/userguide/configure/fig/frequency.png rename to docs/source/userguide/defining workflows/fig/frequency.png diff --git a/docs/source/userguide/configure/fig/member-synchronize.png b/docs/source/userguide/defining workflows/fig/member-synchronize.png similarity index 100% rename from docs/source/userguide/configure/fig/member-synchronize.png rename to docs/source/userguide/defining workflows/fig/member-synchronize.png diff --git a/docs/source/userguide/configure/fig/no-synchronize.png b/docs/source/userguide/defining workflows/fig/no-synchronize.png similarity index 100% rename from docs/source/userguide/configure/fig/no-synchronize.png rename to docs/source/userguide/defining workflows/fig/no-synchronize.png diff --git a/docs/source/userguide/configure/fig/running.png b/docs/source/userguide/defining workflows/fig/running.png similarity index 100% rename from docs/source/userguide/configure/fig/running.png rename to docs/source/userguide/defining workflows/fig/running.png diff --git a/docs/source/userguide/configure/fig/select_chunks.png b/docs/source/userguide/defining workflows/fig/select_chunks.png similarity index 100% rename from docs/source/userguide/configure/fig/select_chunks.png rename to docs/source/userguide/defining workflows/fig/select_chunks.png diff
--git a/docs/source/userguide/configure/fig/select_members.png b/docs/source/userguide/defining workflows/fig/select_members.png similarity index 100% rename from docs/source/userguide/configure/fig/select_members.png rename to docs/source/userguide/defining workflows/fig/select_members.png diff --git a/docs/source/userguide/configure/fig/simple.png b/docs/source/userguide/defining workflows/fig/simple.png similarity index 100% rename from docs/source/userguide/configure/fig/simple.png rename to docs/source/userguide/defining workflows/fig/simple.png diff --git a/docs/source/userguide/configure/fig/skip.png b/docs/source/userguide/defining workflows/fig/skip.png similarity index 100% rename from docs/source/userguide/configure/fig/skip.png rename to docs/source/userguide/defining workflows/fig/skip.png diff --git a/docs/source/userguide/configure/fig/split.png b/docs/source/userguide/defining workflows/fig/split.png similarity index 100% rename from docs/source/userguide/configure/fig/split.png rename to docs/source/userguide/defining workflows/fig/split.png diff --git a/docs/source/userguide/manage/index.rst b/docs/source/userguide/manage/index.rst index 3f4edea72d50e498f6f8e21aa90c9e5fec927003..23fe97a5a32ebf3b4dd207f0f0eb32f4b7e5ea91 100644 --- a/docs/source/userguide/manage/index.rst +++ b/docs/source/userguide/manage/index.rst @@ -1,4 +1,4 @@ -Manage experiments +Manage Experiments =================== How to clean the experiment diff --git a/docs/source/userguide/run/index.rst b/docs/source/userguide/run/index.rst index 34f937ed7dbf32b22296432fc545cfb6c5f578b0..90f0180bf1a09e7f4d9c808c354203a4aa62d47f 100644 --- a/docs/source/userguide/run/index.rst +++ b/docs/source/userguide/run/index.rst @@ -1,4 +1,4 @@ -Running experiments +Running Experiments =================== Run an experiment diff --git a/docs/source/userguide/run/fig/dasim.png b/docs/source/userguide/wrappers/fig/dasim.png similarity index 100% rename from docs/source/userguide/run/fig/dasim.png rename to docs/source/userguide/wrappers/fig/dasim.png diff --git a/docs/source/userguide/run/fig/horizontal_remote.png b/docs/source/userguide/wrappers/fig/horizontal_remote.png similarity index 100% rename from docs/source/userguide/run/fig/horizontal_remote.png rename to docs/source/userguide/wrappers/fig/horizontal_remote.png diff --git a/docs/source/userguide/run/fig/multiple_wrappers.png b/docs/source/userguide/wrappers/fig/multiple_wrappers.png similarity index 100% rename from docs/source/userguide/run/fig/multiple_wrappers.png rename to docs/source/userguide/wrappers/fig/multiple_wrappers.png diff --git a/docs/source/userguide/run/fig/rerun.png b/docs/source/userguide/wrappers/fig/rerun.png similarity index 100% rename from docs/source/userguide/run/fig/rerun.png rename to docs/source/userguide/wrappers/fig/rerun.png diff --git a/docs/source/userguide/run/fig/vertical-horizontal.png b/docs/source/userguide/wrappers/fig/vertical-horizontal.png similarity index 100% rename from docs/source/userguide/run/fig/vertical-horizontal.png rename to docs/source/userguide/wrappers/fig/vertical-horizontal.png diff --git a/docs/source/userguide/run/fig/vertical-mixed.png b/docs/source/userguide/wrappers/fig/vertical-mixed.png similarity index 100% rename from docs/source/userguide/run/fig/vertical-mixed.png rename to docs/source/userguide/wrappers/fig/vertical-mixed.png diff --git a/docs/source/userguide/wrappers/index.rst b/docs/source/userguide/wrappers/index.rst index 
2ee2a34e1c9c1350ad918c3b03e33d3c5e6bee69..e19ddaa1bb6bb1a09f6a05676e69f48078d4c632 100644 --- a/docs/source/userguide/wrappers/index.rst +++ b/docs/source/userguide/wrappers/index.rst @@ -1,5 +1,5 @@ -Wrappers -======== +Configure Wrappers +================== In order to understand the goal of this feature, please take a look at: https://earth.bsc.es/wiki/lib/exe/fetch.php?media=library:seminars:techniques_to_improve_the_throughput.pptx diff --git a/environment.yml b/environment.yml index 4585486d92967e57c86c1eb4562b4d9f569262b1..6429c6982e25851e9453e40f9d530053550c9f1b 100644 --- a/environment.yml +++ b/environment.yml @@ -4,6 +4,7 @@ channels: dependencies: - nose +- configparser - nose-cov - paramiko - pydotplus @@ -16,6 +17,7 @@ dependencies: - portalocker - networkx - python=2.7 +- requests - pip: - bscearth.utils - Xlib diff --git a/log/log.py b/log/log.py index ae3ca5a7426f5cb13d84624367c3496bf3943ad1..216fc23eb5c466ee804450bda5e2bf1b33f5c952 100644 --- a/log/log.py +++ b/log/log.py @@ -161,7 +161,7 @@ class Log: logging.getLogger(name) @staticmethod - def set_file(file_path, type='out', level=WARNING): + def set_file(file_path, type='out', level="WARNING"): """ Configure the file to store the log. If another file was specified earlier, new messages will only go to the new file. @@ -169,6 +169,19 @@ class Log: :param file_path: file to store the log :type file_path: str """ + levels = {} + levels["STATUS_FAILED"] = 500 + levels["STATUS"] = 1000 + levels["DEBUG"] = 2000 + levels["WARNING"] = 3000 + levels["INFO"] = 4000 + levels["RESULT"] = 5000 + levels["ERROR"] = 6000 + levels["CRITICAL"] = 7000 + levels["NO_LOG"] = levels["CRITICAL"] + 1000 + + level = levels.get(str(level).upper(), levels["DEBUG"]) + max_retrials = 3 retrials = 0 timeout = 5 diff --git a/requeriments.txt b/requeriments.txt index f2dfdd0aac92b88ff2246a6907da782bb84a5333..7f4b2ea14274adcc68c13e33c355e639157d1ed4 100644 --- a/requeriments.txt +++ b/requeriments.txt @@ -1,7 +1,9 @@ +pytest==2.9.2 +configparser argparse>=1.2,<2 python-dateutil>2 matplotlib -numpy +numpy<1.17 pydotplus>=2 pyparsing>=2.0.1 paramiko @@ -13,6 +15,7 @@ typing bscearth.utils cryptography==3.3.2 PyNaCl==1.4.0 +six>=1.10.0 requests xlib Pygments \ No newline at end of file diff --git a/setup.py b/setup.py index 35e8f4f4f50a1396aa3f4b3d590c6cdb551044d2..c639b85e53d80bb9655bc395e7b6579f7dcdd09f 100644 --- a/setup.py +++ b/setup.py @@ -34,14 +34,14 @@ setup( version=version, description='Autosubmit: a versatile tool to manage Weather and Climate Experiments in diverse ' 'Supercomputing Environments', - author='Domingo Manubens-Gil', - author_email='domingo.manubens@bsc.es', + author='Daniel Beltran Mora', + author_email='daniel.beltran@bsc.es', url='http://www.bsc.es/projects/earthscience/autosubmit/', download_url='https://earth.bsc.es/wiki/doku.php?id=tools:autosubmit', keywords=['climate', 'weather', 'workflow', 'HPC'], - install_requires=['argparse>=1.2,<2','argcomplete==1.10.3', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', - 'numpy', 'matplotlib', 'typing', 'paramiko == 2.7.1', - 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21'], + install_requires=['argparse>=1.2,<2','six>=1.10.0', 'python-dateutil>2', 'pydotplus>=2', 'pyparsing>=2.0.1', + 'numpy<1.17', 'matplotlib', 'typing', 'paramiko == 2.7.1', + 'mock>=1.3.0', 'portalocker==0.5.7', 'networkx', 'bscearth.utils', 'Xlib == 0.21', 'requests','configparser==4.0.2'], extras_require={ 'dialog': ["python2-pythondialog>=3.3.0"] }, diff --git a/teeeest.py
b/teeeest.py new file mode 100644 index 0000000000000000000000000000000000000000..c7919c429b65996080489d893ad13ca922a085fe --- /dev/null +++ b/teeeest.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python2 +############################################################################### +# a000_ASThread_16801803863908_5_60 +############################################################################### +# +#SBATCH -J a000_ASThread_16801803863908_5_60 +#SBATCH --qos=debug +#SBATCH -A bsc32 +#SBATCH --output=a000_ASThread_16801803863908_5_60.out +#SBATCH --error=a000_ASThread_16801803863908_5_60.err +#SBATCH -t 02:00:00 +#SBATCH --cpus-per-task=1 +#SBATCH -n 8 +############################################################################### + +import os +import sys +# from bscearth.utils.date import date2str +from threading import Thread +from commands import getstatusoutput +from datetime import datetime +import time +from math import ceil +from collections import OrderedDict +import copy + + +class Unbuffered(object): + def __init__(self, stream): + self.stream = stream + + def write(self, data): + self.stream.write(data) + self.stream.flush() + + def writelines(self, datas): + self.stream.writelines(datas) + self.stream.flush() + + def __getattr__(self, attr): + return getattr(self.stream, attr) + + +sys.stdout = Unbuffered(sys.stdout) +wrapper_id = "8aQlI6U962_FAILED" +# Defining scripts to be run +scripts = [[u'a000_19600101_fc0000_1_SIM.cmd', u'a000_19600101_fc0000_2_SIM.cmd', u'a000_19600101_fc0000_3_SIM.cmd', + u'a000_19600101_fc0000_4_SIM.cmd', u'a000_19600101_fc0000_5_SIM.cmd'], [u'a000_19600101_fc0000_POST.cmd'], + [u'a000_19600101_fc0001_4_SIM.cmd', u'a000_19600101_fc0001_2_SIM.cmd', u'a000_19600101_fc0001_5_SIM.cmd', + u'a000_19600101_fc0001_1_SIM.cmd', u'a000_19600101_fc0001_3_SIM.cmd'], [u'a000_19600101_fc0001_POST.cmd'], + [u'a000_19600101_fc0002_4_SIM.cmd', u'a000_19600101_fc0002_2_SIM.cmd', u'a000_19600101_fc0002_1_SIM.cmd', + u'a000_19600101_fc0002_5_SIM.cmd', u'a000_19600101_fc0002_3_SIM.cmd'], [u'a000_19600101_fc0002_POST.cmd'], + [u'a000_19600101_fc0003_4_SIM.cmd', u'a000_19600101_fc0003_3_SIM.cmd', u'a000_19600101_fc0003_5_SIM.cmd', + u'a000_19600101_fc0003_1_SIM.cmd', u'a000_19600101_fc0003_2_SIM.cmd'], [u'a000_19600101_fc0003_POST.cmd'], + [u'a000_19600101_fc0004_5_SIM.cmd', u'a000_19600101_fc0004_1_SIM.cmd', u'a000_19600101_fc0004_4_SIM.cmd', + u'a000_19600101_fc0004_3_SIM.cmd', u'a000_19600101_fc0004_2_SIM.cmd'], [u'a000_19600101_fc0004_POST.cmd'], + [u'a000_19600101_fc0005_2_SIM.cmd', u'a000_19600101_fc0005_5_SIM.cmd', u'a000_19600101_fc0005_1_SIM.cmd', + u'a000_19600101_fc0005_4_SIM.cmd', u'a000_19600101_fc0005_3_SIM.cmd'], [u'a000_19600101_fc0005_POST.cmd'], + [u'a000_19600101_fc0006_5_SIM.cmd', u'a000_19600101_fc0006_2_SIM.cmd', u'a000_19600101_fc0006_4_SIM.cmd', + u'a000_19600101_fc0006_1_SIM.cmd', u'a000_19600101_fc0006_3_SIM.cmd'], [u'a000_19600101_fc0006_POST.cmd'], + [u'a000_19600101_fc0007_5_SIM.cmd', u'a000_19600101_fc0007_1_SIM.cmd', u'a000_19600101_fc0007_4_SIM.cmd', + u'a000_19600101_fc0007_2_SIM.cmd', u'a000_19600101_fc0007_3_SIM.cmd'], [u'a000_19600101_fc0007_POST.cmd'], + [u'a000_19600101_fc0008_1_SIM.cmd', u'a000_19600101_fc0008_3_SIM.cmd', u'a000_19600101_fc0008_5_SIM.cmd', + u'a000_19600101_fc0008_4_SIM.cmd', u'a000_19600101_fc0008_2_SIM.cmd'], [u'a000_19600101_fc0008_POST.cmd'], + [u'a000_19600101_fc0009_1_SIM.cmd', u'a000_19600101_fc0009_5_SIM.cmd', u'a000_19600101_fc0009_4_SIM.cmd', + u'a000_19600101_fc0009_2_SIM.cmd', u'a000_19600101_fc0009_3_SIM.cmd'], 
[u'a000_19600101_fc0009_POST.cmd']] + + +class JobThread(Thread): + def __init__(self, template, id_run): + Thread.__init__(self) + self.template = template + self.id_run = id_run + self.fail_count = 0 + + def run(self): + jobname = self.template.replace('.cmd', '') + print("Thread level {0}".format(jobname)) + # os.system("echo $(date +%s) > "+jobname+"_STAT") + out = str(self.template) + ".out." + str(self.fail_count) + err = str(self.template) + ".err." + str(self.fail_count) + print(out + "\n") + print("{1}/machinefiles/machinefile_{0}".format(jobname,os.getcwd())) + os.environ["MACHINEFILE"] = "{1}/machinefiles/machinefile_{0}".format(jobname,os.getcwd()) + command = "./" + str(self.template) + " " + str(self.id_run) + " " + os.getcwd() + (self.status) = getstatusoutput(command + " > " + out + " 2> " + err) + + +class JobListThread(Thread): + def __init__(self, jobs_list, id_run, node_list): + Thread.__init__(self) + self.jobs_list = jobs_list + self.id_run = id_run + self.node_list = node_list + + def run(self): + pid_list = [] + print("Jobs list: {0}".format(self.jobs_list)) + print("len_jobs_list: {0}".format(len(self.jobs_list))) + print("all_nodes: {0}".format(self.node_list)) + + for i,job in enumerate(self.jobs_list): + jobname = job.replace(".cmd", '') + section = jobname.split('_')[-1] + machines = "" + cores = int(jobs_resources[section]['PROCESSORS']) + tasks = int(jobs_resources[section]['TASKS']) + processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) + nodes = int(ceil((float(tasks*cores)) / processors_per_node)) + print("Nodes: {0}".format(nodes)) + print("Nodes_remaining_for_this_list: {0}".format(self.node_list)) + while nodes > 0: + node = self.node_list.pop(0) + machines += "{0} 1\n".format(node, processors_per_node) + nodes = nodes - 1 + # machines = "\n".join([s for s in machines.split("\n") if s]) + #machines = "\n".join([s for s in machines.split("\n") if s]) + print("FINAL_MACHINES:{0} ".format(machines)) + with open("machinefiles/machinefile_" + jobname, "w") as machinefile: + machinefile.write(machines) + current = JobThread(job, i + self.id_run) + pid_list.append(current) + current.start() + + # Waiting until all scripts finish + for i in range(len(pid_list)): + pid = pid_list[i] + pid.join() + + completed_filename = self.jobs_list[i].replace('.cmd', '_COMPLETED') + completed_path = os.path.join(os.getcwd(), completed_filename) + failed_filename = self.jobs_list[i].replace('.cmd', '_FAILED') + failed_path = os.path.join(os.getcwd(), failed_filename) + failed_wrapper = os.path.join(os.getcwd(), wrapper_id) + if os.path.exists(completed_path): + print datetime.now(), "The job ", pid.template, " has been COMPLETED" + else: + open(failed_wrapper, 'w').close() + open(failed_path, 'w').close() + print datetime.now(), "The job ", pid.template, " has FAILED" + + +# Getting the list of allocated nodes +os.system("scontrol show hostnames $SLURM_JOB_NODELIST > {0}".format(node_id)) +os.system("mkdir -p machinefiles") + +with open('node_list_{0}'.format(wrapper_id), 'r') as file: + all_nodes = file.read() + +all_nodes = all_nodes.split("\n") + +total_cores = 5 +jobs_resources = {u'POST': {'TASKS': u'12', 'PROCESSORS': '1'}, 'MACHINEFILES': u'STANDARD', + 'PROCESSORS_PER_NODE': u'12', u'SIM': {'TASKS': '1', 'PROCESSORS': '1'}} +processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) +idx = 0 +all_cores = [] +while total_cores > 0: + if processors_per_node > 0: + processors_per_node -= 1 + total_cores -= 1 + all_cores.append(all_nodes[idx]) + else: + 
idx += 1 + processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) +processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) + +failed_wrapper = os.path.join(os.getcwd(), wrapper_id) +for i in range(len(scripts)): + current = JobListThread(scripts[i], i * (len(scripts[i])), copy.deepcopy(all_cores)) + current.start() + current.join() + if os.path.exists(failed_wrapper): + os.system("rm -f node_list_{0}".format(wrapper_id)) + os.remove(os.path.join(os.getcwd(), wrapper_id)) + wrapper_failed = os.path.join(os.getcwd(), "WRAPPER_FAILED") + open(wrapper_failed, 'w').close() + os._exit(1) +os.system("rm -f {0}".format(node_list)) + + diff --git a/test-unthreaded.py b/test-unthreaded.py new file mode 100644 index 0000000000000000000000000000000000000000..27f32e28178636aaba3153fce96b79c28d846d72 --- /dev/null +++ b/test-unthreaded.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python2 +############################################################################### +# a000_ASThread_16801803863908_5_60 +############################################################################### +# +#SBATCH -J test +#SBATCH --qos=debug +#SBATCH -A bsc32 +#SBATCH --output=test.out +#SBATCH --error=test.err +#SBATCH -t 02:00:00 +#SBATCH --cpus-per-task=1 +#SBATCH -n 8 +############################################################################### + +import os +import sys +import subprocess +# from bscearth.utils.date import date2str +from threading import Thread +from commands import getstatusoutput +from datetime import datetime +import time +from math import ceil +from collections import OrderedDict +import copy + + +class Unbuffered(object): + def __init__(self, stream): + self.stream = stream + + def write(self, data): + self.stream.write(data) + self.stream.flush() + + def writelines(self, datas): + self.stream.writelines(datas) + self.stream.flush() + + def __getattr__(self, attr): + return getattr(self.stream, attr) + + + +class Job: + def __init__(self, template, id_run): + self.template = template + self.id_run = id_run + self.fail_count = 0 + self.process = None + + def launch(self): + jobname = self.template.replace('.cmd', '') + print("Thread level {0}".format(jobname)) + # os.system("echo $(date +%s) > "+jobname+"_STAT") + out = str(self.template) + ".out." + str(self.fail_count) + err = str(self.template) + ".err." 
+ str(self.fail_count) + print(out + "\n") + print("{1}/machinefiles/machinefile_{0}".format(jobname,os.getcwd())) + os.environ["MACHINEFILE"] = "{1}/machinefiles/machinefile_{0}".format(jobname,os.getcwd()) + command = "./" + str(self.template) + " " + str(self.id_run) + " " + os.getcwd() + # Use subprocess to run the command and get the process ID + self.process = subprocess.Popen(command + " > " + out + " 2> " + err, shell=True) + return self + +class JobList: + def __init__(self, jobs_list, id_run, node_list): + """ + + :param jobs_list: + :param id_run: + :param node_list: + """ + self.jobs_list = jobs_list + self.id_run = id_run + self.node_list = node_list + + def launch(self): + """ + Launch the jobs in the wrapper sublist + :return: + """ + pid_list = [] + for i,job in enumerate(self.jobs_list): + jobname = job.replace(".cmd", '') + section = jobname.split('_')[-1] + machines = "" + cores = int(jobs_resources[section]['PROCESSORS']) + tasks = int(jobs_resources[section]['TASKS']) + cores_per_tasks = ceil((float(tasks*cores))) + processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) + nodes = (1 + int(cores_per_tasks) ) / processors_per_node # 1 for the main process + remaining_processors = abs(cores_per_tasks - (nodes * processors_per_node)) + while nodes > 0: + node = self.node_list.pop(0) + if nodes > 1: + machines += "{0} {1}\n".format(node, processors_per_node) + else: + machines += "{0} {1}\n".format(node, remaining_processors+1) # +1 for the main process + nodes = nodes - 1 + with open("machinefiles/machinefile_" + jobname, "w") as machinefile: + machinefile.write(machines) + pid_list.append(Job(job, self.id_run).launch()) + self.check_status(pid_list) + def check_status(self,pid_list): + for i in range(len(pid_list)): + job = pid_list[i] + #(process_output, process_error) = pid.communicate() + job.process.wait() + completed_filename = self.jobs_list[i].replace('.cmd', '_COMPLETED') + completed_path = os.path.join(os.getcwd(), completed_filename) + failed_filename = self.jobs_list[i].replace('.cmd', '_FAILED') + failed_path = os.path.join(os.getcwd(), failed_filename) + failed_wrapper = os.path.join(os.getcwd(), wrapper_id) + if os.path.exists(completed_path): + print datetime.now(), "The job ", completed_filename, " has been COMPLETED" + else: + open(failed_wrapper, 'w').close() + open(failed_path, 'w').close() + print datetime.now(), "The job ", completed_filename, " has FAILED" + + +sys.stdout = Unbuffered(sys.stdout) +wrapper_id = "8aQlI6U962_FAILED" +# Defining scripts to be run +scripts = [[u'a000_19600101_fc0000_1_SIM.cmd', u'a000_19600101_fc0000_2_SIM.cmd', u'a000_19600101_fc0000_3_SIM.cmd', + u'a000_19600101_fc0000_4_SIM.cmd', u'a000_19600101_fc0000_5_SIM.cmd'], [u'a000_19600101_fc0000_POST.cmd'], + [u'a000_19600101_fc0001_4_SIM.cmd', u'a000_19600101_fc0001_2_SIM.cmd', u'a000_19600101_fc0001_5_SIM.cmd', + u'a000_19600101_fc0001_1_SIM.cmd', u'a000_19600101_fc0001_3_SIM.cmd'], [u'a000_19600101_fc0001_POST.cmd'], + [u'a000_19600101_fc0002_4_SIM.cmd', u'a000_19600101_fc0002_2_SIM.cmd', u'a000_19600101_fc0002_1_SIM.cmd', + u'a000_19600101_fc0002_5_SIM.cmd', u'a000_19600101_fc0002_3_SIM.cmd'], [u'a000_19600101_fc0002_POST.cmd'], + [u'a000_19600101_fc0003_4_SIM.cmd', u'a000_19600101_fc0003_3_SIM.cmd', u'a000_19600101_fc0003_5_SIM.cmd', + u'a000_19600101_fc0003_1_SIM.cmd', u'a000_19600101_fc0003_2_SIM.cmd'], [u'a000_19600101_fc0003_POST.cmd'], + [u'a000_19600101_fc0004_5_SIM.cmd', u'a000_19600101_fc0004_1_SIM.cmd', u'a000_19600101_fc0004_4_SIM.cmd', + 
u'a000_19600101_fc0004_3_SIM.cmd', u'a000_19600101_fc0004_2_SIM.cmd'], [u'a000_19600101_fc0004_POST.cmd'], + [u'a000_19600101_fc0005_2_SIM.cmd', u'a000_19600101_fc0005_5_SIM.cmd', u'a000_19600101_fc0005_1_SIM.cmd', + u'a000_19600101_fc0005_4_SIM.cmd', u'a000_19600101_fc0005_3_SIM.cmd'], [u'a000_19600101_fc0005_POST.cmd'], + [u'a000_19600101_fc0006_5_SIM.cmd', u'a000_19600101_fc0006_2_SIM.cmd', u'a000_19600101_fc0006_4_SIM.cmd', + u'a000_19600101_fc0006_1_SIM.cmd', u'a000_19600101_fc0006_3_SIM.cmd'], [u'a000_19600101_fc0006_POST.cmd'], + [u'a000_19600101_fc0007_5_SIM.cmd', u'a000_19600101_fc0007_1_SIM.cmd', u'a000_19600101_fc0007_4_SIM.cmd', + u'a000_19600101_fc0007_2_SIM.cmd', u'a000_19600101_fc0007_3_SIM.cmd'], [u'a000_19600101_fc0007_POST.cmd'], + [u'a000_19600101_fc0008_1_SIM.cmd', u'a000_19600101_fc0008_3_SIM.cmd', u'a000_19600101_fc0008_5_SIM.cmd', + u'a000_19600101_fc0008_4_SIM.cmd', u'a000_19600101_fc0008_2_SIM.cmd'], [u'a000_19600101_fc0008_POST.cmd'], + [u'a000_19600101_fc0009_1_SIM.cmd', u'a000_19600101_fc0009_5_SIM.cmd', u'a000_19600101_fc0009_4_SIM.cmd', + u'a000_19600101_fc0009_2_SIM.cmd', u'a000_19600101_fc0009_3_SIM.cmd'], [u'a000_19600101_fc0009_POST.cmd']] + + +# Getting the list of allocated nodes +os.system("scontrol show hostnames $SLURM_JOB_NODELIST > node_list_{0}".format(wrapper_id)) +os.system("mkdir -p machinefiles") + +with open('node_list_{0}'.format(wrapper_id.split("_")[0]), 'r') as file: + all_nodes = file.read() + +all_nodes = all_nodes.split("\n") + +total_cores = 5+1 +jobs_resources = {u'POST': {'TASKS': u'12', 'PROCESSORS': '1'}, 'MACHINEFILES': u'STANDARD', + 'PROCESSORS_PER_NODE': u'12', u'SIM': {'TASKS': '1', 'PROCESSORS': '1'}} +processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) +idx = 0 +all_cores = [] +while total_cores > 0: + if processors_per_node > 0: + processors_per_node -= 1 + total_cores -= 1 + all_cores.append(all_nodes[idx]) + else: + idx += 1 + processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) +processors_per_node = int(jobs_resources['PROCESSORS_PER_NODE']) + +failed_wrapper = os.path.join(os.getcwd(), wrapper_id) +for i in range(len(scripts)): + current = JobList(scripts[i], i * (len(scripts[i])), copy.deepcopy(all_cores)) + current.launch() + if os.path.exists(failed_wrapper): + os.system("rm -f node_list_{0}".format(wrapper_id.split("_")[0])) + os.remove(os.path.join(os.getcwd(), wrapper_id)) + wrapper_failed = os.path.join(os.getcwd(), "WRAPPER_FAILED") + open(wrapper_failed, 'w').close() + os._exit(1) +os.system("rm -f node_list_{0}".format(wrapper_id.split("_")[0])) + + diff --git a/test/regression/tests_runner.py b/test/regression/tests_runner.py index ffd490888b835aaff9937b87eb3677ed6f581410..ab186e8492c0ca59b45e6325e399060086894a29 100644 --- a/test/regression/tests_runner.py +++ b/test/regression/tests_runner.py @@ -79,6 +79,7 @@ def run(current_experiment_id, only_list=None, exclude_list=None, max_threads=5) tests_parser.optionxform = str tests_parser.read(tests_parser_file) + # Resetting the database clean_database(db_path) create_database() diff --git a/test/regression/tests_utils.py b/test/regression/tests_utils.py index 297fb8f7523faeabb6f9683746317bec5fe514da..53ead0dd58e1b494ad50188bc33525c7a143ee35 100644 --- a/test/regression/tests_utils.py +++ b/test/regression/tests_utils.py @@ -23,7 +23,7 @@ def check_cmd(command, path=BIN_PATH, verbose='AS_TEST_VERBOSE' in os.environ): except subprocess.CalledProcessError as e: if verbose: - print e.output + print str(e) return False diff --git 
diff --git a/test/unit/files/fake-jobs.yml b/test/unit/files/fake-jobs.yml
new file mode 100755
index 0000000000000000000000000000000000000000..93c5a55dcaa3c316f26bcd282b62473f659cc7e2
--- /dev/null
+++ b/test/unit/files/fake-jobs.yml
@@ -0,0 +1,36 @@
+JOBS:
+  LOCAL_SETUP:
+    FILE: LOCAL_SETUP.sh
+    PLATFORM: LOCAL
+    RUNNING: once
+  REMOTE_SETUP:
+    FILE: REMOTE_SETUP.sh
+    DEPENDENCIES: LOCAL_SETUP
+    WALLCLOCK: '00:05'
+    RUNNING: once
+  INI:
+    FILE: INI.sh
+    DEPENDENCIES: REMOTE_SETUP
+    RUNNING: member
+    WALLCLOCK: '00:05'
+  SIM:
+    FILE: SIM.sh
+    DEPENDENCIES: INI SIM-1
+    RUNNING: chunk
+    WALLCLOCK: '00:05'
+  POST:
+    FILE: POST.sh
+    DEPENDENCIES: SIM
+    RUNNING: once
+    WALLCLOCK: '00:05'
+  CLEAN:
+    FILE: CLEAN.sh
+    DEPENDENCIES: POST
+    RUNNING: once
+    WALLCLOCK: '00:05'
+  TRANSFER:
+    FILE: TRANSFER.sh
+    PLATFORM: LOCAL
+    DEPENDENCIES: CLEAN
+    RUNNING: member
+
diff --git a/test/unit/files/fake-platforms.yml b/test/unit/files/fake-platforms.yml
new file mode 100644
index 0000000000000000000000000000000000000000..ba5810c3f08b720dcd8ff8967c3d36934e9812a3
--- /dev/null
+++ b/test/unit/files/fake-platforms.yml
@@ -0,0 +1,75 @@
+PLATFORMS:
+  ARM:
+    TYPE: pjm
+    HOST: armlogin1.bsc.es,armlogin2.bsc.es,armlogin3.bsc.es
+    PROJECT: bsc32
+    USER: bsc32070
+    SCRATCH_DIR: /scratch
+    ADD_PROJECT_TO_HOST: 'False'
+    MAX_WALLCLOCK: 48:00
+    MAX_PROCESSORS: '2400'
+    PROCESSORS_PER_NODE: '48'
+    SERIAL_QUEUE: small
+    QUEUE: small
+  MARENOSTRUM4:
+    TYPE: slurm
+    HOST: mn1.bsc.es,mn2.bsc.es,mn3.bsc.es
+    PROJECT: bsc32
+    USER: bsc32070
+    SCRATCH_DIR: /gpfs/scratch
+    ADD_PROJECT_TO_HOST: 'False'
+    MAX_WALLCLOCK: 48:00
+    MAX_PROCESSORS: '2400'
+    PROCESSORS_PER_NODE: '48'
+    SERIAL_QUEUE: debug
+    QUEUE: debug
+  MARENOSTRUM_ARCHIVE:
+    TYPE: ps
+    HOST: dt01.bsc.es
+    PROJECT: bsc32
+    USER: bsc32070
+    SCRATCH_DIR: /gpfs/scratch
+    ADD_PROJECT_TO_HOST: 'False'
+    TEST_SUITE: 'False'
+  POWER9:
+    TYPE: slurm
+    HOST: plogin1.bsc.es
+    PROJECT: bsc32
+    USER: bsc32070
+    SCRATCH_DIR: /gpfs/scratch
+    ADD_PROJECT_TO_HOST: 'False'
+    TEST_SUITE: 'False'
+    SERIAL_QUEUE: debug
+    QUEUE: debug
+  NORD3:
+    TYPE: lsf
+    HOST: nord1.bsc.es
+    PROJECT: bsc32
+    USER: bsc32070
+    ADD_PROJECT_TO_HOST: 'False'
+    SCRATCH_DIR: /gpfs/scratch
+    TEST_SUITE: 'False'
+    MAX_WALLCLOCK: 48:00
+    MAX_PROCESSORS: '1024'
+    PROCESSORS_PER_NODE: '16'
+  TRANSFER_NODE:
+    TYPE: ps
+    HOST: dt01.bsc.es
+    PROJECT: bsc32
+    USER: bsc32070
+    ADD_PROJECT_TO_HOST: 'false'
+    SCRATCH_DIR: /gpfs/scratch
+  TRANSFER_NODE_BSCEARTH000:
+    TYPE: ps
+    HOST: bscearth000
+    USER: dbeltran
+    PROJECT: Earth
+    ADD_PROJECT_TO_HOST: 'false'
+    QUEUE: serial
+    SCRATCH_DIR: /esarchive/scratch
+  BSCEARTH000:
+    TYPE: ps
+    HOST: bscearth000
+    PROJECT: Earth
+    USER: dbeltran
+    SCRATCH_DIR: /esarchive/scratch
\ No newline at end of file
diff --git a/test/unit/test_autosubmit_config.py b/test/unit/test_autosubmit_config.py
index c4c8480df09e1c0ace6ffb7b6cb64706bddad282..00e62440627e409325a5faff649333577f4311b1 100644
--- a/test/unit/test_autosubmit_config.py
+++ b/test/unit/test_autosubmit_config.py
@@ -181,13 +181,22 @@ class TestAutosubmitConfig(TestCase):
         open_mock.assert_any_call(config.experiment_file, 'w')

     def test_set_version(self):
-        # arrange
+
+        # arrange
+        FakeBasicConfig.DB_PATH = 'fake-path'
+        sys.modules['os'].path.exists = Mock(return_value=True)
+        connection_mock = Mock()
+        cursor_mock = Mock()
+        connection_mock.cursor = Mock(return_value=cursor_mock)
+        cursor_mock.fetchone = Mock(return_value=[0])
+
+        sys.modules['sqlite3'].connect = Mock(return_value=connection_mock)
         config = AutosubmitConfig(self.any_expid, FakeBasicConfig, ConfigParserFactory())
         open_mock = mock_open(read_data='AUTOSUBMIT_VERSION = dummy')
         with patch.object(builtins, "open", open_mock):
             # act
-            config.set_version('dummy-vesion')
+            config.set_version('dummy-version')

         # assert
         open_mock.assert_any_call(getattr(config, '_conf_parser_file'), 'w')
@@ -461,3 +470,4 @@ class FakeBasicConfig:
     LOCAL_PROJ_DIR = '/dummy/local/proj/dir'
     DEFAULT_PLATFORMS_CONF = ''
     DEFAULT_JOBS_CONF = ''
+
diff --git a/test/unit/test_dic_jobs.py b/test/unit/test_dic_jobs.py
index 5565c93280bd66ba2dcabcad8e0598e91ef6a036..f955f96dcd3cdd5d5e0a45ae327c19a1b413c8d6 100644
--- a/test/unit/test_dic_jobs.py
+++ b/test/unit/test_dic_jobs.py
@@ -81,9 +81,10 @@ class TestDicJobs(TestCase):
         frequency = 123
         splits = 0
         excluded_list_m = []
+        included_list_m = []
         self.parser_mock.has_option = Mock(return_value=True)
         self.parser_mock.get = Mock(return_value='member')
-        self.dictionary.get_option = Mock(side_effect=[splits,frequency,excluded_list_m])
+        self.dictionary.get_option = Mock(side_effect=[splits,frequency,excluded_list_m,included_list_m])
         self.dictionary._create_jobs_once = Mock()
         self.dictionary._create_jobs_startdate = Mock()
         self.dictionary._create_jobs_member = Mock()
@@ -95,7 +96,7 @@ class TestDicJobs(TestCase):
         # assert
         self.dictionary._create_jobs_once.assert_not_called()
         self.dictionary._create_jobs_startdate.assert_not_called()
-        self.dictionary._create_jobs_member.assert_called_once_with(section, priority, frequency, Type.BASH, {},splits,excluded_list_m)
+        self.dictionary._create_jobs_member.assert_called_once_with(section, priority, frequency, Type.BASH, {},splits,excluded_list_m,included_list_m)
         self.dictionary._create_jobs_chunk.assert_not_called()

     def test_read_section_running_chunk_create_jobs_chunk(self):
@@ -108,9 +109,11 @@ class TestDicJobs(TestCase):
         splits = 0
         excluded_list_c = []
         excluded_list_m = []
+        included_list_c = []
+        included_list_m = []
         self.parser_mock.has_option = Mock(return_value=True)
         self.parser_mock.get = Mock(return_value='chunk')
-        self.dictionary.get_option = Mock(side_effect=[splits,frequency, synchronize, delay,excluded_list_c,excluded_list_m])
+        self.dictionary.get_option = Mock(side_effect=[splits,frequency, synchronize, delay,excluded_list_c,excluded_list_m,included_list_c,included_list_m])
         self.dictionary._create_jobs_once = Mock()
         self.dictionary._create_jobs_startdate = Mock()
         self.dictionary._create_jobs_member = Mock()
@@ -123,7 +126,7 @@ class TestDicJobs(TestCase):
         self.dictionary._create_jobs_once.assert_not_called()
         self.dictionary._create_jobs_startdate.assert_not_called()
         self.dictionary._create_jobs_member.assert_not_called()
-        self.dictionary._create_jobs_chunk.assert_called_once_with(section, priority, frequency, Type.BASH, synchronize, delay, splits, {},excluded_chunks=[],excluded_members=[])
+        self.dictionary._create_jobs_chunk.assert_called_once_with(section, priority, frequency, Type.BASH, synchronize, delay, splits, {},excluded_chunks=[],excluded_members=[],included_chunks=[],included_members=[])

     def test_dic_creates_right_jobs_by_startdate(self):
         # arrange
diff --git a/test/unit/test_expid.py b/test/unit/test_expid.py
index 85e5a012bd1777876f50bec6c233cb13f8f83cba..7eee22bfc26e8dbfc6cce0916c19254e07fcf80a 100644
--- a/test/unit/test_expid.py
+++ b/test/unit/test_expid.py
@@ -31,21 +31,21 @@ class TestExpid(TestCase):
     @patch('autosubmit.experiment.experiment_common.db_common')
     def test_create_new_experiment_with_previous_one(self, db_common_mock):
current_experiment_id = "a006" + current_experiment_id = "a007" self._build_db_mock(current_experiment_id, db_common_mock) experiment_id = new_experiment(self.description, self.version) self.assertEquals("a007", experiment_id) @patch('autosubmit.experiment.experiment_common.db_common') def test_create_new_test_experiment_with_previous_one(self, db_common_mock): - current_experiment_id = "t0ab" + current_experiment_id = "t0ac" self._build_db_mock(current_experiment_id, db_common_mock) experiment_id = new_experiment(self.description, self.version, True) self.assertEquals("t0ac", experiment_id) @patch('autosubmit.experiment.experiment_common.db_common') def test_create_new_operational_experiment_with_previous_one(self, db_common_mock): - current_experiment_id = "o112" + current_experiment_id = "o113" self._build_db_mock(current_experiment_id, db_common_mock) experiment_id = new_experiment(self.description, self.version, False, True) self.assertEquals("o113", experiment_id) diff --git a/test/unit/test_job.py b/test/unit/test_job.py index fae76dccb110ff31d84373f732753b8dc4647b22..59e1f51fc89670e28e0c2cfe12538fad4a5c4d8e 100644 --- a/test/unit/test_job.py +++ b/test/unit/test_job.py @@ -179,6 +179,36 @@ class TestJob(TestCase): write_mock.write.assert_called_with('some-content: 999, 777, 666 % %') chmod_mock.assert_called_with(os.path.join(self.job._tmp_path, self.job.name + '.cmd'), 0o755) + def test_create_header_tailer_script(self): + tailer_script = '#!/usr/bin/bash\necho "Header test"\n' + header_script = '#!/usr/bin/bash\necho "Tailer test"\n' + # arrange + self.job.parameters = dict() + self.job.type = 0 # Type.BASH + self.job.parameters["EXTENDED_HEADER"] = header_script + self.job.parameters["EXTENDED_TAILER"] = tailer_script + + self.job._tmp_path = '/tmp/' + + update_content_mock = Mock(return_value='%EXTENDED_HEADER%\nsome-content\n%EXTENDED_TAILER%') + self.job.update_content = update_content_mock + + # fill the rest of the values on the job with something + update_parameters_mock = Mock(return_value=self.job.parameters) + self.job.update_parameters = update_parameters_mock + + # create an autosubmit config + config = Mock(spec=AutosubmitConfig) + + # will create a file on /tmp + self.job.create_script(config) + + with open(os.path.join(self.job._tmp_path, self.job.name + '.cmd')) as script_file: + full_script = script_file.read() + assert header_script in full_script + assert tailer_script in full_script + + def test_that_check_script_returns_false_when_there_is_an_unbound_template_variable(self): # arrange update_content_mock = Mock(return_value='some-content: %UNBOUND%') diff --git a/test/unit/test_job_package.py b/test/unit/test_job_package.py index c6a52166f48a8ee0fe40279ae6c2bfc8449b757d..a929d6c43709bb8bbe179cdfd6237e061a81e018 100644 --- a/test/unit/test_job_package.py +++ b/test/unit/test_job_package.py @@ -1,12 +1,10 @@ from unittest import TestCase -import os -from mock import Mock -from mock import patch +from mock import Mock, patch, MagicMock -from autosubmit.job.job_packages import JobPackageSimple from autosubmit.job.job import Job from autosubmit.job.job_common import Status +from autosubmit.job.job_packages import JobPackageSimple class TestJobPackage(TestCase): @@ -43,24 +41,34 @@ class TestJobPackage(TestCase): def test_job_package_platform_getter(self): self.assertEquals(self.platform, self.job_package.platform) - def test_job_package_submission(self): + @patch('os.path.exists') + def test_job_package_submission(self, os_mock): # arrange write_mock = 
         write_mock = Mock().write = Mock()
-
-        for job in self.jobs:
+        os_mock.return_value = True
+        for job in self.job_package.jobs:
             job._tmp_path = Mock()
+            job.name = "fake-name"
             job._get_paramiko_template = Mock("false","empty")
+            job.file = "fake-file"
+            job.update_parameters = MagicMock(return_value="fake-params")
+            job.parameters = "fake-params"
+
+
         self.job_package._create_scripts = Mock()
         self.job_package._send_files = Mock()
         self.job_package._do_submission = Mock()
-        for job in self.jobs:
-            job.update_parameters = Mock()
+        configuration = Mock()
+        configuration.get_project_type = Mock(return_value='fake-type')
+        configuration.get_project_dir = Mock(return_value='fake-dir')
+        configuration.get_project_name = Mock(return_value='fake-name')
+
         # act
-        self.job_package.submit('fake-config', 'fake-params')
+        self.job_package.submit(configuration, 'fake-params')
         # assert
         for job in self.jobs:
-            job.update_parameters.assert_called_once_with('fake-config', 'fake-params')
+            job.update_parameters.assert_called_once_with(configuration, 'fake-params')
         self.job_package._create_scripts.is_called_once_with()
         self.job_package._send_files.is_called_once_with()
         self.job_package._do_submission.is_called_once_with()