From d4230dd2575981273846389f1a61352bb5541870 Mon Sep 17 00:00:00 2001
From: dbeltran
Date: Mon, 20 Nov 2023 10:03:52 +0100
Subject: [PATCH 01/41] fix str

added autokey word # pending some values, testproof
work ongoing: calendar for splits
calendar
tentative split
Changed blank message to warning
standardized on_submission
debug line
configparser version outdated
Fix issues with totaljobs, check_script
Clean outdated code
added a missing counter for totaljobs
changed import location
simplified send_command
simplified if
removed unused array
Fix monitor not showing the correct status of the workflow when -cw is given
clean code
keys_to_erase
regex changed
configparser version
Fixed the "%CURRENT_*%" issue
added job.packed = False once the job is ready or completed/failed (affects reruns)
fixed queue and partition not being updated properly
delete_edgeless_jobs changed
changed == for is to evaluate the type
removed get_all_filter_jobs
changed _create_jobs_splits
Now, if -cw is used in monitor or inspect, it won't check all the templates
Update configparser version to fix load/save
Fixed a performance issue
Fixed an issue with RUNNING: once adding more edges
fix run
Clean the code a bit
working now, needs some cleaning
test
fix bug
fix bugs, reworked a bit
fix bug
updated test
fixed some bugs, added some docs
more fixes
test
fix pipeline
math
re-added datetime
re-added
fix issue with delay retrial
fix issue with -1
Fixes https://earth.bsc.es/gitlab/es/autosubmit/-/issues/1209
fix grouping
test
fix test
fix
some changes for Bruno's comments
moved if inside gen
Workflow optimizations added (mega squashed commit)
Fixes #1158
added zipp dependency (rocrate in bscearth000)
re-added additional files
"Database is locked" error in the historical db (I think it is an issue on my computer, as it happened in master as well)
QOL when splits is introduced with "" (testing francesc's experiment)
Ran regression test, noticed an issue with experiment a005 and fixed it
convert job to list
changed == for in, to not care about spaces
Fix splits when * and not * are in the same line
added if not monitor
Fix changes
Fix delay
Fixed edge_info
Differences fixed
comments fixed
comments added
N-1
deleted test of deleted function
deleted old code
fixed pipeline
Fixed save
Added version and hpcarch as requisites to change
Improved split_to
(wip) Added "previous" filter
fixed status
.lower() added
Add filter previous docs
python3 or python2 (fixed)
typo: python
updated test
changed configparser version
better detection if data is changed
working, added the real configuration to the docs
changed configparser version
working?
changed test
working?
issue_with_none
Added -f flag to force the recreation from 0 (useful mainly for tests)
maybe almost working
fixed bug with chunk wrapper
fix comments
comments
double #
job_section
comments
docstring added
ref todo changed
wallclock commented
removed funcy
Deleted funcy, updated configparser, which has some fixes in changed files
Improved the run/monitor speed
Fixed some default stuff
fix stats
Some memory changes introduced
added more cases
reformat
Added test_dependencies
changed the location
re-added marked_status
File parameter
reviewing
changed results
removed root = None
update_genealogy, clean unused code
reviewing comments
tests
test
fix pipeline
test fix
added funcy to setup.py
reviewing changes (comments)
reviewing changes (graph enumerate)
reviewing changes (delete comments)
reviewing changes (delete valid parents)
reviewing changes
reviewing changes (numpy)
reviewing changes (docstring)
reviewing changes
fixing Bruno's review comments
Merge latest changes
Fixed ext header to work under this version
Fixed default type
[rocrate] Add RO-Crate support to Autosubmit. This commit includes work
from several other commits, squashed. It started around February 2023,
and by July 2023 it was validated by the RO-Crate community, thanks
especially to Simone Leo. Unit tests and documentation were added as
well. It adds support for the following three RO-Crate profiles in
Autosubmit:
- Process Run Crate
- Workflow Run Crate
- Workflow RO-Crate profile 1.0
This is available through the Autosubmit commands archive and unarchive.
revise the changes
update version
bug
fix an issue with additional_files and \\ variables
added retrial key
Move temp folder outside the for loops to reduce file creation
Rewrite the assertion part
Add Dani's check so that it doesn't complain with "file not found" when the project type is none
add extended header and tailer documentation
test that if the file does not exist, it throws an exception
test all the routes from extended tailer and header except fetching the file
change the hashbang check to the first two characters
Handle if the user sets a value with an empty key
Add R, Bash, and Python extended scripts
Fix an issue with retrials (present in 4.0) found while testing a full run with templates and wrapper
Added platform_name to the variables to load before the rest (mainly when building the dict)
Fixed -cw in create, like in inspect
Re-adapted some test cases to match the new code
workflows fixed
fixing all workflows
# If parent and children have the same number of splits, \\ doesn't make sense, so it is disabled
Remove cycles (job depends on itself)
detail is now a function
Added a local test to compare workflows from 4.0 to 4.1 using the -d option
fix default values
fix split
fixed parent.split == child.split when 1//2
improved test
added get_jobs_filtered test
Improved job_list test
pipeline not working
removed __eq__ due to being incompatible with a large part of the code; changed the test instead
added job_list generate tests
Added __eq__
fixed an issue with dependencies None
Changed DB for PKL in tests
Added more tests
fix wrapper dic
added run_member test
added test_build_job_with_existent_job_list_status test
added compare_section test
added update_parameters test
added add_child test
added _repr test
Old tests working
Only 19 remain, have to double-check grouping
fix job_list
half fix job_list
fix test_job.py
fix checkpoint and doc tests
Fix member_from
more changes
numpy deleted from environment.yml
pep warning fix
added test
fix doc
docs for the new autosubmit_rc env variable
fix doc
added another suppress
added comment
changed try/except for suppress
commented the debug line
Changed version
Changes to the function; fix a bug with the connection; added a close for ._transport of ssh
more fixes
added a debug function
Added a notify for push
force portalocker to <= 2.7
removed inputtimeout from requirements
requirements
2fa notification change
Fix applied to 2fa; the local platform may have been asking for a password
Fix applied to 2fa
indent in docs
dependencies docs
docs
added method parameter "2fa:" instead of 2fa
rollback a few things
2fa threads timeout
timeout test
2fa added docs
CHANGED input for getpass to hide typing (it may not work)
2fa
fix additional files for ecmwf
Fixed more issues: now edgeless nodes are correctly deleted and the dependencies parameter is correctly set; fixed other issues when loading a previous job_list and when the node doesn't have the job
fixed a few workflow inconsistencies
fixed dependency
fixed ready jobs
more fixes
Working, but there is an issue with the initial status
added apply_filter_1_to_1
more test
test
more fixes
basic monitor working
working on fixing merges
Pickle working; further performance improvements in the manage_dependencies part
working with pickle up to 1000000; afterwards it gives a segfault on saving... looking for alternatives
MUCH faster; probably bugged for some cases (wip)
version update
Added a delete function for nodes that are no longer part of the workflow (with a xor); TODO: delete old nodes
Reloading only the necessary; added two methods for asconfparser
Fix reload in create
pkl changes working faster; no memory issues, but thinking about more solutions
corrected parents
testing
fast test
Fixed some bugs with the refactor
More memory and call optimizations; deleted unnecessary attributes when generating the job because they will be added later with the update_parameters method; code for generating jobs runs very fast; inspect working, have to check other commands
Reduced unnecessary operations, reduced memory usage
Using igraph to perform the transitive reduction
added split filter
setstatus refactoring
rebased
cherry-picked fix (changes in nodes declaration)
total_jobs fix
suppress changed
assert
added unit test
split the function into smaller functions to make unit testing easier
fix wrapper and totaljobs
added retrial; the command is failing sometimes
removed submit_cmd from update_cmds
added .get
moved - to the end
changed requirements
now all regexes include _.-
update_version changes
Add mailmap file (cherry picked from commit 4f76976feac077a04e1ba2b80160ee067c5b08b4)
added --exclusive
changed the bugfix
sleep rechanged
fix bug
Changed to pass ro_crate and be more robust to miss introductions
Stats now counts the exact amount of processors when the node is given
Queuing times for inner_jobs fixed
updated test
changed version
.lower() added
Add filter previous
changed name
added test (same as 4.1)
version
Fixes #1158
changed version
Add SECURITY.MD file
Add documentation about the exit status of autosubmit run
Return >0 if the autosubmit command fails
.lower() added
Add filter previous
changed name
added test (same as 4.1)
fixed bug with chunk
version
wallclock
Fixes #1158
added zipp dependency (rocrate in bscearth000)
changed version
---
 autosubmit/job/job.py                                  |  90 +++++++-
 autosubmit/job/job_dict.py                             |  13 +-
 autosubmit/job/job_utils.py                            |  77 +++++++
 .../userguide/configure/develop_a_project.rst          |  11 +
 test/unit/test_dic_jobs.py                             |   6 +
 test/unit/test_job.py                                  | 218 ++++++++++++++++++
 6 files changed, 412 insertions(+), 3 deletions(-)
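Note (illustrative, not part of the patch): the split-calendar keys this patch introduces can be pictured with a minimal experiment configuration. The dict below is a sketch that mirrors the style of the unit tests added later in this patch; the section name SIM is hypothetical.

    # A daily chunk split into hourly pieces: 24 splits with SPLITSIZE=1, 12 with SPLITSIZE=2.
    experiment_data = {
        'EXPERIMENT': {
            'DATELIST': '20000101',
            'CHUNKSIZEUNIT': 'day',   # each chunk covers one day
            'CHUNKSIZE': '1',
            'NUMCHUNKS': '2',
            'CALENDAR': 'standard',
        },
        'JOBS': {
            'SIM': {
                'RUNNING': 'chunk',
                'SPLITS': 'auto',        # number of splits derived from the calendar
                'SPLITSIZE': 1,          # one split per SPLITSIZEUNIT
                'SPLITSIZEUNIT': 'hour',
            }
        }
    }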
diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py
index b697211a7..ed11c74cc 100644
--- a/autosubmit/job/job.py
+++ b/autosubmit/job/job.py
@@ -31,7 +31,7 @@ import os
 import re
 import textwrap
 import time
-from bscearth.utils.date import date2str, parse_date, previous_day, chunk_end_date, chunk_start_date, Log, subs_dates
+from bscearth.utils.date import date2str, parse_date, previous_day, chunk_end_date, chunk_start_date, Log, subs_dates, add_time
 from functools import reduce
 from threading import Thread
 from time import sleep
@@ -42,7 +42,7 @@ from autosubmit.history.experiment_history import ExperimentHistory
 from autosubmit.job.job_common import StatisticsSnippetBash, StatisticsSnippetPython
 from autosubmit.job.job_common import StatisticsSnippetR, StatisticsSnippetEmpty
 from autosubmit.job.job_common import Status, Type, increase_wallclock_by_chunk
-from autosubmit.job.job_utils import get_job_package_code
+from autosubmit.job.job_utils import get_job_package_code, get_split_size_unit, get_split_size
 from autosubmit.platforms.paramiko_submitter import ParamikoSubmitter
 from autosubmitconfigparser.config.basicconfig import BasicConfig
 from autosubmitconfigparser.config.configcommon import AutosubmitConfig
@@ -1798,6 +1798,67 @@ class Job(object):
         parameters['DELAY_RETRIALS'] = self.delay_retrials
         parameters['DELETE_WHEN_EDGELESS'] = self.delete_when_edgeless
 
+    def calendar_split(self, as_conf, parameters):
+        """
+        Compute the calendar parameters (SPLIT_START_*, SPLIT_END_*) for splits.
+        :param as_conf: experiment configuration
+        :param parameters: job parameters to update
+        :return: updated parameters
+        """
+        # Calendar struct, ordered by unit size (year, month, day, hour)
+        job_data = as_conf.jobs_data.get(self.section, {})
+        if job_data.get("SPLITS", None) and self.running != "once":  # "once" jobs have no date
+            # total_split = int(self.splits)
+            split_unit = get_split_size_unit(as_conf.experiment_data, self.section)
+            cal = str(parameters.get('EXPERIMENT.CALENDAR', "standard")).lower()
+            split_length = get_split_size(as_conf.experiment_data, self.section)
+            start_date = parameters.get('CHUNK_START_DATE', None)
+            if start_date:
+                self.date = datetime.datetime.strptime(start_date, "%Y%m%d")
+            split_start = self.split_start_date(self.date, int(self.split), split_length, split_unit, cal)
+            split_end = self.split_end_date(split_start, split_length, split_unit, cal)
+            if split_unit == 'hour':
+                split_end_1 = split_end
+            else:
+                split_end_1 = previous_day(split_end, cal)
+
+            parameters['SPLIT'] = self.split
+            parameters['SPLITSCALENDAR'] = cal
+            parameters['SPLITSIZE'] = split_length
+            parameters['SPLITSIZEUNIT'] = split_unit
+
+            parameters['SPLIT_START_DATE'] = date2str(split_start, self.date_format)
+            parameters['SPLIT_START_YEAR'] = str(split_start.year)
+            parameters['SPLIT_START_MONTH'] = str(split_start.month).zfill(2)
+            parameters['SPLIT_START_DAY'] = str(split_start.day).zfill(2)
+            parameters['SPLIT_START_HOUR'] = str(split_start.hour).zfill(2)
+
+            parameters['SPLIT_SECOND_TO_LAST_DATE'] = date2str(split_end_1, self.date_format)
+            parameters['SPLIT_SECOND_TO_LAST_YEAR'] = str(split_end_1.year)
+            parameters['SPLIT_SECOND_TO_LAST_MONTH'] = str(split_end_1.month).zfill(2)
+            parameters['SPLIT_SECOND_TO_LAST_DAY'] = str(split_end_1.day).zfill(2)
+            parameters['SPLIT_SECOND_TO_LAST_HOUR'] = str(split_end_1.hour).zfill(2)
+
+            parameters['SPLIT_END_DATE'] = date2str(split_end, self.date_format)
+            parameters['SPLIT_END_YEAR'] = str(split_end.year)
+            parameters['SPLIT_END_MONTH'] = str(split_end.month).zfill(2)
+            parameters['SPLIT_END_DAY'] = str(split_end.day).zfill(2)
+            parameters['SPLIT_END_HOUR'] = str(split_end.hour).zfill(2)
+            if int(self.split) == 1:
+                parameters['SPLIT_FIRST'] = 'TRUE'
+            else:
+                parameters['SPLIT_FIRST'] = 'FALSE'
+            # if int(total_split) == int(self.split):
+            #     parameters['SPLIT_LAST'] = 'TRUE'
+            # else:
+            #     parameters['SPLIT_LAST'] = 'FALSE'
+        return parameters
+
         if self.date is not None and len(str(self.date)) > 0:
             if self.chunk is None and len(str(self.chunk)) > 0:
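Note (illustrative, not part of the patch): the helpers split_start_date and split_end_date are not shown in this hunk. A minimal sketch of the arithmetic they are expected to perform, assuming hourly units only (the function name split_window is hypothetical):

    import datetime

    def split_window(chunk_start, split, split_length, split_unit):
        """Return (start, end) of the given 1-based split inside a chunk (sketch, hour unit only)."""
        if split_unit != "hour":
            raise NotImplementedError("this sketch only covers hourly splits")
        delta = datetime.timedelta(hours=split_length)
        start = chunk_start + (split - 1) * delta  # splits are laid out back to back
        return start, start + delta

    # Split 3 of a day chunk starting 2000-01-01, with SPLITSIZE=2 hours -> 04:00 to 06:00
    start, end = split_window(datetime.datetime(2000, 1, 1), 3, 2, "hour")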
@@ -1859,6 +1920,31 @@ class Job(object):
                 parameters['CHUNK_LAST'] = 'TRUE'
             else:
                 parameters['CHUNK_LAST'] = 'FALSE'
+        return parameters
+
+    def update_job_parameters(self, as_conf, parameters):
+        self.splits = as_conf.jobs_data[self.section].get("SPLITS", None)
+        self.delete_when_edgeless = as_conf.jobs_data[self.section].get("DELETE_WHEN_EDGELESS", True)
+        self.check = as_conf.jobs_data[self.section].get("CHECK", False)
+        self.check_warnings = as_conf.jobs_data[self.section].get("CHECK_WARNINGS", False)
+        if self.checkpoint:  # To activate placeholder substitution in the template
+            parameters["AS_CHECKPOINT"] = self.checkpoint
+        parameters['JOBNAME'] = self.name
+        parameters['FAIL_COUNT'] = str(self.fail_count)
+        parameters['SDATE'] = self.sdate
+        parameters['MEMBER'] = self.member
+        parameters['SPLIT'] = self.split
+        parameters['SPLITS'] = self.splits
+        parameters['DELAY'] = self.delay
+        parameters['FREQUENCY'] = self.frequency
+        parameters['SYNCHRONIZE'] = self.synchronize
+        parameters['PACKED'] = self.packed
+        parameters['CHUNK'] = 1
+        parameters['RETRIALS'] = self.retrials
+        parameters['DELAY_RETRIALS'] = self.delay_retrials
+        parameters['DELETE_WHEN_EDGELESS'] = self.delete_when_edgeless
+        parameters = self.calendar_chunk(parameters)
+        parameters = self.calendar_split(as_conf, parameters)
         parameters['NUMMEMBERS'] = len(as_conf.get_member_list())
         self.dependencies = as_conf.jobs_data[self.section].get("DEPENDENCIES", "")
         self.dependencies = str(self.dependencies)
diff --git a/autosubmit/job/job_dict.py b/autosubmit/job/job_dict.py
index ec22a2a25..e4a651953 100644
--- a/autosubmit/job/job_dict.py
+++ b/autosubmit/job/job_dict.py
@@ -21,10 +21,12 @@ from bscearth.utils.date import date2str
 
 from autosubmit.job.job import Job
+from autosubmit.job.job_utils import get_split_size_unit, get_split_size, calendar_chunk_section
 from autosubmit.job.job_common import Status
 import datetime
 import re
+from log.log import AutosubmitCritical
 
 class DicJobs:
@@ -144,8 +146,15 @@ class DicJobs:
         """
         self.compare_section(section)
         parameters = self.experiment_data["JOBS"]
-        splits = int(parameters[section].get("SPLITS", -1))
+        splits = parameters[section].get("SPLITS", -1)
         running = str(parameters[section].get('RUNNING', "once")).lower()
+        if running != "chunk":
+            if str(splits).isdigit() or splits == -1:
+                splits = int(splits)
+            elif splits == "auto":
+                raise AutosubmitCritical("SPLITS: auto is only allowed for chunk-split jobs")
+            else:
+                raise AutosubmitCritical(f"SPLITS must be an integer: {splits}")
         frequency = int(parameters[section].get("FREQUENCY", 1))
         if running == 'once':
             self._create_jobs_once(section, priority, default_job_type, splits)
@@ -258,6 +267,8 @@ class DicJobs:
             self._dic[section][date][member] = dict()
             count = 0
             for chunk in (chunk for chunk in self._chunk_list):
+                if splits == "auto":
+                    splits = calendar_chunk_section(self.experiment_data, section, date, chunk)
                 count += 1
                 if delay == -1 or delay < chunk:
                     if count % frequency == 0 or count == len(self._chunk_list):
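Note (illustrative, not part of the patch): the validation above accepts an integer (or -1) for SPLITS, and additionally the literal string "auto" for jobs running per chunk. A self-contained sketch of the rule, outside the DicJobs class (the real code raises AutosubmitCritical instead of ValueError):

    def validate_splits(splits, running):
        """Mirror of the SPLITS validation above (sketch)."""
        if running == "chunk":
            return splits  # "auto" is resolved later, per chunk, via calendar_chunk_section()
        if str(splits).isdigit() or splits == -1:
            return int(splits)
        if splits == "auto":
            raise ValueError("SPLITS: auto is only allowed for chunk-split jobs")
        raise ValueError(f"SPLITS must be an integer: {splits}")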
diff --git a/autosubmit/job/job_utils.py b/autosubmit/job/job_utils.py
index c02a92952..08748edd2 100644
--- a/autosubmit/job/job_utils.py
+++ b/autosubmit/job/job_utils.py
@@ -1,4 +1,11 @@
 #!/usr/bin/env python3
+import math
+from log.log import Log, AutosubmitCritical
+import os
+from autosubmit.job.job_package_persistence import JobPackagePersistence
+from autosubmitconfigparser.config.basicconfig import BasicConfig
+from typing import Dict
+from bscearth.utils.date import date2str, previous_day, chunk_end_date, chunk_start_date, subs_dates
 
 # Copyright 2017-2020 Earth Sciences Department, BSC-CNS
@@ -22,6 +29,76 @@ from autosubmit.job.job_package_persistence import JobPackagePersistence
 from autosubmitconfigparser.config.basicconfig import BasicConfig
 from typing import Dict
 
+def is_leap_year(year):
+    """Determine whether a year is a leap year."""
+    return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
+
+def calendar_chunk_section(exp_data, section, date, chunk):
+    """
+    Compute the number of calendar splits for a chunk.
+    :param exp_data: experiment data
+    :param section: job section
+    :param date: chunk start date
+    :param chunk: chunk number
+    :return: number of splits
+    """
+    date_str = date2str(date)
+    splits = 0
+    jobs_data = exp_data.get('JOBS', {})
+    split_unit = str(exp_data.get("EXPERIMENT", {}).get('SPLITSIZEUNIT', jobs_data.get(section, {}).get("SPLITSIZEUNIT", None))).lower()
+    chunk_unit = str(exp_data.get("EXPERIMENT", {}).get('CHUNKSIZEUNIT', "day")).lower()
+    if chunk_unit == "hour":
+        raise AutosubmitCritical("Chunk unit is hour; Autosubmit doesn't support splits lower than an hour. Please change the chunk unit to day or higher, or don't use calendar splits.")
+    if jobs_data.get(section, {}).get("RUNNING", "once") != "once":
+        chunk_length = int(exp_data.get("EXPERIMENT", {}).get('CHUNKSIZE', 1))
+        cal = str(exp_data.get('CALENDAR', "standard")).lower()
+        chunk_start = chunk_start_date(
+            date, chunk, chunk_length, chunk_unit, cal)
+        chunk_end = chunk_end_date(
+            chunk_start, chunk_length, chunk_unit, cal)
+        run_days = subs_dates(chunk_start, chunk_end, cal)
+        if split_unit == "none":
+            if chunk_unit == "day":
+                split_unit = "hour"
+            elif chunk_unit == "month":
+                split_unit = "day"
+            elif chunk_unit == "year":
+                split_unit = "month"
+        if split_unit == "hour":
+            num_max_splits = run_days * 24
+        elif split_unit == "month":
+            num_max_splits = run_days / 12
+        elif split_unit == "year":
+            if not is_leap_year(chunk_start.year) or cal == "noleap":
+                num_max_splits = run_days / 365
+            else:
+                num_max_splits = run_days / 366
+        else:
+            num_max_splits = run_days
+        split_size = get_split_size(exp_data, section)
+        splits = num_max_splits / split_size
+        if not splits.is_integer():
+            Log.warning(f"The number of splits is not exact; this leads to one extra split.\nJOB_DATE:{date_str}, JOB_CHUNK:{chunk}: n_splits:{num_max_splits}/{split_size}={splits}")
+        splits = math.ceil(splits)
+    return splits
+
+def get_split_size_unit(data, section):
+    split_unit = str(data.get('JOBS', {}).get(section, {}).get('SPLITSIZEUNIT', "none")).lower()
+    if split_unit == "none":
+        split_unit = str(data.get('EXPERIMENT', {}).get("CHUNKSIZEUNIT", "day")).lower()
+        if split_unit == "year":
+            return "month"
+        elif split_unit == "month":
+            return "day"
+        elif split_unit == "day":
+            return "hour"
+        else:
+            return "day"
+    return split_unit
+
+def get_split_size(as_conf, section):
+    job_data = as_conf.get('JOBS', {}).get(section, {})
+    exp_data = as_conf.get('EXPERIMENT', {})
+    return int(job_data.get("SPLITSIZE", exp_data.get("SPLITSIZE", exp_data.get('CHUNKSIZE', 1))))
 
 def transitive_reduction(graph):
     """
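Note (illustrative, not part of the patch): a worked example of the counts calendar_chunk_section produces, with numbers taken from the unit tests added later in this patch:

    # CHUNKSIZEUNIT=day -> split unit defaults to hour, so a 1-day chunk holds 24 hours.
    #   SPLITSIZE=1 -> 24 / 1 = 24 splits; SPLITSIZE=2 -> 24 / 2 = 12 splits.
    # CHUNKSIZEUNIT=month, January -> run_days = 31, split unit defaults to day:
    #   SPLITSIZE=2 -> 31 / 2 = 15.5, not an integer, so it is rounded up to 16 splits.
    import math
    print(math.ceil(31 / 2))  # 16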
diff --git a/docs/source/userguide/configure/develop_a_project.rst b/docs/source/userguide/configure/develop_a_project.rst
index 5da3114a2..0a2346f97 100644
--- a/docs/source/userguide/configure/develop_a_project.rst
+++ b/docs/source/userguide/configure/develop_a_project.rst
@@ -40,12 +40,17 @@ Expdef configuration
       MEMBERS: fc0
       # Chunk size unit. STRING: hour, day, month, year
       CHUNKSIZEUNIT: month
+      # Split size unit. STRING: hour, day, month, year; must be lower than CHUNKSIZEUNIT
+      SPLITSIZEUNIT: day # default: CHUNKSIZEUNIT-1 (month-1 == day)
       # Chunk size. NUMERIC: 4, 6, 12
       CHUNKSIZE: 1
+      # Split size. NUMERIC: 4, 6, 12
+      SPLITSIZE: 1
       # Total number of chunks in experiment. NUMERIC: 30, 15, 10
       NUMCHUNKS: 2
       # Calendar used. LIST: standard, noleap
       CALENDAR: standard
+
       # List of members that can be included in this run. Optional.
       # RUN_ONLY_MEMBERS: fc0 fc1 fc2 fc3 fc4
       # RUN_ONLY_MEMBERS: fc[0-4]
@@ -201,6 +206,12 @@ Jobs configuration
         ## Specify the path to the interpreter. If empty, use system default based on job type. Default: empty
         # EXECUTABLE: /my_python_env/python3
+        # Split the job into N jobs. If not specified, defaults to None
+        # SPLITS: 2
+        # Size unit of the split. Options: hour, day, month, year. Defaults to EXPERIMENT.CHUNKSIZEUNIT-1
+        # SPLITSIZEUNIT: day
+        # Size of the split. If not specified, defaults to 1
+        # SPLITSIZE: 1
     LOCAL_SETUP:
         FILE: LOCAL_SETUP.sh
diff --git a/test/unit/test_dic_jobs.py b/test/unit/test_dic_jobs.py
index bf5360070..f8b2138e6 100644
--- a/test/unit/test_dic_jobs.py
+++ b/test/unit/test_dic_jobs.py
@@ -594,6 +594,12 @@ class TestDicJobs(TestCase):
         self.dictionary._create_jobs_split(5, 'fake-section', 'fake-date', 'fake-member', 'fake-chunk', 0, Type.BASH, section_data)
         self.assertEqual(5, len(section_data))
 
+    @patch('autosubmit.job.job_dict.date2str')
+    def test_create_jobs_split(self, mock_date2str):
+        mock_date2str.side_effect = lambda x, y: str(x)
+        section_data = []
+        self.dictionary._create_jobs_split(5, 'fake-section', 'fake-date', 'fake-member', 'fake-chunk', 0, Type.BASH, section_data)
+        self.assertEqual(5, len(section_data))
diff --git a/test/unit/test_job.py b/test/unit/test_job.py
index f4887886c..cf8125b58 100644
--- a/test/unit/test_job.py
+++ b/test/unit/test_job.py
@@ -5,11 +5,13 @@ import sys
 import tempfile
 from pathlib import Path
 from autosubmit.job.job_list_persistence import JobListPersistencePkl
+import datetime
 
 # compatibility with both versions (2 & 3)
 from sys import version_info
 from textwrap import dedent
 from unittest import TestCase
+from autosubmit.job.job_utils import calendar_chunk_section
 from autosubmitconfigparser.config.configcommon import AutosubmitConfig
 from autosubmitconfigparser.config.configcommon import BasicConfig, YAMLParserFactory
@@ -1185,6 +1187,222 @@ CONFIG:
         self.assertEqual(1, len(self.job.children))
         self.assertEqual(child, list(self.job.children)[0])
 
+    def test_auto_calendar_split(self):
+        self.experiment_data = {
+            'EXPERIMENT': {
+                'DATELIST': '20000101',
+                'MEMBERS': 'fc0',
+                'CHUNKSIZEUNIT': 'day',
+                'CHUNKSIZE': '1',
+                'NUMCHUNKS': '2',
+                'CALENDAR': 'standard'
+            },
+            'JOBS': {
+                'A': {
+                    'FILE': 'a',
+                    'PLATFORM': 'test',
+                    'RUNNING': 'chunk',
+                    'SPLITS': 'auto',
+                    'SPLITSIZE': 1
+                },
+                'B': {
+                    'FILE': 'b',
+                    'PLATFORM': 'test',
+                    'RUNNING': 'chunk',
+                    'SPLITS': 'auto',
+                    'SPLITSIZE': 2
+                }
+            }
+        }
+        section = "A"
+        date = datetime.datetime.strptime("20000101", "%Y%m%d")
+        chunk = 1
+        splits = calendar_chunk_section(self.experiment_data, section, date, chunk)
+        self.assertEqual(splits, 24)
+        splits = calendar_chunk_section(self.experiment_data, "B", date, chunk)
+        self.assertEqual(splits, 12)
+        self.experiment_data['EXPERIMENT']['CHUNKSIZEUNIT'] = 'hour'
+        with self.assertRaises(AutosubmitCritical):
+            calendar_chunk_section(self.experiment_data, "A", date, chunk)
+
+        self.experiment_data['EXPERIMENT']['CHUNKSIZEUNIT'] = 'month'
+        splits = calendar_chunk_section(self.experiment_data, "A", date, chunk)
+        self.assertEqual(splits, 31)
+        splits = calendar_chunk_section(self.experiment_data, "B", date, chunk)
+        self.assertEqual(splits, 16)
+        self.experiment_data['EXPERIMENT']['CHUNKSIZEUNIT'] = 'year'
+        splits = calendar_chunk_section(self.experiment_data, "A", date, chunk)
+        self.assertEqual(splits, 31)
+        splits = calendar_chunk_section(self.experiment_data, "B", date, chunk)
+        self.assertEqual(splits, 16)
+
+    def test_calendar(self):
+        split = 12
+        splitsize = 2
+        expid = 'zzyy'
+        with tempfile.TemporaryDirectory() as temp_dir:
+            BasicConfig.LOCAL_ROOT_DIR = str(temp_dir)
+            Path(temp_dir, expid).mkdir()
+            for path in [f'{expid}/tmp', f'{expid}/tmp/ASLOGS', f'{expid}/tmp/ASLOGS_{expid}', f'{expid}/proj',
+                         f'{expid}/conf']:
+                Path(temp_dir, path).mkdir()
+            with open(Path(temp_dir, f'{expid}/conf/minimal.yml'), 'w+') as minimal:
+                minimal.write(dedent(f'''\
+                CONFIG:
+                  RETRIALS: 0
+                DEFAULT:
+                  EXPID: {expid}
+                  HPCARCH: test
+                EXPERIMENT:
+                  # List of start dates
+                  DATELIST: '20000101'
+                  # List of members.
+                  MEMBERS: fc0
+                  # Unit of the chunk size. Can be hour, day, month, or year.
+                  CHUNKSIZEUNIT: day
+                  # Size of each chunk.
+                  CHUNKSIZE: '4'
+                  # Size of each split
+                  SPLITSIZE: {splitsize}
+                  # Number of chunks of the experiment.
+                  NUMCHUNKS: '2'
+                  CHUNKINI: ''
+                  # Calendar used for the experiment. Can be standard or noleap.
+                  CALENDAR: standard
+
+                JOBS:
+                  A:
+                    FILE: a
+                    PLATFORM: test
+                    RUNNING: chunk
+                    SPLITS: {split}
+                    SPLITSIZE: {splitsize}
+                PLATFORMS:
+                  test:
+                    TYPE: slurm
+                    HOST: localhost
+                    PROJECT: abc
+                    QUEUE: debug
+                    USER: me
+                    SCRATCH_DIR: /anything/
+                    ADD_PROJECT_TO_HOST: False
+                    MAX_WALLCLOCK: '00:55'
+                    TEMP_DIR: ''
+                '''))
+                minimal.flush()
+
+            basic_config = FakeBasicConfig()
+            basic_config.read()
+            basic_config.LOCAL_ROOT_DIR = str(temp_dir)
+
+            config = AutosubmitConfig(expid, basic_config=basic_config, parser_factory=YAMLParserFactory())
+            config.reload(True)
+            parameters = config.load_parameters()
+
+            job_list = JobList(expid, basic_config, YAMLParserFactory(),
+                               Autosubmit._get_job_list_persistence(expid, config), config)
+            job_list.generate(
+                as_conf=config,
+                date_list=[datetime.datetime.strptime("20000101", "%Y%m%d")],
+                member_list=["fc0"],
+                num_chunks=2,
+                chunk_ini=1,
+                parameters=parameters,
+                date_format='',
+                default_retrials=config.get_retrials(),
+                default_job_type=config.get_default_job_type(),
+                wrapper_jobs={},
+                new=True,
+                run_only_members=config.get_member_list(run_only=True),
+                show_log=True,
+            )
+            job_list = job_list.get_job_list()
+            self.assertEqual(24, len(job_list))
+
+            submitter = Autosubmit._get_submitter(config)
+            submitter.load_platforms(config)
+
+            hpcarch = config.get_platform()
+            for job in job_list:
+                job.date_format = ""
+                if job.platform_name == "" or job.platform_name is None:
+                    job.platform_name = hpcarch
+                job.platform = submitter.platforms[job.platform_name]
+
+            # Check splits
+            # Assert general
+            job = job_list[0]
+            parameters = job.update_parameters(config, parameters)
+            self.assertEqual(job.splits, 12)
+            self.assertEqual(job.running, 'chunk')
+
+            self.assertEqual(parameters['SPLIT'], 1)
+            self.assertEqual(parameters['SPLITSIZE'], splitsize)
+            self.assertEqual(parameters['SPLITSIZEUNIT'], 'hour')
+            self.assertEqual(parameters['SPLITSCALENDAR'], 'standard')
+            # assert parameters
+            next_start = "00"
+            for i, job in enumerate(job_list[0:12]):
+                parameters = job.update_parameters(config, parameters)
+                end_hour = str(parameters['SPLIT'] * splitsize).zfill(2)
+                if end_hour == "24":
+                    end_hour = "00"
+                self.assertEqual(parameters['SPLIT'], i + 1)
+                self.assertEqual(parameters['SPLITSIZE'], splitsize)
+                self.assertEqual(parameters['SPLITSIZEUNIT'], 'hour')
+                self.assertEqual(parameters['SPLIT_START_DATE'], '20000101')
+                self.assertEqual(parameters['SPLIT_START_YEAR'], '2000')
+                self.assertEqual(parameters['SPLIT_START_MONTH'], '01')
+                self.assertEqual(parameters['SPLIT_START_DAY'], '01')
+                self.assertEqual(parameters['SPLIT_START_HOUR'], next_start)
+                if parameters['SPLIT'] == 12:
+                    self.assertEqual(parameters['SPLIT_END_DATE'], '20000102')
+                    self.assertEqual(parameters['SPLIT_END_DAY'], '02')
+                    self.assertEqual(parameters['SPLIT_END_YEAR'], '2000')
+                    self.assertEqual(parameters['SPLIT_END_MONTH'], '01')
+                    self.assertEqual(parameters['SPLIT_END_HOUR'], end_hour)
+                else:
+                    self.assertEqual(parameters['SPLIT_END_DATE'], '20000101')
+                    self.assertEqual(parameters['SPLIT_END_DAY'], '01')
+                    self.assertEqual(parameters['SPLIT_END_YEAR'], '2000')
+                    self.assertEqual(parameters['SPLIT_END_MONTH'], '01')
+                    self.assertEqual(parameters['SPLIT_END_HOUR'], end_hour)
+                next_start = parameters['SPLIT_END_HOUR']
+            next_start = "00"
+            for i, job in enumerate(job_list[12:24]):
+                parameters = job.update_parameters(config, parameters)
+                end_hour = str(parameters['SPLIT'] * splitsize).zfill(2)
+                if end_hour == "24":
+                    end_hour = "00"
+                self.assertEqual(parameters['SPLIT'], i + 1)
+                self.assertEqual(parameters['SPLITSIZE'], splitsize)
+                self.assertEqual(parameters['SPLITSIZEUNIT'], 'hour')
+                self.assertEqual(parameters['SPLIT_START_DATE'], '20000105')
+                self.assertEqual(parameters['SPLIT_START_YEAR'], '2000')
+                self.assertEqual(parameters['SPLIT_START_MONTH'], '01')
+                self.assertEqual(parameters['SPLIT_START_DAY'], '05')
+                self.assertEqual(parameters['SPLIT_START_HOUR'], next_start)
+                if parameters['SPLIT'] == 12:
+                    self.assertEqual(parameters['SPLIT_END_DATE'], '20000106')
+                    self.assertEqual(parameters['SPLIT_END_DAY'], '06')
+                    self.assertEqual(parameters['SPLIT_END_YEAR'], '2000')
+                    self.assertEqual(parameters['SPLIT_END_MONTH'], '01')
+                    self.assertEqual(parameters['SPLIT_END_HOUR'], end_hour)
+                else:
+                    self.assertEqual(parameters['SPLIT_END_DATE'], '20000105')
+                    self.assertEqual(parameters['SPLIT_END_DAY'], '05')
+                    self.assertEqual(parameters['SPLIT_END_YEAR'], '2000')
+                    self.assertEqual(parameters['SPLIT_END_MONTH'], '01')
+                    self.assertEqual(parameters['SPLIT_END_HOUR'], end_hour)
+                next_start = parameters['SPLIT_END_HOUR']
+
 class FakeBasicConfig:
-- 
GitLab

From fe83d03393a863ae64c7ffeaf389cc31cb2a297f Mon Sep 17 00:00:00 2001
From: dbeltran
Date: Wed, 20 Mar 2024 11:33:05 +0100
Subject: [PATCH 02/41] Added some error detection

---
 autosubmit/job/job_utils.py | 55 +++++++++++++++++++++++++++++++------
 1 file changed, 47 insertions(+), 8 deletions(-)

diff --git a/autosubmit/job/job_utils.py b/autosubmit/job/job_utils.py
index 08748edd2..97d74d22a 100644
--- a/autosubmit/job/job_utils.py
+++ b/autosubmit/job/job_utils.py
@@ -29,10 +29,47 @@ from autosubmit.job.job_package_persistence import JobPackagePersistence
 from autosubmitconfigparser.config.basicconfig import BasicConfig
 from typing import Dict
 
+CALENDAR_UNITSIZE_ENUM = {
+    "hour": 0,
+    "day": 1,
+    "month": 2,
+    "year": 3
+}
+
 def is_leap_year(year):
     """Determine whether a year is a leap year."""
     return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
 
+def calendar_unitsize_isgreater(split_unit, chunk_unit):
+    """
+    Check if the split unit is greater than the chunk unit.
+    :param split_unit: split size unit
+    :param chunk_unit: chunk size unit
+    :return: True if the split unit is greater
+    """
+    split_unit = split_unit.lower()
+    chunk_unit = chunk_unit.lower()
+    try:
+        return CALENDAR_UNITSIZE_ENUM[split_unit] > CALENDAR_UNITSIZE_ENUM[chunk_unit]
+    except KeyError:
+        raise AutosubmitCritical(f"Invalid calendar unit size: {split_unit} or {chunk_unit}")
+
+def calendar_unitsize_getlowersize(unitsize):
+    """
+    Get the next lower calendar unit size.
+    :return: the unit one step below the given one
+    """
+    unit_size = unitsize.lower()
+    unit_value = CALENDAR_UNITSIZE_ENUM[unit_size]
+    if unit_value == 0:
+        return "hour"
+    else:
+        return list(CALENDAR_UNITSIZE_ENUM.keys())[unit_value - 1]
+
+
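Note (illustrative, not part of the patch): the enum above orders calendar units by size, hour(0) < day(1) < month(2) < year(3). A quick sketch of the two helpers, using the functions exactly as defined above:

    calendar_unitsize_isgreater("month", "day")   # True  -> invalid: splits larger than the chunk
    calendar_unitsize_isgreater("hour", "day")    # False -> valid
    calendar_unitsize_getlowersize("month")       # "day" (used when SPLITSIZEUNIT is unset)
    calendar_unitsize_getlowersize("hour")        # "hour" (already the smallest unit)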
 def calendar_chunk_section(exp_data, section, date, chunk):
     """
     Compute the number of calendar splits for a chunk.
@@ -45,6 +82,7 @@ def calendar_chunk_section(exp_data, section, date, chunk):
     date_str = date2str(date)
     splits = 0
     jobs_data = exp_data.get('JOBS', {})
     split_unit = str(exp_data.get("EXPERIMENT", {}).get('SPLITSIZEUNIT', jobs_data.get(section, {}).get("SPLITSIZEUNIT", None))).lower()
     chunk_unit = str(exp_data.get("EXPERIMENT", {}).get('CHUNKSIZEUNIT', "day")).lower()
+    split_policy = str(exp_data.get("EXPERIMENT", {}).get('SPLITPOLICY', jobs_data.get(section, {}).get("SPLITPOLICY", "flexible"))).lower()
     if chunk_unit == "hour":
         raise AutosubmitCritical("Chunk unit is hour; Autosubmit doesn't support splits lower than an hour. Please change the chunk unit to day or higher, or don't use calendar splits.")
     if jobs_data.get(section, {}).get("RUNNING", "once") != "once":
@@ -56,12 +94,9 @@ def calendar_chunk_section(exp_data, section, date, chunk):
             chunk_start, chunk_length, chunk_unit, cal)
         run_days = subs_dates(chunk_start, chunk_end, cal)
         if split_unit == "none":
-            if chunk_unit == "day":
-                split_unit = "hour"
-            elif chunk_unit == "month":
-                split_unit = "day"
-            elif chunk_unit == "year":
-                split_unit = "month"
+            split_unit = calendar_unitsize_getlowersize(chunk_unit)
+        if calendar_unitsize_isgreater(split_unit, chunk_unit):
+            raise AutosubmitCritical("The split unit is greater than the chunk unit. Autosubmit doesn't support this configuration. Please change the split unit to day or lower, or don't use calendar splits.")
         if split_unit == "hour":
             num_max_splits = run_days * 24
         elif split_unit == "month":
@@ -75,9 +110,13 @@ def calendar_chunk_section(exp_data, section, date, chunk):
             num_max_splits = run_days
         split_size = get_split_size(exp_data, section)
         splits = num_max_splits / split_size
-        if not splits.is_integer():
-            Log.warning(f"The number of splits is not exact; this leads to one extra split.\nJOB_DATE:{date_str}, JOB_CHUNK:{chunk}: n_splits:{num_max_splits}/{split_size}={splits}")
+        if not splits.is_integer() and split_policy == "flexible":
+            Log.warning(f"The number of splits: {num_max_splits}/{split_size} is not an integer. The number of splits will be rounded up due to the flexible split policy.\nYou can set the SPLITPOLICY parameter of section {section} to 'strict' to avoid this behavior.")
+        elif not splits.is_integer() and split_policy == "strict":
+            raise AutosubmitCritical(f"The number of splits: {num_max_splits}/{split_size} is not an integer, and the strict split policy does not allow rounding up.\nYou can set the SPLITPOLICY parameter of section {section} to 'flexible' to round up the number.")
         splits = math.ceil(splits)
+        Log.info(f"For section {section} with date: {date_str}, the number of splits is {splits}.")
+
     return splits
 
 def get_split_size_unit(data, section):
-- 
GitLab

From 1f8c13f6c3419ebc2596c604a40f594bae1e5539 Mon Sep 17 00:00:00 2001
From: dbeltran
Date: Mon, 4 Mar 2024 12:47:43 +0100
Subject: [PATCH 03/41] needs test

---
 autosubmit/job/job.py          |  4 +++
 autosubmit/job/job_packages.py | 53 ++++++++++++++--------------------
 2 files changed, 26 insertions(+), 31 deletions(-)

diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py
index ed11c74cc..8b8897df0 100644
--- a/autosubmit/job/job.py
+++ b/autosubmit/job/job.py
@@ -1709,6 +1709,9 @@ class Job(object):
         if as_conf.get_project_type() != "none":
             parameters['EXTENDED_HEADER'] = self.read_header_tailer_script(self.ext_header_path, as_conf, True)
             parameters['EXTENDED_TAILER'] = self.read_header_tailer_script(self.ext_tailer_path, as_conf, False)
+        else:  # If not set, a warning would be shown when the script is checked
+            parameters['EXTENDED_HEADER'] = ""
+            parameters['EXTENDED_TAILER'] = ""
         parameters['CURRENT_QUEUE'] = self.queue
         parameters['RESERVATION'] = self.reservation
         parameters['CURRENT_EC_QUEUE'] = self.ec_queue
@@ -2244,6 +2247,7 @@ class Job(object):
         variables_tmp = [variable[1:-1] for variable in variables_tmp]
         variables_tmp = [variable for variable in variables_tmp if variable not in self.default_parameters]
         variables.extend(variables_tmp)
+
         out = set(parameters).issuperset(set(variables))
         # Check if the variables in the templates are defined in the configurations
         if not out:
diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py
index 86e790791..57376e602 100644
--- a/autosubmit/job/job_packages.py
+++ b/autosubmit/job/job_packages.py
@@ -96,22 +96,16 @@ class JobPackageBase(object):
     @threaded
     def check_scripts(self, jobs, configuration, parameters, only_generate, hold):
         for job in jobs:
-            if str(job.check).lower() == str(Job.CHECK_ON_SUBMISSION).lower():
-                if only_generate:
-                    exit_ = True
-                    break
-                if not os.path.exists(os.path.join(configuration.get_project_dir(), job.file)):
-                    lock.acquire()
-                    if configuration.get_project_type().lower() != "none" and len(configuration.get_project_type()) > 0:
-                        raise AutosubmitCritical(
-                            "Template [ {0} ] using CHECK=On_submission has some empty variable {0}".format(
-                                job.name), 7014)
-                    lock.release()
-                if not job.check_script(configuration, parameters, show_logs=job.check_warnings):
-                    Log.warning("Script {0} check failed", job.name)
-                    Log.warning("On submission script has some empty variables")
-                else:
-                    Log.result("Script {0} OK", job.name)
+            if only_generate and not os.path.exists(os.path.join(configuration.get_project_dir(), job.file)):
+                break
+            else:
+                if configuration.get_project_type().lower() != "none" and len(configuration.get_project_type()) > 0:
+                    raise AutosubmitCritical(f"Job script: {job.file} does not exist", 7014)
+                if not job.check_script(configuration, parameters, show_logs=job.check_warnings):
+                    Log.warning("Script {0} check failed", job.name)
+                    Log.warning("On submission script has some empty variables")
+                else:
+                    Log.result("Script {0} OK", job.name)
             # looking for directives on jobs
             self._custom_directives = self._custom_directives | set(job.custom_directives)
 
     @threaded
     def _check_scripts(self, jobs, configuration, parameters, only_generate, hold):
@@ -123,6 +117,7 @@ class JobPackageBase(object):
         pass
 
+
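Note (illustrative, not part of the patch): the removed branch above keyed off CHECK=On_submission, which defers template checking until submit time. A hedged sketch of the job configuration that triggers it, in the dict style used by the tests (section name SIM and file path are hypothetical):

    jobs_data = {
        'SIM': {
            'FILE': 'templates/sim.sh',
            'CHECK': 'on_submission',   # defer placeholder checking until the job is submitted
            'CHECK_WARNINGS': True,     # log a warning for empty variables when checking
        }
    }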
     def submit(self, configuration, parameters, only_generate=False, hold=False):
         """
         :param hold:
@@ -148,21 +143,17 @@ class JobPackageBase(object):
         try:
             if len(self.jobs) < thread_number:
                 for job in self.jobs:
-                    if job.check == Job.CHECK_ON_SUBMISSION.lower():
-                        if only_generate:
-                            exit_ = True
-                            break
-                        if not os.path.exists(os.path.join(configuration.get_project_dir(), job.file)):
-                            if configuration.get_project_type().lower() != "none" and len(configuration.get_project_type()) > 0:
-                                raise AutosubmitCritical("Template [ {0} ] using CHECK=On_submission has some empty variable {0}".format(job.name), 7014)
-                        if not job.check_script(configuration, parameters, show_logs=job.check_warnings):
-                            Log.warning("Script {0} check failed", job.name)
-                            Log.warning("On submission script has some empty variables")
-                        else:
-                            Log.result("Script {0} OK", job.name)
-                    job.update_parameters(configuration, parameters)
-                    # Looking for special variables
+                    if only_generate and not os.path.exists(os.path.join(configuration.get_project_dir(), job.file)):
+                        exit_ = True
+                        break
+                    if not os.path.exists(os.path.join(configuration.get_project_dir(), job.file)):
+                        if configuration.get_project_type().lower() != "none" and len(configuration.get_project_type()) > 0:
+                            raise AutosubmitCritical("Template [ {0} ] using CHECK=On_submission has some empty variable {0}".format(job.name), 7014)
+                    if not job.check_script(configuration, parameters, show_logs=job.check_warnings):
+                        Log.warning("Script {0} check failed", job.name)
+                        Log.warning("On submission script has some empty variables")
+                    else:
+                        Log.result("Script {0} OK", job.name)
                     # looking for directives on jobs
                     self._custom_directives = self._custom_directives | set(job.custom_directives)
             else:
-- 
GitLab

From 72270204ec13393ac6430cb9978d1fd511b6a6cb Mon Sep 17 00:00:00 2001
From: dbeltran
Date: Tue, 5 Mar 2024 17:05:26 +0100
Subject: [PATCH 04/41] Fixed issues with the exp_history

Fixed issues related to clean_runtime_parameters (now deleted)

---
 autosubmit/autosubmit.py                            |   7 -
 .../experiment_history_db_manager.py                |   3 +
 autosubmit/history/experiment_history.py            | 187 ++++++++++--------
 autosubmit/job/job.py                               |  11 --
 4 files changed, 106 insertions(+), 102 deletions(-)
diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py
index 6abc3cc45..bb243e0d5 100644
--- a/autosubmit/autosubmit.py
+++ b/autosubmit/autosubmit.py
@@ -2386,9 +2386,6 @@ class Autosubmit:
                                     hold=hold)
         # Jobs that are being retrieved in batch. Right now, only available for slurm platforms.
         if not inspect and len(valid_packages_to_submit) > 0:
-            for package in (package for package in valid_packages_to_submit):
-                for job in (job for job in package.jobs):
-                    job._clean_runtime_parameters()
             job_list.save()
         save_2 = False
         if platform.type.lower() in ["slurm", "pjm"] and not inspect and not only_wrappers:
@@ -2397,9 +2394,6 @@ class Autosubmit:
                 failed_packages, error_message="", hold=hold)
             if not inspect and len(valid_packages_to_submit) > 0:
-                for package in (package for package in valid_packages_to_submit):
-                    for job in (job for job in package.jobs):
-                        job._clean_runtime_parameters()
                 job_list.save()
         # Save wrappers (jobs that have the same id) to be visualized and checked in other parts of the code
         job_list.save_wrappers(valid_packages_to_submit, failed_packages, as_conf, packages_persistence,
@@ -3414,7 +3408,6 @@ class Autosubmit:
         try:
             for job in job_list.get_job_list():
                 job_parameters = job.update_parameters(as_conf, {})
-                job._clean_runtime_parameters()
                 for key, value in job_parameters.items():
                     jobs_parameters["JOBS" + "." + job.section + "." + key] = value
         except:
diff --git a/autosubmit/history/database_managers/experiment_history_db_manager.py b/autosubmit/history/database_managers/experiment_history_db_manager.py
index 9e5662af6..8df415c94 100644
--- a/autosubmit/history/database_managers/experiment_history_db_manager.py
+++ b/autosubmit/history/database_managers/experiment_history_db_manager.py
@@ -234,6 +234,9 @@ class ExperimentHistoryDbManager(DatabaseManager):
         statement = self.get_built_select_statement("job_data", "last=1 and job_name=? ORDER BY counter DESC")
         arguments = (job_name,)
         job_data_rows_last = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments)
+        if not job_data_rows_last:  # The previous job didn't finish, but a new create has been made
+            statement = self.get_built_select_statement("job_data", "last=0 and job_name=? ORDER BY counter DESC")
+            job_data_rows_last = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments)
         return [Models.JobDataRow(*row) for row in job_data_rows_last]
 
     def get_job_data_dcs_last_by_run_id(self, run_id):
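Note (illustrative, not part of the patch): the fallback added above first looks for rows flagged last=1, and only if none exist retries with last=0. A self-contained sketch of the same pattern with plain sqlite3 (the table layout here is hypothetical; the real schema lives in the history database managers):

    import sqlite3

    def last_job_data_rows(conn: sqlite3.Connection, job_name: str):
        """Prefer rows flagged last=1; fall back to last=0 when a new `create` reset the flag."""
        query = "SELECT * FROM job_data WHERE last=? AND job_name=? ORDER BY counter DESC"
        rows = conn.execute(query, (1, job_name)).fetchall()
        if not rows:  # the previous job never finished, but a new create has been made
            rows = conn.execute(query, (0, job_name)).fetchall()
        return rows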
ORDER BY counter DESC") + job_data_rows_last = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) return [Models.JobDataRow(*row) for row in job_data_rows_last] def get_job_data_dcs_last_by_run_id(self, run_id): diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py index 7f6a49648..ee0558edd 100644 --- a/autosubmit/history/experiment_history.py +++ b/autosubmit/history/experiment_history.py @@ -26,12 +26,13 @@ from .data_classes.job_data import JobData from .data_classes.experiment_run import ExperimentRun from .platform_monitor.slurm_monitor import SlurmMonitor from .internal_logging import Logging +from log.log import Log from autosubmitconfigparser.config.basicconfig import BasicConfig SECONDS_WAIT_PLATFORM = 60 class ExperimentHistory: - def __init__(self, expid, jobdata_dir_path=DEFAULT_JOBDATA_DIR, historiclog_dir_path=DEFAULT_HISTORICAL_LOGS_DIR): + def __init__(self, expid, jobdata_dir_path=DEFAULT_JOBDATA_DIR, historiclog_dir_path=DEFAULT_HISTORICAL_LOGS_DIR): self.expid = expid BasicConfig.read() self._log = Logging(expid, BasicConfig.HISTORICAL_LOG_DIR) @@ -41,39 +42,42 @@ class ExperimentHistory: self.manager = ExperimentHistoryDbManager(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR) except Exception as exp: self._log.log(str(exp), traceback.format_exc()) - self.manager = None + Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') + self.manager = None def initialize_database(self): try: - self.manager.initialize() + self.manager.initialize() except Exception as exp: self._log.log(str(exp), traceback.format_exc()) + Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') + self.manager = None - + def is_header_ready(self): - if self.manager: - return self.manager.is_header_ready_db_version() + if self.manager: + return self.manager.is_header_ready_db_version() return False - - def write_submit_time(self, job_name, submit=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", + + def write_submit_time(self, job_name, submit=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", member="", section="", chunk=0, platform="NA", job_id=0, wrapper_queue=None, wrapper_code=None, children=""): try: next_counter = self._get_next_counter_by_job_name(job_name) current_experiment_run = self.manager.get_experiment_run_dc_with_max_id() - job_data_dc = JobData(_id=0, - counter=next_counter, - job_name=job_name, - submit=submit, - status=status, - rowtype=self._get_defined_rowtype(wrapper_code), - ncpus=ncpus, - wallclock=wallclock, - qos=self._get_defined_queue_name(wrapper_queue, wrapper_code, qos), + job_data_dc = JobData(_id=0, + counter=next_counter, + job_name=job_name, + submit=submit, + status=status, + rowtype=self._get_defined_rowtype(wrapper_code), + ncpus=ncpus, + wallclock=wallclock, + qos=self._get_defined_queue_name(wrapper_queue, wrapper_code, qos), date=date, member=member, section=section, - chunk=chunk, + chunk=chunk, platform=platform, job_id=job_id, children=children, @@ -81,25 +85,27 @@ class ExperimentHistory: return self.manager.register_submitted_job_data_dc(job_data_dc) except Exception as exp: self._log.log(str(exp), traceback.format_exc()) + Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') + return None - + def write_start_time(self, job_name, start=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", member="", section="", chunk=0, platform="NA", job_id=0, 
wrapper_queue=None, wrapper_code=None, children=""): try: job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) if not job_data_dc_last: - job_data_dc_last = self.write_submit_time(job_name=job_name, - status=status, - ncpus=ncpus, - wallclock=wallclock, - qos=qos, - date=date, - member=member, - section=section, - chunk=chunk, - platform=platform, - job_id=job_id, - wrapper_queue=wrapper_queue, + job_data_dc_last = self.write_submit_time(job_name=job_name, + status=status, + ncpus=ncpus, + wallclock=wallclock, + qos=qos, + date=date, + member=member, + section=section, + chunk=chunk, + platform=platform, + job_id=job_id, + wrapper_queue=wrapper_queue, wrapper_code=wrapper_code) self._log.log("write_start_time {0} start not found.".format(job_name)) job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) @@ -114,26 +120,28 @@ class ExperimentHistory: return self.manager.update_job_data_dc_by_id(job_data_dc_last) except Exception as exp: self._log.log(str(exp), traceback.format_exc()) - - def write_finish_time(self, job_name, finish=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", - member="", section="", chunk=0, platform="NA", job_id=0, out_file=None, err_file=None, + Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') + + + def write_finish_time(self, job_name, finish=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", + member="", section="", chunk=0, platform="NA", job_id=0, out_file=None, err_file=None, wrapper_queue=None, wrapper_code=None, children=""): try: job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) if not job_data_dc_last: - job_data_dc_last = self.write_submit_time(job_name=job_name, - status=status, - ncpus=ncpus, - wallclock=wallclock, - qos=qos, - date=date, - member=member, - section=section, - chunk=chunk, - platform=platform, - job_id=job_id, - wrapper_queue=wrapper_queue, - wrapper_code=wrapper_code, + job_data_dc_last = self.write_submit_time(job_name=job_name, + status=status, + ncpus=ncpus, + wallclock=wallclock, + qos=qos, + date=date, + member=member, + section=section, + chunk=chunk, + platform=platform, + job_id=job_id, + wrapper_queue=wrapper_queue, + wrapper_code=wrapper_code, children=children) self._log.log("write_finish_time {0} submit not found.".format(job_name)) job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) @@ -141,26 +149,28 @@ class ExperimentHistory: raise Exception("Job {0} has not been found in the database.".format(job_name)) job_data_dc_last.finish = finish if finish > 0 else int(time()) job_data_dc_last.status = status - job_data_dc_last.job_id = job_id + job_data_dc_last.job_id = job_id job_data_dc_last.rowstatus = Models.RowStatus.PENDING_PROCESS job_data_dc_last.out = out_file if out_file else "" job_data_dc_last.err = err_file if err_file else "" return self.manager.update_job_data_dc_by_id(job_data_dc_last) except Exception as exp: - self._log.log(str(exp), traceback.format_exc()) - + self._log.log(str(exp), traceback.format_exc()) + Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') + + def write_platform_data_after_finish(self, job_data_dc, platform_obj): - """ + """ Call it in a thread. 
""" try: sleep(SECONDS_WAIT_PLATFORM) - ssh_output = platform_obj.check_job_energy(job_data_dc.job_id) + ssh_output = platform_obj.check_job_energy(job_data_dc.job_id) slurm_monitor = SlurmMonitor(ssh_output) self._verify_slurm_monitor(slurm_monitor, job_data_dc) job_data_dcs_in_wrapper = self.manager.get_job_data_dcs_last_by_wrapper_code(job_data_dc.wrapper_code) job_data_dcs_in_wrapper = sorted([job for job in job_data_dcs_in_wrapper if job.status == "COMPLETED"], key=lambda x: x._id) - job_data_dcs_to_update = [] + job_data_dcs_to_update = [] if len(job_data_dcs_in_wrapper) > 0: info_handler = PlatformInformationHandler(StraightWrapperAssociationStrategy(self._historiclog_dir_path)) job_data_dcs_to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor) @@ -172,21 +182,27 @@ class ExperimentHistory: job_data_dcs_to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor) else: info_handler = PlatformInformationHandler(SingleAssociationStrategy(self._historiclog_dir_path)) - job_data_dcs_to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor) - return self.manager.update_list_job_data_dc_by_each_id(job_data_dcs_to_update) + job_data_dcs_to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor) + return self.manager.update_list_job_data_dc_by_each_id(job_data_dcs_to_update) except Exception as exp: - self._log.log(str(exp), traceback.format_exc()) + self._log.log(str(exp), traceback.format_exc()) + Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') + def _verify_slurm_monitor(self, slurm_monitor, job_data_dc): try: if slurm_monitor.header.status not in ["COMPLETED", "FAILED"]: - self._log.log("Assertion Error on job {0} with ssh_output {1}".format(job_data_dc.job_name, slurm_monitor.original_input), + self._log.log("Assertion Error on job {0} with ssh_output {1}".format(job_data_dc.job_name, slurm_monitor.original_input), "Slurm status {0} is not COMPLETED nor FAILED for ID {1}.\n".format(slurm_monitor.header.status, slurm_monitor.header.name)) + Log.debug(f'Historical Database error: Slurm status {slurm_monitor.header.status} is not COMPLETED nor FAILED for ID {slurm_monitor.header.name}.') if not slurm_monitor.steps_plus_extern_approximate_header_energy(): self._log.log("Assertion Error on job {0} with ssh_output {1}".format(job_data_dc.job_name, slurm_monitor.original_input), "Steps + extern != total energy for ID {0}. Number of steps {1}.\n".format(slurm_monitor.header.name, slurm_monitor.step_count)) + Log.debug(f'Historical Database error: Steps + extern != total energy for ID {slurm_monitor.header.name}. Number of steps {slurm_monitor.step_count}.') except Exception as exp: - self._log.log(str(exp), traceback.format_exc()) + self._log.log(str(exp), traceback.format_exc()) + Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') + def process_status_changes(self, job_list=None, chunk_unit="NA", chunk_size=0, current_config="",create=False): """ Detect status differences between job_list and current job_data rows, and update. Creates a new run if necessary. 
""" @@ -206,7 +222,9 @@ class ExperimentHistory: return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list) except Exception as exp: self._log.log(str(exp), traceback.format_exc()) - + Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') + + def _get_built_list_of_changes(self, job_list): """ Return: List of (current timestamp, current datetime str, status, rowstatus, id in job_data). One tuple per change. """ job_data_dcs = self.detect_changes_in_job_list(job_list) @@ -215,11 +233,13 @@ class ExperimentHistory: def process_job_list_changes_to_experiment_totals(self, job_list=None): """ Updates current experiment_run row with totals calculated from job_list. """ try: - current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() + current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list) except Exception as exp: self._log.log(str(exp), traceback.format_exc()) - + Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') + + def should_we_create_a_new_run(self, job_list, changes_count, current_experiment_run_dc, new_chunk_unit, new_chunk_size,create=False): if create: return True @@ -229,7 +249,7 @@ class ExperimentHistory: if changes_count > int(self._get_date_member_completed_count(job_list)): return True return self._chunk_config_has_changed(current_experiment_run_dc, new_chunk_unit, new_chunk_size) - + def _chunk_config_has_changed(self, current_exp_run_dc, new_chunk_unit, new_chunk_size): if not current_exp_run_dc: return True @@ -264,19 +284,19 @@ class ExperimentHistory: def _create_new_experiment_run_dc_with_counts(self, chunk_unit, chunk_size, current_config="", job_list=None): """ Create new experiment_run row and return the new Models.ExperimentRun data class from database. """ status_counts = self.get_status_counts_from_job_list(job_list) - experiment_run_dc = ExperimentRun(0, - chunk_unit=chunk_unit, - chunk_size=chunk_size, - metadata=current_config, + experiment_run_dc = ExperimentRun(0, + chunk_unit=chunk_unit, + chunk_size=chunk_size, + metadata=current_config, start=int(time()), - completed=status_counts[HUtils.SupportedStatus.COMPLETED], - total=status_counts["TOTAL"], - failed=status_counts[HUtils.SupportedStatus.FAILED], - queuing=status_counts[HUtils.SupportedStatus.QUEUING], - running=status_counts[HUtils.SupportedStatus.RUNNING], - submitted=status_counts[HUtils.SupportedStatus.SUBMITTED], + completed=status_counts[HUtils.SupportedStatus.COMPLETED], + total=status_counts["TOTAL"], + failed=status_counts[HUtils.SupportedStatus.FAILED], + queuing=status_counts[HUtils.SupportedStatus.QUEUING], + running=status_counts[HUtils.SupportedStatus.RUNNING], + submitted=status_counts[HUtils.SupportedStatus.SUBMITTED], suspended=status_counts[HUtils.SupportedStatus.SUSPENDED]) - return self.manager.register_experiment_run_dc(experiment_run_dc) + return self.manager.register_experiment_run_dc(experiment_run_dc) def detect_changes_in_job_list(self, job_list): """ Detect changes in job_list compared to the current contents of job_data table. 
Returns a list of JobData data classes where the status of each item is the new status.""" @@ -292,12 +312,12 @@ class ExperimentHistory: differences.append(job_dc) return differences - def _get_defined_rowtype(self, code): + def _get_defined_rowtype(self, code): if code: return code else: return Models.RowType.NORMAL - + def _get_defined_queue_name(self, wrapper_queue, wrapper_code, qos): if wrapper_code and wrapper_code > 2 and wrapper_queue is not None and len(str(wrapper_queue)) > 0: return wrapper_queue @@ -314,12 +334,12 @@ class ExperimentHistory: def _get_date_member_completed_count(self, job_list): """ Each item in the job_list must have attributes: date, member, status_str. """ - job_list = job_list if job_list else [] + job_list = job_list if job_list else [] return sum(1 for job in job_list if job.date is not None and job.member is not None and job.status_str == HUtils.SupportedStatus.COMPLETED) - + def get_status_counts_from_job_list(self, job_list): - """ - Return dict with keys COMPLETED, FAILED, QUEUING, SUBMITTED, RUNNING, SUSPENDED, TOTAL. + """ + Return dict with keys COMPLETED, FAILED, QUEUING, SUBMITTED, RUNNING, SUSPENDED, TOTAL. """ result = { HUtils.SupportedStatus.COMPLETED: 0, @@ -329,14 +349,13 @@ class ExperimentHistory: HUtils.SupportedStatus.RUNNING: 0, HUtils.SupportedStatus.SUSPENDED: 0, "TOTAL": 0 - } + } if not job_list: job_list = [] - - for job in job_list: - if job.status_str in result: + + for job in job_list: + if job.status_str in result: result[job.status_str] += 1 result["TOTAL"] = len(job_list) return result - \ No newline at end of file diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 8b8897df0..301fe9510 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -256,17 +256,6 @@ class Job(object): self._memory = '' self._memory_per_task = '' - def _clean_runtime_parameters(self): - # hetjobs - self.het = None - self.parameters = None - self._tasks = None - self._nodes = None - self.default_parameters = None - self._threads = None - self._processors = None - self._memory = None - self._memory_per_task = None @property @autosubmit_parameter(name='tasktype') def section(self): -- GitLab From 765d1539bee45a96b6f5bf476537043ee952f7b7 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 20 Feb 2024 09:15:14 +0100 Subject: [PATCH 05/41] wrapper --- autosubmit/job/job_packager.py | 74 +++++++++++++++------------------- 1 file changed, 32 insertions(+), 42 deletions(-) diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py index 675f11301..df4a2ed90 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -226,7 +226,7 @@ class JobPackager(object): min_h = len(package.jobs) return min_v, min_h, balanced - def check_packages_respect_wrapper_policy(self,built_packages_tmp,packages_to_submit,max_jobs_to_submit,wrapper_limits): + def check_packages_respect_wrapper_policy(self,built_packages_tmp,packages_to_submit,max_jobs_to_submit,wrapper_limits, any_simple_packages = False): """ Check if the packages respect the wrapper policy and act in base of it ( submit wrapper, submit sequential, wait for more jobs to form a wrapper) :param built_packages_tmp: List of packages to be submitted @@ -237,6 +237,7 @@ class JobPackager(object): :rtype: List of packages to be submitted, int :return: packages_to_submit, max_jobs_to_submit """ + for p in built_packages_tmp: if max_jobs_to_submit == 0: break @@ -270,40 +271,27 @@ class JobPackager(object): packages_to_submit.append(p) max_jobs_to_submit = 
max_jobs_to_submit - 1 else: # Check if there is a deadlock or an infinite deadlock. Once checked, act in base of the wrapper policy. - deadlock = True - if deadlock: # Remaining jobs if chunk is the last one - for job in p.jobs: - if (job.running == "chunk" and job.chunk == int( - job.parameters["EXPERIMENT.NUMCHUNKS"])) and balanced: - deadlock = False + wallclock_sum = p.jobs[0].wallclock + for seq in range(1, min_v): + wallclock_sum = sum_str_hours(wallclock_sum, p.jobs[0].wallclock) + next_wrappable_jobs = self._jobs_list.get_jobs_by_section(self.jobs_in_wrapper[self.current_wrapper_section]) + next_wrappable_jobs = [job for job in next_wrappable_jobs if + job.status == Status.WAITING and job not in p.jobs] # Get only waiting jobs + active_jobs = list() + aux_active_jobs = list() + for job in next_wrappable_jobs: # Prone tree by looking only the closest children + direct_children = False + for related in job.parents: + if related in p.jobs: + direct_children = True break - if not deadlock: # Submit package if deadlock has been liberated - for job in p.jobs: - job.packed = True - packages_to_submit.append(p) - max_jobs_to_submit = max_jobs_to_submit - 1 - else: - wallclock_sum = p.jobs[0].wallclock - for seq in range(1, min_v): - wallclock_sum = sum_str_hours(wallclock_sum, p.jobs[0].wallclock) - next_wrappable_jobs = self._jobs_list.get_jobs_by_section(self.jobs_in_wrapper[self.current_wrapper_section]) - next_wrappable_jobs = [job for job in next_wrappable_jobs if - job.status == Status.WAITING and job not in p.jobs] # Get only waiting jobs - active_jobs = list() - aux_active_jobs = list() - for job in next_wrappable_jobs: # Prone tree by looking only the closest children - direct_children = False - for related in job.parents: - if related in p.jobs: - direct_children = True - break - if direct_children: # Get parent of direct children that aren't in wrapper - aux_active_jobs += [aux_parent for aux_parent in job.parents if ( - aux_parent.status != Status.COMPLETED and aux_parent.status != Status.FAILED) and ( - aux_parent.section not in self.jobs_in_wrapper[ - self.current_wrapper_section] or ( - aux_parent.section in self.jobs_in_wrapper[ - self.current_wrapper_section] and aux_parent.status != Status.COMPLETED and aux_parent.status != Status.FAILED and aux_parent.status != Status.WAITING and aux_parent.status != Status.READY))] + if direct_children: # Get parent of direct children that aren't in wrapper + aux_active_jobs += [aux_parent for aux_parent in job.parents if ( + aux_parent.status != Status.COMPLETED and aux_parent.status != Status.FAILED) and ( + aux_parent.section not in self.jobs_in_wrapper[ + self.current_wrapper_section] or ( + aux_parent.section in self.jobs_in_wrapper[ + self.current_wrapper_section] and aux_parent.status != Status.COMPLETED and aux_parent.status != Status.FAILED and aux_parent.status != Status.WAITING and aux_parent.status != Status.READY))] aux_active_jobs = list(set(aux_active_jobs)) track = [] # Tracker to prone tree for avoid the checking of the same parent from different nodes. 
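The track list above acts as the visited set of an upward breadth-first search: the surrounding code climbs through WAITING parents, and a chain that leads back into the wrapped jobs themselves means nothing outside the wrapper can ever release it. Stripped of Autosubmit specifics (jobs are assumed to expose .parents, .status and .name), the walk amounts to:

from collections import deque

def chain_reenters_wrapper(start_job, wrapped_names):
    # True means a hard deadlock: some uncompleted ancestor of start_job
    # is one of the wrapped jobs, so the wrapper blocks its own release.
    to_check = deque([start_job])
    seen = set()
    while to_check:
        job = to_check.popleft()
        for parent in job.parents:
            if parent.name in wrapped_names:
                return True
            if parent.status == "WAITING" and parent not in seen:
                seen.add(parent)
                to_check.append(parent)
    return False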
active_jobs_names = [job.name for job in @@ -341,7 +329,7 @@ class JobPackager(object): for job in p.jobs: job.packed = False if len(active_jobs) > 0: - Log.printlog(f'Wrapper policy is set to STRICT and there are not enough jobs to form a wrapper.[wrappable:{wrapper_limits["min"]} <= defined_min:{wrapper_limits["min"]}] [wrappeable_h:{min_h} <= defined_min_h:{wrapper_limits["min_h"]}]|[wrappeable_v:{min_v} <= defined_min_v:{wrapper_limits["min_v"]}] waiting until the wrapper can be formed.\nIf all values are <=, some innerjob has failed under strict policy', 6013) + Log.printlog(f'Wrapper policy is set to STRICT and there are not enough jobs to form a wrapper.[wrappable:{wrapper_limits["min"]} <= defined_min:{min_h*min_v}] [wrappeable_h:{min_h} <= defined_min_h:{wrapper_limits["min_h"]}]|[wrappeable_v:{min_v} <= defined_min_v:{wrapper_limits["min_v"]}] waiting until the wrapper can be formed.\nIf all values are <=, some innerjob has failed under strict policy', 6013) else: if len(self._jobs_list.get_in_queue()) == 0: raise AutosubmitCritical(self.error_message_policy(min_h, min_v, wrapper_limits, hard_deadlock, wallclock_sum, balanced), 7014) @@ -367,7 +355,7 @@ class JobPackager(object): if error: if len(active_jobs) > 0: if show_log: - Log.printlog(f'Wrapper policy is set to MIXED and there are not enough jobs to form a wrapper.[wrappable:{wrapper_limits["min"]} < defined_min:{wrapper_limits["min"]}] [wrappable_h:{min_h} < defined_min_h:{wrapper_limits["min_h"]}]|[wrappeable_v:{min_v} < defined_min_v:{wrapper_limits["min_v"]}] waiting until the wrapper can be formed.', 6013) + Log.printlog(f'Wrapper policy is set to MIXED and there are not enough jobs to form a wrapper.[wrappable:{wrapper_limits["min"]} < defined_min:{min_h*min_v}] [wrappable_h:{min_h} < defined_min_h:{wrapper_limits["min_h"]}]|[wrappeable_v:{min_v} < defined_min_v:{wrapper_limits["min_v"]}] waiting until the wrapper can be formed.', 6013) else: if len(self._jobs_list.get_in_queue()) == 0: # When there are not more possible jobs, autosubmit will stop the execution raise AutosubmitCritical(self.error_message_policy(min_h, min_v, wrapper_limits, hard_deadlock, wallclock_sum, balanced), 7014) @@ -389,7 +377,7 @@ class JobPackager(object): return packages_to_submit, max_jobs_to_submit def error_message_policy(self,min_h,min_v,wrapper_limits,hard_deadlock,wallclock_sum,balanced): - message = f"Wrapper couldn't be formed under {self.wrapper_policy[self.current_wrapper_section]} POLICY due minimum limit not being reached: [wrappable:{wrapper_limits['min']} < defined_min:{wrapper_limits['min']}] [wrappable_h:{min_h} < defined_min_h:{wrapper_limits['min_h']}]|[wrappeable_v:{min_v} < defined_min_v:{wrapper_limits['min_v']}] " + message = f"Wrapper couldn't be formed under {self.wrapper_policy[self.current_wrapper_section]} POLICY due minimum limit not being reached: [wrappable:{wrapper_limits['min']} < defined_min:{min_h*min_v}] [wrappable_h:{min_h} < defined_min_h:{wrapper_limits['min_h']}]|[wrappeable_v:{min_v} < defined_min_v:{wrapper_limits['min_v']}] " if hard_deadlock: message += "\nCheck your configuration: The next wrappable job can't be wrapped until some of inner jobs of current packages finishes which is impossible" if min_v > 1: @@ -510,6 +498,10 @@ class JobPackager(object): job.packed = False jobs_to_wrap = self._divide_list_by_section(jobs_to_submit) non_wrapped_jobs = jobs_to_wrap.pop("SIMPLE",[]) + if len(non_wrapped_jobs) > 0: + any_simple_packages = True + else: + any_simple_packages = False # Prepare packages 
for wrapped jobs for wrapper_name, jobs in jobs_to_wrap.items(): if max_jobs_to_submit == 0: @@ -536,15 +528,13 @@ class JobPackager(object): if self.wrapper_type[self.current_wrapper_section] == 'vertical': built_packages_tmp = self._build_vertical_packages(jobs, wrapper_limits,wrapper_info=current_info) elif self.wrapper_type[self.current_wrapper_section] == 'horizontal': - if len(jobs) >= wrapper_limits["min_h"]: - built_packages_tmp = self._build_horizontal_packages(jobs, wrapper_limits, section,wrapper_info=current_info) + built_packages_tmp = self._build_horizontal_packages(jobs, wrapper_limits, section,wrapper_info=current_info) elif self.wrapper_type[self.current_wrapper_section] in ['vertical-horizontal', 'horizontal-vertical']: - if len(jobs) >= wrapper_limits["min_h"]: - built_packages_tmp.append(self._build_hybrid_package(jobs, wrapper_limits, section,wrapper_info=current_info)) + built_packages_tmp.append(self._build_hybrid_package(jobs, wrapper_limits, section,wrapper_info=current_info)) else: built_packages_tmp = self._build_vertical_packages(jobs, wrapper_limits) - packages_to_submit,max_jobs_to_submit = self.check_packages_respect_wrapper_policy(built_packages_tmp,packages_to_submit,max_jobs_to_submit,wrapper_limits) + packages_to_submit,max_jobs_to_submit = self.check_packages_respect_wrapper_policy(built_packages_tmp,packages_to_submit,max_jobs_to_submit,wrapper_limits,any_simple_packages) # Now, prepare the packages for non-wrapper jobs for job in non_wrapped_jobs: if max_jobs_to_submit == 0: -- GitLab From bd36276e8d331203047e6f98b1344d23133068d2 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 20 Feb 2024 16:09:28 +0100 Subject: [PATCH 06/41] wrapper deadlock logic changed --- autosubmit/job/job_packager.py | 152 ++++++++++----------------------- 1 file changed, 46 insertions(+), 106 deletions(-) diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py index df4a2ed90..5bec96435 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -237,11 +237,10 @@ class JobPackager(object): :rtype: List of packages to be submitted, int :return: packages_to_submit, max_jobs_to_submit """ - + not_wrappeable_package_info = list() for p in built_packages_tmp: if max_jobs_to_submit == 0: break - infinite_deadlock = False # This will raise an autosubmit critical if true, infinite deadlock is when there are no more non-wrapped jobs in waiting or ready status failed_innerjobs = False # Check if the user is using the option to run first some jobs. if so, remove non-first jobs from the package and submit them sequentially following a flexible policy if len(self._jobs_list.jobs_to_run_first) > 0: @@ -270,116 +269,57 @@ class JobPackager(object): job.packed = True packages_to_submit.append(p) max_jobs_to_submit = max_jobs_to_submit - 1 - else: # Check if there is a deadlock or an infinite deadlock. Once checked, act in base of the wrapper policy. 
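Before acting on the policy, both the branch removed below and the code added in the previous patch accumulate the package's vertical wallclock with sum_str_hours. Assuming the usual "HH:MM" wallclock format, an equivalent helper is:

def sum_str_hours(wallclock_a, wallclock_b):
    # Add two "HH:MM" wallclock strings: "02:30" + "02:30" -> "05:00".
    hours_a, minutes_a = map(int, wallclock_a.split(":"))
    hours_b, minutes_b = map(int, wallclock_b.split(":"))
    total_minutes = (hours_a + hours_b) * 60 + minutes_a + minutes_b
    return f"{total_minutes // 60:02d}:{total_minutes % 60:02d}"

The surrounding loop simply repeats the first job's wallclock min_v times, that is, once per vertical level of the package.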
- wallclock_sum = p.jobs[0].wallclock - for seq in range(1, min_v): - wallclock_sum = sum_str_hours(wallclock_sum, p.jobs[0].wallclock) - next_wrappable_jobs = self._jobs_list.get_jobs_by_section(self.jobs_in_wrapper[self.current_wrapper_section]) - next_wrappable_jobs = [job for job in next_wrappable_jobs if - job.status == Status.WAITING and job not in p.jobs] # Get only waiting jobs - active_jobs = list() - aux_active_jobs = list() - for job in next_wrappable_jobs: # Prone tree by looking only the closest children - direct_children = False - for related in job.parents: - if related in p.jobs: - direct_children = True + else: + not_wrappeable_package_info.append([p, min_v, min_h, balanced]) + # It is a deadlock when: + # 1. There are no more non-wrapped jobs in ready status + # 2. And there are no more jobs in the queue ( submitted, queuing, running, held ) + # 3. And all current packages are not wrappable. + if not any_simple_packages and len(self._jobs_list.get_in_queue()) == 0 and len(not_wrappeable_package_info) == len(built_packages_tmp): + for p, min_v, min_h, balanced in not_wrappeable_package_info: + if self.wrapper_policy[self.current_wrapper_section] == "strict": + for job in p.jobs: + job.packed = False + raise AutosubmitCritical(self.error_message_policy(min_h, min_v, wrapper_limits, p.wallclock, balanced), 7014) + elif self.wrapper_policy[self.current_wrapper_section] == "mixed": + error = True + for job in p.jobs: + if max_jobs_to_submit == 0: break - if direct_children: # Get parent of direct children that aren't in wrapper - aux_active_jobs += [aux_parent for aux_parent in job.parents if ( - aux_parent.status != Status.COMPLETED and aux_parent.status != Status.FAILED) and ( - aux_parent.section not in self.jobs_in_wrapper[ - self.current_wrapper_section] or ( - aux_parent.section in self.jobs_in_wrapper[ - self.current_wrapper_section] and aux_parent.status != Status.COMPLETED and aux_parent.status != Status.FAILED and aux_parent.status != Status.WAITING and aux_parent.status != Status.READY))] - aux_active_jobs = list(set(aux_active_jobs)) - track = [] # Tracker to prone tree for avoid the checking of the same parent from different nodes. - active_jobs_names = [job.name for job in - p.jobs] # We want to search if the actual wrapped jobs needs to run for add more jobs to this wrapper - hard_deadlock = False - for job in aux_active_jobs: - parents_to_check = [] - if job.status == Status.WAITING: # We only want to check uncompleted parents - aux_job = job - for parent in aux_job.parents: # First case - if parent.name in active_jobs_names: - hard_deadlock = True - infinite_deadlock = True - break - if (parent.status == Status.WAITING) and parent.name != aux_job.name: - parents_to_check.append(parent) - track.extend(parents_to_check) - while len( - parents_to_check) > 0 and not infinite_deadlock: # We want to look deeper on the tree until all jobs are completed, or we find an unresolvable deadlock. 
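The three numbered conditions introduced above collapse into a single cheap test once every package in the cycle has been classified; schematically:

def is_global_deadlock(any_simple_packages, jobs_in_queue,
                       not_wrappeable_package_info, built_packages_tmp):
    # Deadlock: no plain packages to fall back on, nothing already
    # submitted/queuing/running/held, and every package built in this
    # cycle failed to satisfy the wrapper limits.
    return (not any_simple_packages
            and len(jobs_in_queue) == 0
            and len(not_wrappeable_package_info) == len(built_packages_tmp))

Only when this holds does the configured policy decide between raising AutosubmitCritical (strict), resubmitting failed inner jobs sequentially (mixed), or degrading the whole package to sequential JobPackageSimple submissions (flexible).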
- aux_job = parents_to_check.pop(0) - for parent in aux_job.parents: - if parent.name in active_jobs_names: - hard_deadlock = True - infinite_deadlock = True - break - if ( - parent.status == Status.WAITING) and parent.name != aux_job.name and parent not in track: - parents_to_check.append(parent) - track.extend(parents_to_check) - if not infinite_deadlock: - active_jobs.append(job) # List of jobs that can continue to run without run this wrapper - - # Act in base of active_jobs and Policies - if self.wrapper_policy[self.current_wrapper_section] == "strict": - for job in p.jobs: + if job.fail_count > 0 and job.status == Status.READY: job.packed = False - if len(active_jobs) > 0: - Log.printlog(f'Wrapper policy is set to STRICT and there are not enough jobs to form a wrapper.[wrappable:{wrapper_limits["min"]} <= defined_min:{min_h*min_v}] [wrappeable_h:{min_h} <= defined_min_h:{wrapper_limits["min_h"]}]|[wrappeable_v:{min_v} <= defined_min_v:{wrapper_limits["min_v"]}] waiting until the wrapper can be formed.\nIf all values are <=, some innerjob has failed under strict policy', 6013) - else: - if len(self._jobs_list.get_in_queue()) == 0: - raise AutosubmitCritical(self.error_message_policy(min_h, min_v, wrapper_limits, hard_deadlock, wallclock_sum, balanced), 7014) - elif self.wrapper_policy[self.current_wrapper_section] == "mixed": - error = True - show_log = True - for job in p.jobs: - if max_jobs_to_submit == 0: - break - if job.fail_count > 0 and job.status == Status.READY: - job.packed = False - Log.printlog( - "Wrapper policy is set to mixed, there is a failed job that will be sent sequential") - error = False - show_log = False - if job.type == Type.PYTHON and not self._platform.allow_python_jobs: - package = JobPackageSimpleWrapped( - [job]) - else: - package = JobPackageSimple([job]) - packages_to_submit.append(package) - max_jobs_to_submit = max_jobs_to_submit - 1 - if error: - if len(active_jobs) > 0: - if show_log: - Log.printlog(f'Wrapper policy is set to MIXED and there are not enough jobs to form a wrapper.[wrappable:{wrapper_limits["min"]} < defined_min:{min_h*min_v}] [wrappable_h:{min_h} < defined_min_h:{wrapper_limits["min_h"]}]|[wrappeable_v:{min_v} < defined_min_v:{wrapper_limits["min_v"]}] waiting until the wrapper can be formed.', 6013) + Log.printlog( + "Wrapper policy is set to mixed, there is a failed job that will be sent sequential") + error = False + if job.type == Type.PYTHON and not self._platform.allow_python_jobs: + package = JobPackageSimpleWrapped( + [job]) else: - if len(self._jobs_list.get_in_queue()) == 0: # When there are not more possible jobs, autosubmit will stop the execution - raise AutosubmitCritical(self.error_message_policy(min_h, min_v, wrapper_limits, hard_deadlock, wallclock_sum, balanced), 7014) - else: - Log.info( - "Wrapper policy is set to flexible and there is a deadlock, Autosubmit will submit the jobs sequentially") - for job in p.jobs: - if max_jobs_to_submit == 0: - break - job.packed = False - if job.status == Status.READY: - if job.type == Type.PYTHON and not self._platform.allow_python_jobs: - package = JobPackageSimpleWrapped( - [job]) - else: - package = JobPackageSimple([job]) - packages_to_submit.append(package) - max_jobs_to_submit = max_jobs_to_submit - 1 + package = JobPackageSimple([job]) + packages_to_submit.append(package) + max_jobs_to_submit = max_jobs_to_submit - 1 + if error: + if len(self._jobs_list.get_in_queue()) == 0: # When there are not more possible jobs, autosubmit will stop the execution + raise 
AutosubmitCritical(self.error_message_policy(min_h, min_v, wrapper_limits, p.wallclock, balanced), 7014) + else: + Log.info( + "Wrapper policy is set to flexible and there is a deadlock, Autosubmit will submit the jobs sequentially") + for job in p.jobs: + if max_jobs_to_submit == 0: + break + job.packed = False + if job.status == Status.READY: + if job.type == Type.PYTHON and not self._platform.allow_python_jobs: + package = JobPackageSimpleWrapped( + [job]) + else: + package = JobPackageSimple([job]) + packages_to_submit.append(package) + max_jobs_to_submit = max_jobs_to_submit - 1 return packages_to_submit, max_jobs_to_submit - def error_message_policy(self,min_h,min_v,wrapper_limits,hard_deadlock,wallclock_sum,balanced): + def error_message_policy(self,min_h,min_v,wrapper_limits,wallclock_sum,balanced): message = f"Wrapper couldn't be formed under {self.wrapper_policy[self.current_wrapper_section]} POLICY due minimum limit not being reached: [wrappable:{wrapper_limits['min']} < defined_min:{min_h*min_v}] [wrappable_h:{min_h} < defined_min_h:{wrapper_limits['min_h']}]|[wrappeable_v:{min_v} < defined_min_v:{wrapper_limits['min_v']}] " - if hard_deadlock: - message += "\nCheck your configuration: The next wrappable job can't be wrapped until some of inner jobs of current packages finishes which is impossible" if min_v > 1: message += f"\nCheck your configuration: Check if current {wallclock_sum} vertical wallclock has reached the max defined on platforms.conf." else: -- GitLab From 58b1716557ab5ce4f6445d20aae543e8bf09544a Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 6 Mar 2024 10:36:28 +0100 Subject: [PATCH 07/41] fixed tests --- autosubmit/job/job_list_persistence.py | 4 +++- test/unit/test_job.py | 3 +++ test/unit/test_job_list.py | 6 ++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/autosubmit/job/job_list_persistence.py b/autosubmit/job/job_list_persistence.py index 951771bed..948f21c01 100644 --- a/autosubmit/job/job_list_persistence.py +++ b/autosubmit/job/job_list_persistence.py @@ -22,6 +22,7 @@ from sys import setrecursionlimit import shutil from autosubmit.database.db_manager import DbManager from log.log import AutosubmitCritical, Log +from contextlib import suppress class JobListPersistence(object): @@ -100,8 +101,9 @@ class JobListPersistencePkl(JobListPersistence): """ path = os.path.join(persistence_path, persistence_file + '.pkl' + '.tmp') - if os.path.exists(path): + with suppress(FileNotFoundError, PermissionError): os.remove(path) + setrecursionlimit(500000000) Log.debug("Saving JobList: " + path) with open(path, 'wb') as fd: diff --git a/test/unit/test_job.py b/test/unit/test_job.py index cf8125b58..d1526d9bb 100644 --- a/test/unit/test_job.py +++ b/test/unit/test_job.py @@ -610,6 +610,7 @@ CONFIG: run_only_members=[], #config.get_member_list(run_only=True), show_log=True, + create=True, ) job_list = job_list_obj.get_job_list() @@ -829,6 +830,7 @@ CONFIG: new=True, run_only_members=config.get_member_list(run_only=True), show_log=True, + create=True, ) job_list = job_list_obj.get_job_list() @@ -973,6 +975,7 @@ CONFIG: new=True, run_only_members=config.get_member_list(run_only=True), show_log=True, + create=True, ) job_list = job_list_obj.get_job_list() self.assertEqual(1, len(job_list)) diff --git a/test/unit/test_job_list.py b/test/unit/test_job_list.py index d5ce5b030..d02322503 100644 --- a/test/unit/test_job_list.py +++ b/test/unit/test_job_list.py @@ -248,6 +248,7 @@ class TestJobList(TestCase): default_job_type=Type.BASH, wrapper_jobs={}, 
new=True, + create=True, ) @@ -317,6 +318,7 @@ class TestJobList(TestCase): default_job_type=Type.BASH, wrapper_jobs={}, new=True, + create=True, ) job_list._job_list[0].member = "fake-member1" job_list._job_list[1].member = "fake-member2" @@ -363,6 +365,7 @@ class TestJobList(TestCase): default_job_type=Type.BASH, wrapper_jobs={}, new=True, + create=True, ) job_list._job_list[0].section = "fake-section" job_list._job_list[0].date = "fake-date1" @@ -446,6 +449,7 @@ class TestJobList(TestCase): default_job_type=Type.BASH, wrapper_jobs={}, new=True, + create=True, ) job_list.save() job_list2 = self.new_job_list(factory,temp_dir) @@ -461,6 +465,7 @@ class TestJobList(TestCase): default_job_type=Type.BASH, wrapper_jobs={}, new=False, + create=True, ) #return False job_list2.update_from_file = Mock() @@ -526,6 +531,7 @@ class TestJobList(TestCase): default_job_type=Type.BASH, wrapper_jobs={}, new=False, + create=True, ) # assert update_genealogy called with right values # When using an 4.0 experiment, the pkl has to be recreated and act as a new one. -- GitLab From 346a74908a6f339cd55845b416049f5077f4fee8 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 6 Mar 2024 10:40:26 +0100 Subject: [PATCH 08/41] fix feedback --- autosubmit/job/job_packager.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py index 5bec96435..d02b551c2 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -438,10 +438,7 @@ class JobPackager(object): job.packed = False jobs_to_wrap = self._divide_list_by_section(jobs_to_submit) non_wrapped_jobs = jobs_to_wrap.pop("SIMPLE",[]) - if len(non_wrapped_jobs) > 0: - any_simple_packages = True - else: - any_simple_packages = False + any_simple_packages = len(non_wrapped_jobs) > 0 # Prepare packages for wrapped jobs for wrapper_name, jobs in jobs_to_wrap.items(): if max_jobs_to_submit == 0: -- GitLab From 1a8c07b508ea8fce90f99b5958ee2eef65e773d6 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 6 Mar 2024 11:57:33 +0100 Subject: [PATCH 09/41] fix test --- test/unit/test_job_package.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/test/unit/test_job_package.py b/test/unit/test_job_package.py index a5b1085cf..e12aa8eb6 100644 --- a/test/unit/test_job_package.py +++ b/test/unit/test_job_package.py @@ -188,16 +188,20 @@ class TestJobPackage(TestCase): job._tmp_path = MagicMock() job._get_paramiko_template = MagicMock("false", "empty") job.update_parameters = MagicMock() + job.file = "fake-file" self.job_package._create_scripts = MagicMock() self.job_package._send_files = MagicMock() self.job_package._do_submission = MagicMock() - + configuration = MagicMock() + configuration.get_project_dir = MagicMock() + configuration.get_project_dir.return_value = "fake-proj-dir" # act - self.job_package.submit('fake-config', 'fake-params') + self.job_package.submit(configuration, 'fake-params') # assert for job in self.jobs: - job.update_parameters.assert_called_once_with('fake-config', 'fake-params') + job.update_parameters.assert_called() # Should be called once for each job, but currently it needs two calls (for additional files ) to change the code + #job.update_parameters.assert_called_once_with(configuration, 'fake-params') self.job_package._create_scripts.is_called_once_with() self.job_package._send_files.is_called_once_with() -- GitLab From 6c039ce683c4de410f036fe287764cae44588c8b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 6 Mar 2024 
12:42:14 +0100 Subject: [PATCH 10/41] Fix indent issue --- autosubmit/job/job_list.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 6e924f109..629280c28 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -221,9 +221,9 @@ class JobList(object): if monitor: as_conf.experiment_data = as_conf.last_experiment_data as_conf.data_changed = False - if not as_conf.data_changed: - self._dic_jobs._job_list = {job["job"].name: job["job"] for _, job in self.graph.nodes.data() if - job.get("job", None)} + if not as_conf.data_changed: + self._dic_jobs._job_list = {job["job"].name: job["job"] for _, job in self.graph.nodes.data() if + job.get("job", None)} else: self._dic_jobs.compare_backbone_sections() # fast-look if graph existed, skips some steps -- GitLab From 07817775bc501f39432dc9a820d645873627b6bc Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 6 Mar 2024 14:22:17 +0100 Subject: [PATCH 11/41] Now experiment metadata is always saved --- autosubmit/job/job_list.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 629280c28..0095752b5 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -220,21 +220,19 @@ class JobList(object): Log.info("Load finished") if monitor: as_conf.experiment_data = as_conf.last_experiment_data - as_conf.data_changed = False - if not as_conf.data_changed: - self._dic_jobs._job_list = {job["job"].name: job["job"] for _, job in self.graph.nodes.data() if - job.get("job", None)} + self._dic_jobs.changes = {} else: self._dic_jobs.compare_backbone_sections() + if not self._dic_jobs.changes: + self._dic_jobs._job_list = {job["job"].name: job["job"] for _, job in self.graph.nodes.data() if + job.get("job", None)} + else: # fast-look if graph existed, skips some steps # If VERSION in CONFIG or HPCARCH in DEFAULT it will exist, if not it won't. if not new and not self._dic_jobs.changes.get("EXPERIMENT", {}) and not self._dic_jobs.changes.get( "CONFIG", {}) and not self._dic_jobs.changes.get("DEFAULT", {}): self._dic_jobs._job_list = {job["job"].name: job["job"] for _, job in self.graph.nodes.data() if job.get("job", None)} - - # Force to use the last known job_list when autosubmit monitor is running. - self._dic_jobs.last_experiment_data = as_conf.last_experiment_data else: if not create: raise AutosubmitCritical("Autosubmit couldn't load the workflow graph. Please run autosubmit create first. 
If the pkl file exists and was generated with Autosubmit v4.1+, try again.",7013) @@ -376,7 +374,7 @@ class JobList(object): Log.debug("No changes detected, keeping edges") else: changes = True - Log.debug("No dependencies detected, calculating dependencies") + Log.debug("Changes detected, calculating dependencies") sections_gen = (section for section in jobs_data.keys()) for job_section in sections_gen: # Changes when all jobs of a section are added -- GitLab From 831481e8a9470222b835380af3b95ff950df096b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 6 Mar 2024 14:23:50 +0100 Subject: [PATCH 12/41] update configparser version --- requeriments.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requeriments.txt b/requeriments.txt index ae0b28c5c..5b251a123 100644 --- a/requeriments.txt +++ b/requeriments.txt @@ -1,7 +1,7 @@ zipp>=3.1.0 setuptools>=60.8.2 cython -autosubmitconfigparser==1.0.56 +autosubmitconfigparser==1.0.57 paramiko>=2.9.2 bcrypt>=3.2 PyNaCl>=1.5.0 -- GitLab From 9afde768a5ec5a85f78209aa93dd8bee7e2bf6ce Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 6 Mar 2024 14:37:22 +0100 Subject: [PATCH 13/41] Change message for clarity --- autosubmit/job/job_packages.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 57376e602..8bb679ae9 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -102,8 +102,7 @@ class JobPackageBase(object): if configuration.get_project_type().lower() != "none" and len(configuration.get_project_type()) > 0: raise AutosubmitCritical(f"Job script:{job.file} does not exists",7014) if not job.check_script(configuration, parameters, show_logs=job.check_warnings): - Log.warning("Script {0} check failed", job.name) - Log.warning("On submission script has some empty variables") + Log.warning(f'Script {job.name} has some empty variables. An empty value has substituted these variables') else: Log.result("Script {0} OK", job.name) # looking for directives on jobs @@ -150,8 +149,7 @@ class JobPackageBase(object): if configuration.get_project_type().lower() != "none" and len(configuration.get_project_type()) > 0: raise AutosubmitCritical("Template [ {0} ] using CHECK=On_submission has some empty variable {0}".format(job.name),7014) if not job.check_script(configuration, parameters,show_logs=job.check_warnings): - Log.warning("Script {0} check failed",job.name) - Log.warning("On submission script has some empty variables") + Log.warning(f'Script {job.name} has some empty variables. 
An empty value has substituted these variables') else: Log.result("Script {0} OK",job.name) # looking for directives on jobs -- GitLab From da4d76b8511dd252b03aa39cc7a2256078dad700 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 7 Mar 2024 09:17:54 +0100 Subject: [PATCH 14/41] Log.status could fail if there is no id Added retrials for recovery --- autosubmit/job/job_list.py | 11 +++++++++-- autosubmit/platforms/platform.py | 9 +++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 0095752b5..9f5fbfcc4 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -2396,8 +2396,15 @@ class JobList(object): else: queue = job.queue platform_name = job.platform.name if job.platform else "no-platform" - Log.status("{0:<35}{1:<15}{2:<15}{3:<20}{4:<15}", job.name, job.id, Status( - ).VALUE_TO_KEY[job.status], platform_name, queue) + if job.id is None: + job_id = "no-id" + else: + job_id = job.id + try: + Log.status("{0:<35}{1:<15}{2:<15}{3:<20}{4:<15}", job.name, job_id, Status( + ).VALUE_TO_KEY[job.status], platform_name, queue) + except: + Log.debug("Couldn't print job status for job {0}".format(job.name)) for job in failed_job_list: if len(job.queue) < 1: queue = "no-scheduler" diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index 10d7e1051..05340a526 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -558,10 +558,11 @@ class Platform(object): :rtype: bool """ if recovery: - if self.get_file('{0}_COMPLETED'.format(job_name), False, ignore_log=recovery): - return True - else: - return False + retries = 5 + for i in range(retries): + if self.get_file('{0}_COMPLETED'.format(job_name), False, ignore_log=recovery): + return True + return False if self.check_file_exists('{0}_COMPLETED'.format(job_name), wrapper_failed=wrapper_failed): if self.get_file('{0}_COMPLETED'.format(job_name), True, wrapper_failed=wrapper_failed): return True -- GitLab From 78de75e1680c8a8f812d9c6cfe11e89cc8c38507 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 7 Mar 2024 13:41:18 +0100 Subject: [PATCH 15/41] Fixes "attempt to cancel an het job" failure message --- autosubmit/platforms/slurmplatform.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index b51920d7d..e741239db 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -176,7 +176,11 @@ class SlurmPlatform(ParamikoPlatform): job_name = package.name if hasattr(package, "name") else package.jobs[0].name jobid = self.get_jobid_by_jobname(job_name) if len(jobid) > 1: # Cancel each job that is not the associated - for id_ in [ jobid for jobid in jobid if jobid != package.jobs[0].id ]: + ids_to_check = [package.jobs[0].id] + if package.jobs[0].het: + for i in range(1,package.jobs[0].het.get("HETSIZE",1)): + ids_to_check.append(str(int(ids_to_check[0]) + i)) + for id_ in [ jobid for jobid in jobid if jobid not in ids_to_check]: self.send_command(self.cancel_job(id_)) # This can be faster if we cancel all jobs at once but there is no cancel_all_jobs call right now so todo in future Log.debug(f'Job {id_} with the assigned name: {job_name} has been cancelled') Log.debug(f'Job {package.jobs[0].id} with the assigned name: {job_name} has been submitted') -- GitLab From a567694048c1a5742352a3d1d0ed3598746acaf2 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 7 Mar 2024 
15:40:48 +0100 Subject: [PATCH 16/41] Changes "backup error message" --- autosubmit/autosubmit.py | 4 +-- autosubmit/job/job_list.py | 44 ++++++++++++++------------ autosubmit/job/job_list_persistence.py | 9 ++++-- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index bb243e0d5..1bcf000cb 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2476,12 +2476,10 @@ class Autosubmit: except AutosubmitCritical as e: raise except BaseException as e: - raise AutosubmitCritical("Error while checking the configuration files or loading the job_list", 7040, - str(e)) + raise finally: if profile: profiler.stop() - try: jobs = [] if not isinstance(job_list, type([])): diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 9f5fbfcc4..cbd7e21af 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -204,7 +204,7 @@ class JobList(object): chunk_list = list(range(chunk_ini, num_chunks + 1)) self._chunk_list = chunk_list try: - self.graph = self.load() + self.graph = self.load(create) if type(self.graph) is not DiGraph: self.graph = nx.DiGraph() except AutosubmitCritical: @@ -213,8 +213,6 @@ class JobList(object): self.graph = nx.DiGraph() self._dic_jobs = DicJobs(date_list, member_list, chunk_list, date_format, default_retrials, as_conf) self._dic_jobs.graph = self.graph - if show_log: - Log.info("Creating jobs...") if len(self.graph.nodes) > 0: if show_log: Log.info("Load finished") @@ -246,6 +244,8 @@ class JobList(object): os.remove(os.path.join(self._persistence_path, self._persistence_file + "_backup.pkl")) new = True # This generates the job object and also finds if dic_jobs has modified from previous iteration in order to expand the workflow + if show_log: + Log.info("Creating jobs...") self._create_jobs(self._dic_jobs, 0, default_job_type) # not needed anymore all data is inside their correspondent sections in dic_jobs # This dic_job is key to the dependencies management as they're ordered by date[member[chunk]] @@ -2312,32 +2312,36 @@ class JobList(object): "Autosubmit will use a backup for recover the job_list", 6010) return list() - def load(self): + def load(self, create=False, backup=""): """ Recreates a stored job list from the persistence :return: loaded job list object :rtype: JobList """ - Log.info("Loading JobList") + if backup == "": + Log.info("Loading JobList") try: - return self._persistence.load(self._persistence_path, self._persistence_file) + return self._persistence.load(self._persistence_path, self._persistence_file + backup) except AutosubmitCritical: raise - except: - Log.printlog( - "Autosubmit will use a backup for recover the job_list", 6010) - return self.backup_load() - - def backup_load(self): - """ - Recreates a stored job list from the persistence - - :return: loaded job list object - :rtype: JobList - """ - Log.info("Loading backup JobList") - return self._persistence.load(self._persistence_path, self._persistence_file + "_backup") + except ValueError as e: + if not create: + raise AutosubmitCritical(f'JobList could not be loaded due pkl being saved with a different version of Autosubmit or Python version. {e}') + else: + Log.warning(f'Job list will be created from scratch due pkl being saved with a different version of Autosubmit or Python version. 
{e}') + except BaseException as e: + if backup == "": + Log.printlog( + "Autosubmit will use a backup for recover the job_list", 6010) + return self.load(create, "_backup") + else: + if not create: + raise AutosubmitCritical( + f'JobList backup could not be loaded due: {e}') + else: + Log.warning( + f'Joblist backup will be created from scratch due: {e}') def save(self): """ diff --git a/autosubmit/job/job_list_persistence.py b/autosubmit/job/job_list_persistence.py index 948f21c01..1a271a1fc 100644 --- a/autosubmit/job/job_list_persistence.py +++ b/autosubmit/job/job_list_persistence.py @@ -78,9 +78,12 @@ class JobListPersistencePkl(JobListPersistence): # copy the path to a tmp file randomseed to avoid corruption path_tmp = f'{path}.tmp_{os.urandom(8).hex()}' shutil.copy(path, path_tmp) - with open(path_tmp, 'rb') as fd: - graph = pickle.load(fd) - os.remove(path_tmp) + try: + with open(path_tmp, 'rb') as fd: + graph = pickle.load(fd) + except: + os.remove(path_tmp) + raise for u in ( node for node in graph ): # Set after the dependencies are set graph.nodes[u]["job"].children = set() -- GitLab From c59aff7ea02ed26c66a3ad02fc9e44d468451273 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 7 Mar 2024 16:31:50 +0100 Subject: [PATCH 17/41] Changed the location of pkl.tmp as pkl is not Writeable for the users --- autosubmit/job/job_list.py | 2 +- autosubmit/job/job_list_persistence.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index cbd7e21af..ec71b0ab5 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -2341,7 +2341,7 @@ class JobList(object): f'JobList backup could not be loaded due: {e}') else: Log.warning( - f'Joblist backup will be created from scratch due: {e}') + f'Joblist will be created from scratch due: {e}') def save(self): """ diff --git a/autosubmit/job/job_list_persistence.py b/autosubmit/job/job_list_persistence.py index 1a271a1fc..e232102c0 100644 --- a/autosubmit/job/job_list_persistence.py +++ b/autosubmit/job/job_list_persistence.py @@ -67,6 +67,8 @@ class JobListPersistencePkl(JobListPersistence): """ path = os.path.join(persistence_path, persistence_file + '.pkl') + path_tmp = os.path.join(persistence_path[:-3]+"tmp", persistence_file + f'.pkl.tmp_{os.urandom(8).hex()}') + try: open(path).close() except PermissionError: @@ -76,11 +78,11 @@ class JobListPersistencePkl(JobListPersistence): return list() else: # copy the path to a tmp file randomseed to avoid corruption - path_tmp = f'{path}.tmp_{os.urandom(8).hex()}' - shutil.copy(path, path_tmp) try: + shutil.copy(path, path_tmp) with open(path_tmp, 'rb') as fd: graph = pickle.load(fd) + os.remove(path_tmp) except: os.remove(path_tmp) raise -- GitLab From 58a7d9a3327cc92ab654b68f576c5acbd7dea416 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 8 Mar 2024 11:33:19 +0100 Subject: [PATCH 18/41] added unit test changed code based on feedback --- autosubmit/job/job_list.py | 30 ++++++------ autosubmit/job/job_list_persistence.py | 11 ++--- test/unit/test_job_list.py | 68 ++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 20 deletions(-) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index ec71b0ab5..2bf7933f6 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -2312,36 +2312,38 @@ class JobList(object): "Autosubmit will use a backup for recover the job_list", 6010) return list() - def load(self, create=False, backup=""): + def load(self, create=False, 
backup=False): """ Recreates a stored job list from the persistence :return: loaded job list object :rtype: JobList """ - if backup == "": - Log.info("Loading JobList") try: - return self._persistence.load(self._persistence_path, self._persistence_file + backup) - except AutosubmitCritical: - raise + if not backup: + Log.info("Loading JobList") + return self._persistence.load(self._persistence_path, self._persistence_file) + else: + return self._persistence.load(self._persistence_path, self._persistence_file + "_backup") except ValueError as e: if not create: raise AutosubmitCritical(f'JobList could not be loaded due pkl being saved with a different version of Autosubmit or Python version. {e}') else: Log.warning(f'Job list will be created from scratch due pkl being saved with a different version of Autosubmit or Python version. {e}') + except PermissionError as e: + if not create: + raise AutosubmitCritical(f'JobList could not be loaded due to permission error. {e}') + else: + Log.warning(f'Job list will be created from scratch due to permission error. {e}') except BaseException as e: - if backup == "": - Log.printlog( - "Autosubmit will use a backup for recover the job_list", 6010) - return self.load(create, "_backup") + if not backup: + Log.debug("Autosubmit will use a backup to recover the job_list") + return self.load(create, True) else: if not create: - raise AutosubmitCritical( - f'JobList backup could not be loaded due: {e}') + raise AutosubmitCritical(f"JobList could not be loaded due: {e}\nAutosubmit won't do anything") else: - Log.warning( - f'Joblist will be created from scratch due: {e}') + Log.warning(f'Joblist will be created from scratch due: {e}') def save(self): """ diff --git a/autosubmit/job/job_list_persistence.py b/autosubmit/job/job_list_persistence.py index e232102c0..791de7864 100644 --- a/autosubmit/job/job_list_persistence.py +++ b/autosubmit/job/job_list_persistence.py @@ -72,20 +72,19 @@ class JobListPersistencePkl(JobListPersistence): try: open(path).close() except PermissionError: - raise AutosubmitCritical(f'Permission denied to read {path}', 7012) + Log.warning(f'Permission denied to read {path}') + raise except FileNotFoundError: - Log.printlog(f'File {path} does not exist. ',Log.WARNING) - return list() + Log.warning(f'File {path} does not exist. 
') + raise else: # copy the path to a tmp file randomseed to avoid corruption try: shutil.copy(path, path_tmp) with open(path_tmp, 'rb') as fd: graph = pickle.load(fd) + finally: os.remove(path_tmp) - except: - os.remove(path_tmp) - raise for u in ( node for node in graph ): # Set after the dependencies are set graph.nodes[u]["job"].children = set() diff --git a/test/unit/test_job_list.py b/test/unit/test_job_list.py index d02322503..0dc87554c 100644 --- a/test/unit/test_job_list.py +++ b/test/unit/test_job_list.py @@ -1,3 +1,4 @@ +import os from unittest import TestCase from copy import copy import networkx @@ -15,6 +16,7 @@ from autosubmit.job.job_common import Type from autosubmit.job.job_list import JobList from autosubmit.job.job_list_persistence import JobListPersistencePkl from autosubmitconfigparser.config.yamlparser import YAMLParserFactory +from log.log import AutosubmitCritical class TestJobList(TestCase): @@ -66,6 +68,72 @@ class TestJobList(TestCase): def tearDown(self) -> None: shutil.rmtree(self.temp_directory) + def test_load(self): + as_conf = Mock() + as_conf.experiment_data = dict() + parser_mock = Mock() + parser_mock.read = Mock() + factory = YAMLParserFactory() + factory.create_parser = Mock(return_value=parser_mock) + date_list = ['fake-date1', 'fake-date2'] + member_list = ['fake-member1', 'fake-member2'] + num_chunks = 999 + parameters = {'fake-key': 'fake-value', + 'fake-key2': 'fake-value2'} + with tempfile.TemporaryDirectory() as temp_dir: + job_list = self.new_job_list(factory, temp_dir) + FakeBasicConfig.LOCAL_ROOT_DIR = str(temp_dir) + Path(temp_dir, self.experiment_id).mkdir() + for path in [f'{self.experiment_id}/tmp', f'{self.experiment_id}/tmp/ASLOGS', + f'{self.experiment_id}/tmp/ASLOGS_{self.experiment_id}', f'{self.experiment_id}/proj', + f'{self.experiment_id}/conf', f'{self.experiment_id}/pkl']: + Path(temp_dir, path).mkdir() + job_list.changes = Mock(return_value=['random_section', 'random_section']) + as_conf.detailed_deep_diff = Mock(return_value={}) + # as_conf.get_member_list = Mock(return_value=member_list) + # act + job_list.generate( + as_conf=as_conf, + date_list=date_list, + member_list=member_list, + num_chunks=num_chunks, + chunk_ini=1, + parameters=parameters, + date_format='H', + default_retrials=9999, + default_job_type=Type.BASH, + wrapper_jobs={}, + new=True, + create=True, + ) + job_list.save() + # Test load + job_list_to_load = self.new_job_list(factory, temp_dir) + # chmod + job_list_to_load.load(False) + self.assertEqual(job_list_to_load._job_list, job_list._job_list) + job_list_to_load.load(True) + self.assertEqual(job_list_to_load._job_list, job_list._job_list) + os.chmod(f'{temp_dir}/{self.experiment_id}/pkl/job_list_random-id.pkl', 0o000) + with self.assertRaises(AutosubmitCritical): + job_list_to_load.load(False) + job_list_to_load.load(True) + self.assertEqual(job_list_to_load._job_list, job_list._job_list) + os.chmod(f'{temp_dir}/{self.experiment_id}/pkl/job_list_random-id.pkl', 0o777) + shutil.copy(f'{temp_dir}/{self.experiment_id}/pkl/job_list_random-id.pkl',f'{temp_dir}/{self.experiment_id}/pkl/job_list_random-id_backup.pkl') + os.remove(f'{temp_dir}/{self.experiment_id}/pkl/job_list_random-id.pkl') + job_list_to_load.load(False) + self.assertEqual(job_list_to_load._job_list, job_list._job_list) + job_list_to_load.load(True) + self.assertEqual(job_list_to_load._job_list, job_list._job_list) + + + + + + + + def test_get_job_list_returns_the_right_list(self): job_list = self.job_list.get_job_list() 
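# The persistence path exercised by test_load above follows a copy-then-
# unpickle pattern: snapshot the pkl to a uniquely named temporary file
# (now created outside the read-only pkl directory), unpickle from the
# copy, and always remove the copy afterwards. A minimal sketch of that
# pattern, with the Autosubmit paths simplified, is:
import os
import pickle
import shutil

def load_pickle_safely(path, tmp_dir):
    # Copy first so a concurrent save cannot corrupt the read.
    tmp_copy = os.path.join(tmp_dir, f'job_list.pkl.tmp_{os.urandom(8).hex()}')
    shutil.copy(path, tmp_copy)
    try:
        with open(tmp_copy, 'rb') as fd:
            return pickle.load(fd)
    finally:
        # Whether loading succeeded or raised, the temporary copy goes away.
        os.remove(tmp_copy)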
self.assertEqual(self.job_list._job_list, job_list) -- GitLab From 82e83c707d491a9d2e83c134c4f0f5d2b4b44ab2 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 7 Mar 2024 11:32:14 +0100 Subject: [PATCH 19/41] Fix error autosubmit version Fix trace when exception as e is not an AutosubmitCritical or AutosubmitError --- autosubmit/autosubmit.py | 10 +++------- bin/autosubmit | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 1bcf000cb..a360cdc5d 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -649,13 +649,9 @@ class Autosubmit: if args.command is None: parser.print_help() parser.exit() - - except Exception as e: - if type(e) is SystemExit: # todo check - # Version keyword force an exception in parse arg due and os_exit(0) but the program is successfully finished - if "0" in str(e): - print(Autosubmit.autosubmit_version) - return 0 + except SystemExit as e: + return 0 + except BaseException as e: raise AutosubmitCritical( "Incorrect arguments for this command", 7011) diff --git a/bin/autosubmit b/bin/autosubmit index 21c056019..45354e15c 100755 --- a/bin/autosubmit +++ b/bin/autosubmit @@ -36,7 +36,7 @@ def exit_from_error(e): with suppress(FileNotFoundError, PermissionError): os.remove(os.path.join(Log.file_path, "autosubmit.lock")) try: - if not e.trace: + if hasattr(e,"trace") and e.trace: Log.debug("Trace: {0}", str(e.trace)) Log.critical("{1} [eCode={0}]", e.code, e.message) except: -- GitLab From 99642adc3e6aa95a9d659fa9e1116a8ef0efbcf9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 7 Mar 2024 11:33:30 +0100 Subject: [PATCH 20/41] Fix error autosubmit version Fix trace when exception as e is not an AutosubmitCritical or AutosubmitError update version --- bin/autosubmit | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/autosubmit b/bin/autosubmit index 45354e15c..5715dc687 100755 --- a/bin/autosubmit +++ b/bin/autosubmit @@ -39,7 +39,7 @@ def exit_from_error(e): if hasattr(e,"trace") and e.trace: Log.debug("Trace: {0}", str(e.trace)) Log.critical("{1} [eCode={0}]", e.code, e.message) - except: + except BaseException as e: Log.critical("An Unknown error occurred: {0}.\n Please report it to Autosubmit Developers through Git", str(e)) Log.info("More info at https://autosubmit.readthedocs.io/en/master/troubleshooting/error-codes.html") os._exit(1) -- GitLab From 0da1af6cd4ebcff7ca06d2874c93cb7147c3757f Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 7 Mar 2024 11:33:52 +0100 Subject: [PATCH 21/41] Fix error autosubmit version Fix trace when exception as e is not an AutosubmitCritical or AutosubmitError update version --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index ee74734aa..4d0dcda01 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -4.1.0 +4.1.2 -- GitLab From 9f7f9bc0e955da9ee13cf498709e7a96034ac48c Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 7 Mar 2024 11:34:08 +0100 Subject: [PATCH 22/41] update version ( empty line ) --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 4d0dcda01..cd9b8f559 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -4.1.2 +4.1.2 \ No newline at end of file -- GitLab From 2325b62b7859c1884db8c4aa965cdbfdc31c0c3d Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 8 Mar 2024 10:11:54 +0100 Subject: [PATCH 23/41] Address feedback --- autosubmit/autosubmit.py | 16 +++++++++------- bin/autosubmit | 28 +++++++++++++++++++++++----- 2 files 
changed, 32 insertions(+), 12 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index a360cdc5d..9dcc9d168 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -167,8 +167,8 @@ class Autosubmit: BasicConfig.read() parser = MyParser( description='Main executable for autosubmit. ') - parser.add_argument('-v', '--version', action='version', - version=Autosubmit.autosubmit_version) + parser.add_argument('-v', '--version', dest='version', action='store_true') + parser.add_argument('-lf', '--logfile', choices=('NO_LOG', 'INFO', 'WARNING', 'DEBUG'), default='DEBUG', type=str, help="sets file's log level.") @@ -644,11 +644,13 @@ class Autosubmit: help='Read job files generated by the inspect subcommand.') subparser.add_argument('ID', metavar='ID', help='An ID of a Workflow (eg a000) or a Job (eg a000_20220401_fc0_1_1_APPLICATION).') - args = parser.parse_args() - - if args.command is None: + args, unknown = parser.parse_known_args() + if args.version: + print(Autosubmit.autosubmit_version) + return 0 + if unknown or args.command is None: parser.print_help() - parser.exit() + return 0 except SystemExit as e: return 0 except BaseException as e: @@ -4789,7 +4791,7 @@ class Autosubmit: e.trace = traceback.format_exc() raise AutosubmitCritical(e.message, e.code, e.trace) except BaseException as e: - raise AutosubmitCritical(str(e), 7070) + raise finally: if profile: profiler.stop() diff --git a/bin/autosubmit b/bin/autosubmit index 5715dc687..dc87f07a6 100755 --- a/bin/autosubmit +++ b/bin/autosubmit @@ -30,20 +30,38 @@ sys.path.append(os.path.normpath(os.path.join(scriptdir, os.pardir))) # noinspection PyUnresolvedReferences from log.log import Log, AutosubmitCritical , AutosubmitError from autosubmit.autosubmit import Autosubmit +from typing import Union -def exit_from_error(e): +def exit_from_error(e: BaseException): + Log.debug(str(traceback.format_exc())) with suppress(FileNotFoundError, PermissionError): os.remove(os.path.join(Log.file_path, "autosubmit.lock")) - try: - if hasattr(e,"trace") and e.trace: + if isinstance(e, (AutosubmitCritical, AutosubmitError)): + e: Union[AutosubmitError, AutosubmitCritical] = e + if e.trace: Log.debug("Trace: {0}", str(e.trace)) Log.critical("{1} [eCode={0}]", e.code, e.message) - except BaseException as e: - Log.critical("An Unknown error occurred: {0}.\n Please report it to Autosubmit Developers through Git", str(e)) + else: + msg = "An Unknown error occurred: {0}.\n Please report it to Autosubmit Developers through Git" + args = [str(e)] + Log.critical(msg.format(*args)) Log.info("More info at https://autosubmit.readthedocs.io/en/master/troubleshooting/error-codes.html") os._exit(1) + +# def exit_from_error(e: BaseException): +# with suppress(FileNotFoundError, PermissionError): +# os.remove(os.path.join(Log.file_path, "autosubmit.lock")) +# try: +# if hasattr(e,"trace") and e.trace: +# Log.debug("Trace: {0}", str(e.trace)) +# Log.critical("{1} [eCode={0}]", e.code, e.message) +# except BaseException as e: +# Log.critical("An Unknown error occurred: {0}.\n Please report it to Autosubmit Developers through Git", str(e)) +# Log.info("More info at https://autosubmit.readthedocs.io/en/master/troubleshooting/error-codes.html") +# os._exit(1) + # noinspection PyProtectedMember def main(): try: -- GitLab From 6dff07aadb93f7d228fb5bcd64f8cedd363ff1f3 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 8 Mar 2024 12:16:32 +0100 Subject: [PATCH 24/41] Deleted systemexit --- autosubmit/autosubmit.py | 2 -- bin/autosubmit | 
19 ++++--------------- 2 files changed, 4 insertions(+), 17 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 9dcc9d168..aeac6033f 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -651,8 +651,6 @@ class Autosubmit: if unknown or args.command is None: parser.print_help() return 0 - except SystemExit as e: - return 0 except BaseException as e: raise AutosubmitCritical( "Incorrect arguments for this command", 7011) diff --git a/bin/autosubmit b/bin/autosubmit index dc87f07a6..d87c7adce 100755 --- a/bin/autosubmit +++ b/bin/autosubmit @@ -21,7 +21,6 @@ import os import sys import traceback -from io import StringIO # for handling the traceback print from contextlib import suppress scriptdir = os.path.abspath(os.path.dirname(sys.argv[0])) @@ -34,7 +33,10 @@ from typing import Union def exit_from_error(e: BaseException): - Log.debug(str(traceback.format_exc())) + try: + Log.debug(traceback.format_exc()) + except: + Log.debug("No traceback available") with suppress(FileNotFoundError, PermissionError): os.remove(os.path.join(Log.file_path, "autosubmit.lock")) if isinstance(e, (AutosubmitCritical, AutosubmitError)): @@ -49,19 +51,6 @@ def exit_from_error(e: BaseException): Log.info("More info at https://autosubmit.readthedocs.io/en/master/troubleshooting/error-codes.html") os._exit(1) - -# def exit_from_error(e: BaseException): -# with suppress(FileNotFoundError, PermissionError): -# os.remove(os.path.join(Log.file_path, "autosubmit.lock")) -# try: -# if hasattr(e,"trace") and e.trace: -# Log.debug("Trace: {0}", str(e.trace)) -# Log.critical("{1} [eCode={0}]", e.code, e.message) -# except BaseException as e: -# Log.critical("An Unknown error occurred: {0}.\n Please report it to Autosubmit Developers through Git", str(e)) -# Log.info("More info at https://autosubmit.readthedocs.io/en/master/troubleshooting/error-codes.html") -# os._exit(1) - # noinspection PyProtectedMember def main(): try: -- GitLab From 62e4098e7895d2c5b16ff9d768ca8f9250eb8ffd Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 8 Mar 2024 12:22:55 +0100 Subject: [PATCH 25/41] added unit test --- autosubmit/autosubmit.py | 4 ++-- test/unit/test_version.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 test/unit/test_version.py diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index aeac6033f..980ed4745 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -647,10 +647,10 @@ class Autosubmit: args, unknown = parser.parse_known_args() if args.version: print(Autosubmit.autosubmit_version) - return 0 + return 1 if unknown or args.command is None: parser.print_help() - return 0 + return 1 except BaseException as e: raise AutosubmitCritical( "Incorrect arguments for this command", 7011) diff --git a/test/unit/test_version.py b/test/unit/test_version.py new file mode 100644 index 000000000..767f38080 --- /dev/null +++ b/test/unit/test_version.py @@ -0,0 +1,20 @@ +import subprocess +from pathlib import Path +from unittest import TestCase + +import sys + +from autosubmit.autosubmit import Autosubmit + + +class TestAutosubmit(TestCase): + + def testAutosubmitVersion(self): + bin_path = Path(__file__, '../../../bin/autosubmit').resolve() + out = subprocess.getoutput(' '.join([sys.executable, str(bin_path), '-v'])) + self.assertEquals(Autosubmit.autosubmit_version, out.strip()) + + def testAutosubmitVersionBroken(self): + bin_path = Path(__file__, '../../../bin/autosubmit').resolve() + exit_code, _ = 
subprocess.getstatusoutput(' '.join([sys.executable, str(bin_path), '-abcdefg'])) + self.assertEquals(1, exit_code) -- GitLab From 06c103cce0a87073f452a850fcaeacf4e8e4c5c3 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 8 Mar 2024 13:18:20 +0100 Subject: [PATCH 26/41] Patched --- autosubmit/autosubmit.py | 2 +- test/unit/test_version.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 980ed4745..f40c65b88 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -647,7 +647,7 @@ class Autosubmit: args, unknown = parser.parse_known_args() if args.version: print(Autosubmit.autosubmit_version) - return 1 + return 0 if unknown or args.command is None: parser.print_help() return 1 diff --git a/test/unit/test_version.py b/test/unit/test_version.py index 767f38080..856979767 100644 --- a/test/unit/test_version.py +++ b/test/unit/test_version.py @@ -11,10 +11,11 @@ class TestAutosubmit(TestCase): def testAutosubmitVersion(self): bin_path = Path(__file__, '../../../bin/autosubmit').resolve() - out = subprocess.getoutput(' '.join([sys.executable, str(bin_path), '-v'])) - self.assertEquals(Autosubmit.autosubmit_version, out.strip()) + exit_code, out = subprocess.getstatusoutput(' '.join([sys.executable, str(bin_path), '-v'])) + self.assertEqual(0, exit_code) + self.assertEqual(Autosubmit.autosubmit_version, out.strip()) def testAutosubmitVersionBroken(self): bin_path = Path(__file__, '../../../bin/autosubmit').resolve() exit_code, _ = subprocess.getstatusoutput(' '.join([sys.executable, str(bin_path), '-abcdefg'])) - self.assertEquals(1, exit_code) + self.assertEqual(1, exit_code) -- GitLab From 16722ea9dc31433a0f0d24051b87731ea476d1a5 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 8 Mar 2024 14:39:17 +0100 Subject: [PATCH 27/41] Removed argparser --- bin/autosubmit | 2 +- requeriments.txt | 1 - setup.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/bin/autosubmit b/bin/autosubmit index d87c7adce..53a7f58e6 100755 --- a/bin/autosubmit +++ b/bin/autosubmit @@ -36,7 +36,7 @@ def exit_from_error(e: BaseException): try: Log.debug(traceback.format_exc()) except: - Log.debug("No traceback available") + print(traceback.format_exc()) with suppress(FileNotFoundError, PermissionError): os.remove(os.path.join(Log.file_path, "autosubmit.lock")) if isinstance(e, (AutosubmitCritical, AutosubmitError)): diff --git a/requeriments.txt b/requeriments.txt index 5b251a123..4d3ddea93 100644 --- a/requeriments.txt +++ b/requeriments.txt @@ -6,7 +6,6 @@ paramiko>=2.9.2 bcrypt>=3.2 PyNaCl>=1.5.0 configobj>=5.0.6 -argparse>=1.4.0 python-dateutil>=2.8.2 matplotlib<3.6 py3dotplus>=1.1.0 diff --git a/setup.py b/setup.py index 164dae7c7..56cbf3f52 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ setup( url='http://www.bsc.es/projects/earthscience/autosubmit/', download_url='https://earth.bsc.es/wiki/doku.php?id=tools:autosubmit', keywords=['climate', 'weather', 'workflow', 'HPC'], - 
install_requires=['zipp>=3.1.0','ruamel.yaml==0.17.21','cython','autosubmitconfigparser','bcrypt>=3.2','packaging>19','six>=1.10.0','configobj>=5.0.6','argparse>=1.4.0','python-dateutil>=2.8.2','matplotlib<3.6','py3dotplus>=1.1.0','pyparsing>=3.0.7','paramiko>=2.9.2','mock>=4.0.3','portalocker>=2.3.2,<=2.7.0','networkx==2.6.3','requests>=2.27.1','bscearth.utils>=0.5.2','cryptography>=36.0.1','setuptools>=60.8.2','xlib>=0.21','pip>=22.0.3','pythondialog','pytest','nose','coverage','PyNaCl>=1.5.0','Pygments','psutil','rocrate==0.*'], + install_requires=['zipp>=3.1.0','ruamel.yaml==0.17.21','cython','autosubmitconfigparser','bcrypt>=3.2','packaging>19','six>=1.10.0','configobj>=5.0.6','python-dateutil>=2.8.2','matplotlib<3.6','py3dotplus>=1.1.0','pyparsing>=3.0.7','paramiko>=2.9.2','mock>=4.0.3','portalocker>=2.3.2,<=2.7.0','networkx==2.6.3','requests>=2.27.1','bscearth.utils>=0.5.2','cryptography>=36.0.1','setuptools>=60.8.2','xlib>=0.21','pip>=22.0.3','pythondialog','pytest','nose','coverage','PyNaCl>=1.5.0','Pygments','psutil','rocrate==0.*'], classifiers=[ "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.9", -- GitLab From 84249b90494e0a3bcc4025a985eac893ff4b0996 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 8 Mar 2024 17:11:53 +0100 Subject: [PATCH 28/41] Removed argparser changed print for log.info --- autosubmit/autosubmit.py | 2 +- requeriments.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index f40c65b88..6b4c173ce 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -646,7 +646,7 @@ class Autosubmit: args, unknown = parser.parse_known_args() if args.version: - print(Autosubmit.autosubmit_version) + Log.info(Autosubmit.autosubmit_version) return 0 if unknown or args.command is None: parser.print_help() diff --git a/requeriments.txt b/requeriments.txt index 4d3ddea93..9c0395571 100644 --- a/requeriments.txt +++ b/requeriments.txt @@ -1,7 +1,7 @@ zipp>=3.1.0 setuptools>=60.8.2 cython -autosubmitconfigparser==1.0.57 +autosubmitconfigparser==1.0.58 paramiko>=2.9.2 bcrypt>=3.2 PyNaCl>=1.5.0 -- GitLab From f575c45d5d0e84a3beb3d71c5d4efed23542b2d7 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 8 Mar 2024 20:11:35 +0100 Subject: [PATCH 29/41] fixed monitor for sure ( Tested in vml ) --- autosubmit/job/job_list.py | 1 - 1 file changed, 1 deletion(-) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 2bf7933f6..f23ca4e73 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -217,7 +217,6 @@ class JobList(object): if show_log: Log.info("Load finished") if monitor: - as_conf.experiment_data = as_conf.last_experiment_data self._dic_jobs.changes = {} else: self._dic_jobs.compare_backbone_sections() -- GitLab From 5759010155ed00bbb56c9cdc8a9ed8c2f39d014b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 11 Mar 2024 10:14:31 +0100 Subject: [PATCH 30/41] Reverting --version commented the code that should work --- autosubmit/autosubmit.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 6b4c173ce..97a625459 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -167,7 +167,9 @@ class Autosubmit: BasicConfig.read() parser = MyParser( description='Main executable for autosubmit. 
') - parser.add_argument('-v', '--version', dest='version', action='store_true') + # parser.add_argument('-v', '--version', dest='version', action='store_true') Should work but doesn't in a module + parser.add_argument('-v', '--version', action='version', version=Autosubmit.autosubmit_version) + parser.add_argument('-lf', '--logfile', choices=('NO_LOG', 'INFO', 'WARNING', 'DEBUG'), default='DEBUG', type=str, @@ -644,14 +646,22 @@ class Autosubmit: help='Read job files generated by the inspect subcommand.') subparser.add_argument('ID', metavar='ID', help='An ID of a Workflow (eg a000) or a Job (eg a000_20220401_fc0_1_1_APPLICATION).') - args, unknown = parser.parse_known_args() - if args.version: - Log.info(Autosubmit.autosubmit_version) - return 0 - if unknown or args.command is None: + args = parser.parse_args() + if args.command is None: parser.print_help() - return 1 + parser.exit() + # Should work but doesn't in a module + # args, unknown = parser.parse_known_args() + # if args.version: + # print(Autosubmit.autosubmit_version) + # return 0 + # if unknown or args.command is None: + # parser.print_help() + # return 1 + except SystemExit: + return 0 except BaseException as e: + parser.print_help() raise AutosubmitCritical( "Incorrect arguments for this command", 7011) -- GitLab From 85424cc84ef158a78505b6d2b3c9d7cd7dd87076 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 11 Mar 2024 10:41:13 +0100 Subject: [PATCH 31/41] Revert "Reverting --version commented the code that should work" This reverts commit a46196294d83e955a362feee19817ced49d9913e. --- autosubmit/autosubmit.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 97a625459..6b4c173ce 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -167,9 +167,7 @@ class Autosubmit: BasicConfig.read() parser = MyParser( description='Main executable for autosubmit. 
') - # parser.add_argument('-v', '--version', dest='version', action='store_true') Should work but doesn't in a module - parser.add_argument('-v', '--version', action='version', version=Autosubmit.autosubmit_version) - + parser.add_argument('-v', '--version', dest='version', action='store_true') parser.add_argument('-lf', '--logfile', choices=('NO_LOG', 'INFO', 'WARNING', 'DEBUG'), default='DEBUG', type=str, @@ -646,22 +644,14 @@ class Autosubmit: help='Read job files generated by the inspect subcommand.') subparser.add_argument('ID', metavar='ID', help='An ID of a Workflow (eg a000) or a Job (eg a000_20220401_fc0_1_1_APPLICATION).') - args = parser.parse_args() - if args.command is None: + args, unknown = parser.parse_known_args() + if args.version: + Log.info(Autosubmit.autosubmit_version) + return 0 + if unknown or args.command is None: parser.print_help() - parser.exit() - # Should work but doesn't in a module - # args, unknown = parser.parse_known_args() - # if args.version: - # print(Autosubmit.autosubmit_version) - # return 0 - # if unknown or args.command is None: - # parser.print_help() - # return 1 - except SystemExit: - return 0 + return 1 except BaseException as e: - parser.print_help() raise AutosubmitCritical( "Incorrect arguments for this command", 7011) -- GitLab From 4158ed27fb2afd0d77e9f528de6bed0613b9bc31 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 11 Mar 2024 10:50:09 +0100 Subject: [PATCH 32/41] Added unbuffered setting --- autosubmit/autosubmit.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 6b4c173ce..aed7a07ba 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -157,12 +157,18 @@ class Autosubmit: exit = False + @staticmethod + def environ_init(): + """Initialise AS environment.""" + # Python output buffering delays appearance of stdout and stderr + # when output is not directed to a terminal + os.environ['PYTHONUNBUFFERED'] = 'true' @staticmethod def parse_args(): """ Parse arguments given to an executable and start execution of command given """ - + Autosubmit.environ_init() try: BasicConfig.read() parser = MyParser( -- GitLab From c882b835625dfc8dd0f7b96357a593d47c207ba8 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 11 Mar 2024 13:28:20 +0100 Subject: [PATCH 33/41] Allows -txt from non-owner --- autosubmit/autosubmit.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index aed7a07ba..53e7f528a 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -830,6 +830,8 @@ class Autosubmit: os.mkdir(aslogs_path) if owner: os.chmod(tmp_path, 0o775) + with suppress(PermissionError, FileNotFoundError, Exception): # for -txt option + os.chmod(f'{exp_path}/status', 0o775) Log.set_file(os.path.join(aslogs_path, args.command + '.log'), "out", log_level) Log.set_file(os.path.join(aslogs_path, args.command + '_err.log'), "err") -- GitLab From a0e128e1018bb7d3c6426957210d6d56913911bf Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 11 Mar 2024 14:56:18 +0100 Subject: [PATCH 34/41] changelog --- CHANGELOG | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index e69de29bb..63750b5a8 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -0,0 +1,16 @@ +4.1 - Workflow optimizations and bug fixes + +- Improved the performance and memory usage of the workflow generation process. + - Improved the performance and memory usage of jobs generation process. 
+ - Improved the performance and memory usage of the dependency generation process. +- Improved the performance and memory usage of the workflow visualization process. +- Added a new filter to setstatus ( -ftcs ) to filter by split. +- Added -no-requeue to avoid requeueing jobs. +- Added a mechanism to detect duplicated jobs. +- Fixed multiples issues with the splits usage. +- Fixed multiples issues with Totaljobs. +- Reworked the deadlock detection mechanism. +- Changed multiple debug messages to be more clear. +- Changed the load/save pkl procedure +- Fixed issues with check command +- Added previous keyword. -- GitLab From e9ada8c5ebdfef266dad64db10d2cf14ad883ebd Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 11 Mar 2024 15:00:53 +0100 Subject: [PATCH 35/41] changelog --- CHANGELOG | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 63750b5a8..87afef2b1 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,13 @@ -4.1 - Workflow optimizations and bug fixes + +4.1.2 - Bug fixes +================= +- Fixed issues with version. +- Fixed issues with the duplication of jobs when using the heterogeneous option. +- Fixed some error messages. +- Fixed issues with monitoring non-owned experiments. + +4.1.1 - Workflow optimizations and bug fixes +========================================== - Improved the performance and memory usage of the workflow generation process. - Improved the performance and memory usage of jobs generation process. @@ -12,5 +21,7 @@ - Reworked the deadlock detection mechanism. - Changed multiple debug messages to be more clear. - Changed the load/save pkl procedure -- Fixed issues with check command +- Fixed issues with check command and additional files regex. - Added previous keyword. +- Fixed an issue with the historical db. +- Fixed an issue with historical db logs. -- GitLab From 07191ce2d80eafb1c791a99bd5b4cea9b510a40f Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 11 Mar 2024 15:01:27 +0100 Subject: [PATCH 36/41] changelog --- CHANGELOG | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 87afef2b1..a2e6dd86a 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -9,6 +9,8 @@ 4.1.1 - Workflow optimizations and bug fixes ========================================== +In this version, Autosubmit supports much larger workflows and has improved performance and memory usage. We have also fixed several bugs and added new features. + - Improved the performance and memory usage of the workflow generation process. - Improved the performance and memory usage of jobs generation process. - Improved the performance and memory usage of the dependency generation process. -- GitLab From 5a5ba99057e8c39c41ed340c4cef441fe2f4760b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 18 Mar 2024 09:09:43 +0100 Subject: [PATCH 37/41] fixed some typos --- CHANGELOG | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index a2e6dd86a..5e69a0f3d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,3 @@ - 4.1.2 - Bug fixes ================= - Fixed issues with version. @@ -9,21 +8,22 @@ 4.1.1 - Workflow optimizations and bug fixes ========================================== -In this version, Autosubmit supports much larger workflows and has improved performance and memory usage. We have also fixed several bugs and added new features. +Autosubmit supports much larger workflows in this version and has improved performance and memory usage. 
We have also fixed several bugs and added new features. - Improved the performance and memory usage of the workflow generation process. - - Improved the performance and memory usage of jobs generation process. + - Improved the performance and memory usage of the jobs generation process. - Improved the performance and memory usage of the dependency generation process. - Improved the performance and memory usage of the workflow visualization process. - Added a new filter to setstatus ( -ftcs ) to filter by split. - Added -no-requeue to avoid requeueing jobs. -- Added a mechanism to detect duplicated jobs. -- Fixed multiples issues with the splits usage. -- Fixed multiples issues with Totaljobs. +- A mechanism was added to detect duplicate jobs. +- Fixed multiple issues with the splits usage. +- Fixed multiple issues with Totaljobs. - Reworked the deadlock detection mechanism. -- Changed multiple debug messages to be more clear. +- Changed multiple debug messages to make them more straightforward. - Changed the load/save pkl procedure - Fixed issues with check command and additional files regex. -- Added previous keyword. +- Added the previous keyword. - Fixed an issue with the historical db. - Fixed an issue with historical db logs. + -- GitLab From 002ee5e79e19ac4ab003a6a429d5ceb6de58e7a2 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 20 Mar 2024 11:45:26 +0100 Subject: [PATCH 38/41] Added some error detection --- autosubmit/job/job_utils.py | 42 +++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/autosubmit/job/job_utils.py b/autosubmit/job/job_utils.py index 97d74d22a..3fcd40920 100644 --- a/autosubmit/job/job_utils.py +++ b/autosubmit/job/job_utils.py @@ -68,6 +68,46 @@ def calendar_unitsize_getlowersize(unitsize): else: return list(CALENDAR_UNITSIZE_ENUM.keys())[unit_value - 1] +def calendar_get_month_days(date_str): + """ + Get the number of days in a month + :param date_str: Date in string format (YYYYMMDD) + :return: + """ + year = int(date_str[0:4]) + month = int(date_str[4:6]) + if month == 2: + if is_leap_year(year): + return 29 + else: + return 28 + elif month in [4, 6, 9, 11]: + return 30 + else: + return 31 + +def calendar_split_size_isvalid(datestr, size, unit_size): + """ + Check if the split size is valid for the calendar + :param datestr: Date in string format (YYYYMMDD) + :param size: Split size + :param unit_size: Split unit size ( hour, day, month, year) + :return: + """ + if unit_size == "hour": + return size <= 24 + elif unit_size == "day": + return size <= calendar_get_month_days(datestr) + elif unit_size == "month": + return size <= 12 + elif unit_size == "year": + return size <= 1 + else: + return False + + + + def calendar_chunk_section(exp_data, section, date, chunk): @@ -109,6 +149,8 @@ def calendar_chunk_section(exp_data, section, date, chunk): else: num_max_splits = run_days split_size = get_split_size(exp_data, section) + if not calendar_split_size_isvalid(date_str, split_size, split_unit): + raise AutosubmitCritical(f"Invalid split size for the calendar. The split size is {split_size} and the unit is {split_unit}.") splits = num_max_splits / split_size if not splits.is_integer() and split_policy == "flexible": Log.warning(f"The number of splits:{num_max_splits}/{split_size} is not an integer. 
The number of splits will be rounded up due the flexible split policy.\n You can modify the SPLITPOLICY parameter in the section {section} to 'strict' to avoid this behavior.") -- GitLab From f65b5cd7c9d45b35eb40bb60b39112b9ee2a7579 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 20 Mar 2024 12:01:33 +0100 Subject: [PATCH 39/41] Added some error detection --- autosubmit/job/job_utils.py | 49 +++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/autosubmit/job/job_utils.py b/autosubmit/job/job_utils.py index 3fcd40920..d407f0160 100644 --- a/autosubmit/job/job_utils.py +++ b/autosubmit/job/job_utils.py @@ -86,25 +86,44 @@ def calendar_get_month_days(date_str): else: return 31 -def calendar_split_size_isvalid(datestr, size, unit_size): + +def calendar_split_size_isvalid(date_str, split_size, split_unit, chunk_unit, chunk_length): """ Check if the split size is valid for the calendar - :param datestr: Date in string format (YYYYMMDD) - :param size: Split size - :param unit_size: Split unit size ( hour, day, month, year) - :return: + :param date_str: Date in string format (YYYYMMDD) + :param split_size: Size of the split + :param split_unit: Unit of the split + :param chunk_unit: Unit of the chunk + :param chunk_length: Size of the chunk + :return: Boolean """ - if unit_size == "hour": - return size <= 24 - elif unit_size == "day": - return size <= calendar_get_month_days(datestr) - elif unit_size == "month": - return size <= 12 - elif unit_size == "year": - return size <= 1 + if is_leap_year(int(date_str[0:4])): + num_days_in_a_year = 366 else: - return False + num_days_in_a_year = 365 + if chunk_unit == "year": + chunk_size_in_hours = num_days_in_a_year * 24 * chunk_length + elif chunk_unit == "month": + chunk_size_in_hours = calendar_get_month_days(date_str) * 24 * chunk_length + elif chunk_unit == "day": + chunk_size_in_hours = 24 * chunk_length + else: + chunk_size_in_hours = chunk_length + + if split_unit == "year": + split_size_in_hours = num_days_in_a_year * 24 * split_size + elif split_unit == "month": + split_size_in_hours = calendar_get_month_days(date_str) * 24 * split_size + elif split_unit == "day": + split_size_in_hours = 24 * split_size + else: + split_size_in_hours = split_size + if split_size_in_hours != chunk_size_in_hours: + Log.warning(f"After calculations, the total sizes are: SplitSize*SplitUnitSize:{split_size_in_hours} hours, ChunkSize*ChunkUnitsize:{chunk_size_in_hours} hours.") + else: + Log.debug(f"Split size in hours: {split_size_in_hours}, Chunk size in hours: {chunk_size_in_hours}") + return split_size_in_hours <= chunk_size_in_hours @@ -149,7 +168,7 @@ def calendar_chunk_section(exp_data, section, date, chunk): else: num_max_splits = run_days split_size = get_split_size(exp_data, section) - if not calendar_split_size_isvalid(date_str, split_size, split_unit): + if not calendar_split_size_isvalid(date_str, split_size, split_unit, chunk_unit, chunk_length): raise AutosubmitCritical(f"Invalid split size for the calendar. 
The split size is {split_size} and the unit is {split_unit}.") splits = num_max_splits / split_size if not splits.is_integer() and split_policy == "flexible": -- GitLab From a240f2618564827435ff6e496b0786424d57b967 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 20 Mar 2024 12:21:53 +0100 Subject: [PATCH 40/41] fixing some stuff that was not merged correctly at some point --- autosubmit/job/job.py | 145 +++++++++++++++++++----------------------- test/unit/test_job.py | 1 + 2 files changed, 66 insertions(+), 80 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 301fe9510..23879febf 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1768,90 +1768,75 @@ class Job(object): self.total_jobs = job_data.get("TOTALJOBS",job_data.get("TOTALJOBS", job_platform.get("TOTALJOBS", job_platform.get("TOTAL_JOBS", -1)))) self.max_waiting_jobs = job_data.get("MAXWAITINGJOBS",job_data.get("MAXWAITINGJOBS", job_platform.get("MAXWAITINGJOBS", job_platform.get("MAX_WAITING_JOBS", -1)))) - def update_job_parameters(self,as_conf, parameters): - self.splits = as_conf.jobs_data[self.section].get("SPLITS", None) - self.delete_when_edgeless = as_conf.jobs_data[self.section].get("DELETE_WHEN_EDGELESS", True) - self.check = as_conf.jobs_data[self.section].get("CHECK", False) - self.check_warnings = as_conf.jobs_data[self.section].get("CHECK_WARNINGS", False) - if self.checkpoint: # To activate placeholder sustitution per in the template - parameters["AS_CHECKPOINT"] = self.checkpoint - parameters['JOBNAME'] = self.name - parameters['FAIL_COUNT'] = str(self.fail_count) - parameters['SDATE'] = self.sdate - parameters['MEMBER'] = self.member - parameters['SPLIT'] = self.split - parameters['SPLITS'] = self.splits - parameters['DELAY'] = self.delay - parameters['FREQUENCY'] = self.frequency - parameters['SYNCHRONIZE'] = self.synchronize - parameters['PACKED'] = self.packed - parameters['CHUNK'] = 1 - parameters['RETRIALS'] = self.retrials - parameters['DELAY_RETRIALS'] = self.delay_retrials - parameters['DELETE_WHEN_EDGELESS'] = self.delete_when_edgeless + def calendar_split(self, as_conf, parameters): + """ + Calendar for splits + :param parameters: + :return: + """ + # Calendar struct type numbered ( year, month, day, hour ) + + + job_data = as_conf.jobs_data.get(self.section,{}) + if job_data.get("SPLITS", None) and self.running != "once": # once jobs has no date + # total_split = int(self.splits) + split_unit = get_split_size_unit(as_conf.experiment_data, self.section) + cal = str(parameters.get('EXPERIMENT.CALENDAR', "standard")).lower() + split_length = get_split_size(as_conf.experiment_data, self.section) + start_date = parameters.get('CHUNK_START_DATE', None) + if start_date: + self.date = datetime.datetime.strptime(start_date, "%Y%m%d") + split_start = self.split_start_date(self.date, int(self.split), split_length, split_unit, cal) + split_end = self.split_end_date(split_start, split_length, split_unit, cal) + if split_unit == 'hour': + split_end_1 = split_end + else: + split_end_1 = previous_day(split_end, cal) + + parameters['SPLIT'] = self.split + parameters['SPLITSCALENDAR'] = cal + parameters['SPLITSIZE'] = split_length + parameters['SPLITSIZEUNIT'] = split_unit + + parameters['SPLIT_START_DATE'] = date2str( + split_start, self.date_format) + parameters['SPLIT_START_YEAR'] = str(split_start.year) + parameters['SPLIT_START_MONTH'] = str(split_start.month).zfill(2) + parameters['SPLIT_START_DAY'] = str(split_start.day).zfill(2) + parameters['SPLIT_START_HOUR'] = 
str(split_start.hour).zfill(2) + + parameters['SPLIT_SECOND_TO_LAST_DATE'] = date2str( + split_end_1, self.date_format) + parameters['SPLIT_SECOND_TO_LAST_YEAR'] = str(split_end_1.year) + parameters['SPLIT_SECOND_TO_LAST_MONTH'] = str(split_end_1.month).zfill(2) + parameters['SPLIT_SECOND_TO_LAST_DAY'] = str(split_end_1.day).zfill(2) + parameters['SPLIT_SECOND_TO_LAST_HOUR'] = str(split_end_1.hour).zfill(2) + + parameters['SPLIT_END_DATE'] = date2str( + split_end, self.date_format) + parameters['SPLIT_END_YEAR'] = str(split_end.year) + parameters['SPLIT_END_MONTH'] = str(split_end.month).zfill(2) + parameters['SPLIT_END_DAY'] = str(split_end.day).zfill(2) + parameters['SPLIT_END_HOUR'] = str(split_end.hour).zfill(2) + if int(self.split) == 1: + parameters['SPLIT_FIRST'] = 'TRUE' + else: + parameters['SPLIT_FIRST'] = 'FALSE' - def calendar_split(self, as_conf, parameters): - """ - Calendar for splits - :param parameters: - :return: - """ - # Calendar struct type numbered ( year, month, day, hour ) - - - job_data = as_conf.jobs_data.get(self.section,{}) - if job_data.get("SPLITS", None) and self.running != "once": # once jobs has no date - # total_split = int(self.splits) - split_unit = get_split_size_unit(as_conf.experiment_data, self.section) - cal = str(parameters.get('EXPERIMENT.CALENDAR', "standard")).lower() - split_length = get_split_size(as_conf.experiment_data, self.section) - start_date = parameters.get('CHUNK_START_DATE', None) - if start_date: - self.date = datetime.datetime.strptime(start_date, "%Y%m%d") - split_start = self.split_start_date(self.date, int(self.split), split_length, split_unit, cal) - split_end = self.split_end_date(split_start, split_length, split_unit, cal) - if split_unit == 'hour': - split_end_1 = split_end - else: - split_end_1 = previous_day(split_end, cal) - - parameters['SPLIT'] = self.split - parameters['SPLITSCALENDAR'] = cal - parameters['SPLITSIZE'] = split_length - parameters['SPLITSIZEUNIT'] = split_unit - - parameters['SPLIT_START_DATE'] = date2str( - split_start, self.date_format) - parameters['SPLIT_START_YEAR'] = str(split_start.year) - parameters['SPLIT_START_MONTH'] = str(split_start.month).zfill(2) - parameters['SPLIT_START_DAY'] = str(split_start.day).zfill(2) - parameters['SPLIT_START_HOUR'] = str(split_start.hour).zfill(2) - - parameters['SPLIT_SECOND_TO_LAST_DATE'] = date2str( - split_end_1, self.date_format) - parameters['SPLIT_SECOND_TO_LAST_YEAR'] = str(split_end_1.year) - parameters['SPLIT_SECOND_TO_LAST_MONTH'] = str(split_end_1.month).zfill(2) - parameters['SPLIT_SECOND_TO_LAST_DAY'] = str(split_end_1.day).zfill(2) - parameters['SPLIT_SECOND_TO_LAST_HOUR'] = str(split_end_1.hour).zfill(2) - - parameters['SPLIT_END_DATE'] = date2str( - split_end, self.date_format) - parameters['SPLIT_END_YEAR'] = str(split_end.year) - parameters['SPLIT_END_MONTH'] = str(split_end.month).zfill(2) - parameters['SPLIT_END_DAY'] = str(split_end.day).zfill(2) - parameters['SPLIT_END_HOUR'] = str(split_end.hour).zfill(2) - if int(self.split) == 1: - parameters['SPLIT_FIRST'] = 'TRUE' - else: - parameters['SPLIT_FIRST'] = 'FALSE' + # if int(total_split) == int(self.split): + # parameters['SPLIT_LAST'] = 'TRUE' + # else: + # parameters['SPLIT_LAST'] = 'FALSE' - # if int(total_split) == int(self.split): - # parameters['SPLIT_LAST'] = 'TRUE' - # else: - # parameters['SPLIT_LAST'] = 'FALSE' + return parameters - return parameters + def calendar_chunk(self, parameters): + """ + Calendar for chunks + :param parameters: + :return: + """ if self.date is not None 
and len(str(self.date)) > 0:
            if self.chunk is None and len(str(self.chunk)) > 0:
                chunk = 1
diff --git a/test/unit/test_job.py b/test/unit/test_job.py
index d1526d9bb..fe41cc439 100644
--- a/test/unit/test_job.py
+++ b/test/unit/test_job.py
@@ -1323,6 +1323,7 @@ CONFIG:
             new=True,
             run_only_members=config.get_member_list(run_only=True),
             show_log=True,
+            create=True,
         )
         job_list = job_list.get_job_list()
         self.assertEqual(24, len(job_list))
--
GitLab

From a782b4379501a8dea38e4241a67fd6784ef1f3c0 Mon Sep 17 00:00:00 2001
From: dbeltran
Date: Wed, 20 Mar 2024 12:36:24 +0100
Subject: [PATCH 41/41] Fixed code

---
 autosubmit/job/job.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py
index 23879febf..bed0521b3 100644
--- a/autosubmit/job/job.py
+++ b/autosubmit/job/job.py
@@ -1786,8 +1786,8 @@ class Job(object):
             start_date = parameters.get('CHUNK_START_DATE', None)
             if start_date:
                 self.date = datetime.datetime.strptime(start_date, "%Y%m%d")
-            split_start = self.split_start_date(self.date, int(self.split), split_length, split_unit, cal)
-            split_end = self.split_end_date(split_start, split_length, split_unit, cal)
+            split_start = chunk_start_date(self.date, int(self.split), split_length, split_unit, cal)
+            split_end = chunk_end_date(split_start, split_length, split_unit, cal)
             if split_unit == 'hour':
                 split_end_1 = split_end
             else:
--
GitLab
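
A note on the -v/--version handling that patches 24 through 31 iterate on: argparse's
built-in action='version' prints the version and then raises SystemExit, which is
awkward when parse_args() runs as a module function expected to return an exit code.
The sketch below is a minimal stand-alone reproduction of the store_true plus
parse_known_args() approach, not the real Autosubmit wiring; the parser description
and the 'run' subcommand are illustrative only.

    import argparse
    import sys

    AUTOSUBMIT_VERSION = "4.1.2"  # placeholder; the real value is Autosubmit.autosubmit_version

    def parse_args(argv=None) -> int:
        parser = argparse.ArgumentParser(description='Sketch of the -v handling.')
        # store_true instead of action='version': the version action calls
        # sys.exit() internally, bypassing a "return an exit code" contract.
        parser.add_argument('-v', '--version', dest='version', action='store_true')
        subparsers = parser.add_subparsers(dest='command')
        subparsers.add_parser('run', help='illustrative subcommand only')
        # parse_known_args() collects unrecognized options instead of exiting,
        # so an invalid flag such as -abcdefg maps to a deliberate exit code 1.
        args, unknown = parser.parse_known_args(argv)
        if args.version:
            print(AUTOSUBMIT_VERSION)
            return 0
        if unknown or args.command is None:
            parser.print_help()
            return 1
        return 0

    if __name__ == '__main__':
        sys.exit(parse_args())

This mirrors the expectations in test/unit/test_version.py above: '-v' prints the
version and exits 0, while an unknown flag exits 1.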
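
A caveat on environ_init() from patch 32: PYTHONUNBUFFERED is read at interpreter
startup, so setting it from inside a running process only affects Python child
processes that inherit the environment, not the buffering of the current process's
own stdout. If in-process line buffering is also wanted, one option (Python 3.7+)
is reconfigure(); this is an alternative sketch under that assumption, not what the
patch does:

    import os
    import sys

    def environ_init():
        # Inherited by any Python child process spawned later (e.g. via subprocess).
        os.environ['PYTHONUNBUFFERED'] = 'true'
        # The current process's streams were configured at startup;
        # reconfigure() changes their buffering after the fact.
        sys.stdout.reconfigure(line_buffering=True)
        sys.stderr.reconfigure(line_buffering=True)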
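
The split-size validation added in patches 38 and 39 normalizes both the chunk and
the split to hours before comparing them. The sketch below restates that logic with
the standard library's calendar module; the helper names size_in_hours and
split_size_is_valid are this note's own, not the names used in
autosubmit/job/job_utils.py.

    import calendar

    def size_in_hours(date_str: str, size: int, unit: str) -> int:
        """Normalize a chunk or split size to hours for a like-for-like comparison."""
        year, month = int(date_str[0:4]), int(date_str[4:6])
        days_in_year = 366 if calendar.isleap(year) else 365
        factor = {
            'year': days_in_year * 24,
            'month': calendar.monthrange(year, month)[1] * 24,  # leap-aware month length
            'day': 24,
            'hour': 1,
        }[unit]
        return size * factor

    def split_size_is_valid(date_str, split_size, split_unit, chunk_unit, chunk_length):
        # A split may not span more hours than the chunk that contains it.
        return (size_in_hours(date_str, split_size, split_unit)
                <= size_in_hours(date_str, chunk_length, chunk_unit))

For example, split_size_is_valid('20240201', 31, 'day', 'month', 1) is False:
February 2024 contributes 29 * 24 = 696 chunk hours against 31 * 24 = 744 split hours.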