From 97773f5eef2a9671a623aefb81b528f54b36be69 Mon Sep 17 00:00:00 2001 From: "Bruno P. Kinoshita" Date: Mon, 11 Nov 2024 12:43:51 +0100 Subject: [PATCH 1/2] GL-945: Fix portalocker error handling. This change modifies how Autosubmit reacts to exceptions raised in commands like `autosubmit run` or `autosubmit create`. Previously, Autosubmit would capture exceptions raised during the execution of these commands it in a central function, however, in this function it would log the error and then release the portalocker lock. This caused the issue (reported ~2 years ago?) that running these commands three times caused errors. On the first run the lock was created. On the second command run the command would be prevented from running by portalocker and the error logged but the lock was also deleted. And then on the third run there was no lock, so it would cause the issue reported. Now, the code will never release the lock, until the context-manager created finalizes, or someone/something deletes the lock file. Errors are still captured and logged, but if the error/exception raised is a portalocker exception, we log but without releasing the lock (it is like letting a prisoner go if we realize it is in jail, and then complaining that it was set free :) --- .coveragerc | 53 ++++++++++++++++++++++++ CHANGELOG | 3 ++ autosubmit/__init__.py | 70 ++++++++++++++++++++++++++++++++ autosubmit/autosubmit.py | 55 +++++++------------------ bin/autosubmit | 46 +++++---------------- test/unit/test_lockfile.py | 82 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 231 insertions(+), 78 deletions(-) create mode 100644 .coveragerc create mode 100644 test/unit/test_lockfile.py diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 000000000..7500d7d4b --- /dev/null +++ b/.coveragerc @@ -0,0 +1,53 @@ +# This is the Coverage.py configuration file. This is used by CI when running +# the tests and collecting coverage + +[run] +branch = True +cover_pylib = False +concurrency = thread +data_file = .coverage +disable_warnings = + trace-changed + module-not-python + module-not-imported + no-data-collected + module-not-measured +omit = + tests/* +parallel = True +source = ./autosubmit +timid = False + +[report] +exclude_lines = + pragma: no cover + + # Don't complain if tests don't hit defensive assertion code: + raise NotImplementedError + return NotImplemented + + # Ignore type checking code: + if (typing\.)?TYPE_CHECKING: + @overload( |$) + + # Don't complain about ellipsis (exception classes, typing overloads etc): + \.\.\. + + # Ignore abstract methods + @(abc\.)?abstractmethod + +fail_under=0 +ignore_errors = False +omit = + tests/* +precision = 2 +show_missing = False +skip_covered = False +sort = Name + +[html] +directory = htmlcov + +[xml] +output = coverage.xml +package_depth = 99 diff --git a/CHANGELOG b/CHANGELOG index ef6a0eec1..6cb3b3b44 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -9,6 +9,9 @@ - #1397: Use `tini` as entrypoint in our Docker image. - #1428: Use natural sort order for graph entries (adapted from Cylc), enable doctests to be executed with pytest. +- #945: Fix portalocker releasing the lock when a portalocker exception + is raised. Now it prints the warnings but only releases the lock when + the command finished (successfully or not). 4.1.10 - Hotfix =============== diff --git a/autosubmit/__init__.py b/autosubmit/__init__.py index e69de29bb..3281913d2 100644 --- a/autosubmit/__init__.py +++ b/autosubmit/__init__.py @@ -0,0 +1,70 @@ +import traceback +from contextlib import suppress +from os import _exit # type: ignore +from pathlib import Path +from typing import Union + +from portalocker.exceptions import BaseLockException + +from log.log import Log, AutosubmitCritical, AutosubmitError + + +def delete_lock_file(base_path: str = Log.file_path, lock_file: str = 'autosubmit.lock') -> None: + """Delete lock file if it exists. Suppresses permission errors raised. + + :param base_path: Base path to locate the lock file. Defaults to the experiment ``tmp`` directory. + :type base_path: str + :param lock_file: The name of the lock file. Defaults to ``autosubmit.lock``. + :type lock_file: str + :return: None + """ + with suppress(PermissionError): + Path(base_path, lock_file).unlink(missing_ok=True) + + +def exit_from_error(e: BaseException) -> None: + """Called by ``Autosubmit`` when an exception is raised during a command execution. + + Prints the exception in ``DEBUG`` level. + + Prints the exception in ``CRITICAL`` if is it an ``AutosubmitCritical`` or an + ``AutosubmitError`` exception. + + Exceptions raised by ``porta-locker` library print a message informing the user + about the locked experiment. Other exceptions raised cause the lock to be deleted. + + After printing the exception, this function calls ``os._exit(1)``, which will + forcefully exit the executable running. + + :param e: The exception being raised. + :type e: BaseException + :return: None + """ + trace = traceback.format_exc() + try: + Log.debug(trace) + except: + print(trace) + + is_portalocker_error = isinstance(e, BaseLockException) + is_autosubmit_error = isinstance(e, (AutosubmitCritical, AutosubmitError)) + + if isinstance(e, BaseLockException): + Log.warning('Another Autosubmit instance using the experiment\n. Stop other Autosubmit instances that are ' + 'using the experiment or delete autosubmit.lock file located on the /tmp folder.') + else: + delete_lock_file() + + if is_autosubmit_error: + e: Union[AutosubmitError, AutosubmitCritical] = e # type: ignore + if e.trace: + Log.debug("Trace: {0}", str(e.trace)) + Log.critical("{1} [eCode={0}]", e.code, e.message) + + if not is_portalocker_error and not is_autosubmit_error: + msg = "Unexpected error: {0}.\n Please report it to Autosubmit Developers through Git" + args = [str(e)] + Log.critical(msg.format(*args)) + + Log.info("More info at https://autosubmit.readthedocs.io/en/master/troubleshooting/error-codes.html") + _exit(1) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index f36565694..0b1dda208 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -21,7 +21,6 @@ import requests # You should have received a copy of the GNU General Public License # along with Autosubmit. If not, see . import threading -import traceback from bscearth.utils.date import date2str from configparser import ConfigParser from distutils.util import strtobool @@ -74,10 +73,11 @@ import random import signal import datetime # import log.fd_show as fd_show -import portalocker -from importlib.resources import read_text, files as read_files +from importlib.resources import files as read_files from importlib.metadata import version from collections import defaultdict +from portalocker import Lock +from portalocker.exceptions import BaseLockException from pyparsing import nestedExpr from .history.experiment_status import ExperimentStatus from .history.experiment_history import ExperimentHistory @@ -2188,10 +2188,8 @@ class Autosubmit: raise AutosubmitCritical("Failure during the loading of the experiment configuration, check file paths", 7014, str(e)) - # checking if there is a lock file to avoid multiple running on the same expid try: - # Portalocker is used to avoid multiple autosubmit running on the same experiment, we have to change this system in #806 - with portalocker.Lock(os.path.join(tmp_path, 'autosubmit.lock'), timeout=1): + with Lock(os.path.join(tmp_path, 'autosubmit.lock'), timeout=1): try: Log.debug("Preparing run") # This function is called only once, when the experiment is started. It is used to initialize the experiment and to check the correctness of the configuration files. @@ -2382,10 +2380,7 @@ class Autosubmit: 7051, e.message) except AutosubmitCritical as e: # Critical errors can't be recovered. Failed configuration or autosubmit error raise AutosubmitCritical(e.message, e.code, e.trace) - except (portalocker.AlreadyLocked, portalocker.LockException) as e: - message = "We have detected that there is another Autosubmit instance using the experiment\n. Stop other Autosubmit instances that are using the experiment or delete autosubmit.lock file located on tmp folder" - raise AutosubmitCritical(message, 7000) - except BaseException as e: + except BaseException: raise # If this happens, there is a bug in the code or an exception not-well caught Log.result("No more jobs to run.") if not did_run and len(job_list.get_completed_without_logs()) > 0: @@ -2438,14 +2433,13 @@ class Autosubmit: exp_history.finish_current_experiment_run() except Exception: Log.warning("Database is locked") - except (portalocker.AlreadyLocked, portalocker.LockException) as e: - message = "We have detected that there is another Autosubmit instance using the experiment\n. Stop other Autosubmit instances that are using the experiment or delete autosubmit.lock file located on tmp folder" + except BaseLockException: terminate_child_process(expid) - raise AutosubmitCritical(message, 7000) - except AutosubmitCritical as e: + raise + except AutosubmitCritical: terminate_child_process(expid) raise - except BaseException as e: + except BaseException: terminate_child_process(expid) raise finally: @@ -4076,7 +4070,7 @@ class Autosubmit: backup_pkl_path = os.path.join( pkl_folder_path, "job_list_{}_backup.pkl".format(expid)) try: - with portalocker.Lock(os.path.join(tmp_path, 'autosubmit.lock'), timeout=1): + with Lock(os.path.join(tmp_path, 'autosubmit.lock'), timeout=1): # Not locked Log.info("Looking for backup file {}".format(backup_pkl_path)) if os.path.exists(backup_pkl_path): @@ -4132,13 +4126,8 @@ class Autosubmit: else: Log.info( "Backup file not found. Pkl restore operation stopped. No changes have been made.") - except (portalocker.AlreadyLocked, portalocker.LockException) as e: - message = "Another Autosubmit instance using the experiment\n. Stop other Autosubmit instances that are using the experiment or delete autosubmit.lock file located on the /tmp folder." - raise AutosubmitCritical(message, 7000) except AutosubmitCritical as e: raise AutosubmitCritical(e.message, e.code, e.trace) - except BaseException as e: - raise @staticmethod def database_backup(expid): @@ -4506,8 +4495,7 @@ class Autosubmit: Autosubmit._check_ownership(expid, raise_error=True) exp_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid) tmp_path = os.path.join(exp_path, BasicConfig.LOCAL_TMP_DIR) - # Encapsulating the lock - with portalocker.Lock(os.path.join(tmp_path, 'autosubmit.lock'), timeout=1) as fh: + with Lock(os.path.join(tmp_path, 'autosubmit.lock'), timeout=1) as fh: try: Log.info( "Preparing .lock file to avoid multiple instances with same expid.") @@ -4669,23 +4657,14 @@ class Autosubmit: Autosubmit.detail(job_list) return True # catching Exception - except KeyboardInterrupt as e: + except KeyboardInterrupt: # Setting signal handler to handle subsequent CTRL-C signal.signal(signal.SIGINT, signal_handler_create) fh.flush() os.fsync(fh.fileno()) raise AutosubmitCritical("Stopped by user input", 7010) - except BaseException as e: + except BaseException: raise - except (portalocker.AlreadyLocked, portalocker.LockException) as e: - message = "We have detected that there is another Autosubmit instance using the experiment\n. Stop other Autosubmit instances that are using the experiment or delete autosubmit.lock file located on tmp folder" - raise AutosubmitCritical(message, 7000) - except AutosubmitError as e: - raise - except AutosubmitCritical as e: - raise - except BaseException as e: - raise finally: if profile: profiler.stop() @@ -5211,9 +5190,8 @@ class Autosubmit: tmp_path = os.path.join(exp_path, BasicConfig.LOCAL_TMP_DIR) section_validation_message = " " job_validation_message = " " - # checking if there is a lock file to avoid multiple running on the same expid try: - with portalocker.Lock(os.path.join(tmp_path, 'autosubmit.lock'), timeout=1): + with Lock(os.path.join(tmp_path, 'autosubmit.lock'), timeout=1): Log.info( "Preparing .lock file to avoid multiple instances with same expid.") @@ -5452,11 +5430,6 @@ class Autosubmit: groups=groups_dict, job_list_object=job_list) return True - except (portalocker.AlreadyLocked, portalocker.LockException) as e: - message = "We have detected that there is another Autosubmit instance using the experiment\n. Stop other Autosubmit instances that are using the experiment or delete autosubmit.lock file located on tmp folder" - raise AutosubmitCritical(message, 7000) - except (AutosubmitError, AutosubmitCritical): - raise except BaseException as e: raise AutosubmitCritical( "An Error has occurred while setting some of the workflow jobs, no changes were made", 7040, str(e)) diff --git a/bin/autosubmit b/bin/autosubmit index d08f47593..6f53abf1f 100755 --- a/bin/autosubmit +++ b/bin/autosubmit @@ -20,54 +20,26 @@ """Script for handling experiment monitoring""" import os import sys -import traceback -from contextlib import suppress -scriptdir = os.path.abspath(os.path.dirname(sys.argv[0])) -sys.path.append(scriptdir) -sys.path.append(os.path.normpath(os.path.join(scriptdir, os.pardir))) +script_dir = os.path.abspath(os.path.dirname(sys.argv[0])) +sys.path.append(script_dir) +sys.path.append(os.path.normpath(os.path.join(script_dir, os.pardir))) # noinspection PyUnresolvedReferences -from log.log import Log, AutosubmitCritical , AutosubmitError +from log.log import Log, AutosubmitCritical, AutosubmitError +from autosubmit import delete_lock_file, exit_from_error from autosubmit.autosubmit import Autosubmit -from typing import Union - -def exit_from_error(e: BaseException): - trace = traceback.format_exc() - try: - Log.debug(trace) - except: - print(trace) - with suppress(FileNotFoundError, PermissionError): - os.remove(os.path.join(Log.file_path, "autosubmit.lock")) - if isinstance(e, (AutosubmitCritical, AutosubmitError)): - e: Union[AutosubmitError, AutosubmitCritical] = e - if e.trace: - Log.debug("Trace: {0}", str(e.trace)) - Log.critical("{1} [eCode={0}]", e.code, e.message) - else: - msg = "A not admitted configuration or error in the code has happened: {0}.\n Please report it to Autosubmit Developers through Git" - args = [str(e)] - Log.critical(msg.format(*args)) - Log.info("More info at https://autosubmit.readthedocs.io/en/master/troubleshooting/error-codes.html") - os._exit(1) - # noinspection PyProtectedMember def main(): try: return_value = Autosubmit.parse_args() - if os.path.exists(os.path.join(Log.file_path, "autosubmit.lock")): - os.remove(os.path.join(Log.file_path, "autosubmit.lock")) - if type(return_value) is int: - os._exit(return_value) - os._exit(0) - except AutosubmitError as e: - exit_from_error(e) - except AutosubmitCritical as e: - exit_from_error(e) + delete_lock_file() + return_value = return_value if type(return_value) is int else 0 + os._exit(return_value) except BaseException as e: exit_from_error(e) + if __name__ == "__main__": main() diff --git a/test/unit/test_lockfile.py b/test/unit/test_lockfile.py new file mode 100644 index 000000000..6b7f40837 --- /dev/null +++ b/test/unit/test_lockfile.py @@ -0,0 +1,82 @@ +import pytest +from portalocker.exceptions import BaseLockException + +from autosubmit import delete_lock_file, exit_from_error +from log.log import AutosubmitCritical, AutosubmitError + + +def test_delete_lockfile(tmp_path): + fake_lock = tmp_path / 'autosubmit.lock' + fake_lock.touch() + + delete_lock_file(str(tmp_path), 'not-found') + + assert fake_lock.exists() + delete_lock_file(str(tmp_path), fake_lock.name) + assert not fake_lock.exists() + + +def test_log_debug_raises_error(mocker): + """TODO: this probably should never happen?""" + + def _fn(): + raise ValueError + + try: + _fn() + except BaseException as e: + mocker.patch('autosubmit._exit') # mock this to avoid the system from exiting + mocked_log = mocker.patch('autosubmit.Log') + mocked_print = mocker.patch('autosubmit.print') + + mocked_log.debug.side_effect = BaseException() + + exit_from_error(e) + + assert mocked_print.called + + +_TEST_EXCEPTION = AutosubmitCritical() +_TEST_EXCEPTION.trace = 'a trace' + + +@pytest.mark.parametrize( + 'exception,debug_calls,critical_calls,delete_called', + [ + (ValueError, 1, 1, True), + (BaseLockException, 1, 0, False), + (AutosubmitCritical, 1, 1, True), + (_TEST_EXCEPTION, 2, 1, True), + (AutosubmitError, 1, 1, True) + ], + ids=[ + 'normal_exception', + 'portalocker_exception', + 'autosubmit_critical', + 'autosubmit_critical_with_trace', + 'autosubmit_error' + ] +) +def test_exit_from_error( + mocker, + tmp_path, + exception: BaseException, + debug_calls: int, + critical_calls: int, + delete_called: bool +): + def _fn(): + raise exception + + try: + _fn() + except BaseException as e: + mocker.patch('autosubmit._exit') # mock this to avoid the system from exiting + mocked_log = mocker.patch('autosubmit.Log') + mocked_delete = mocker.patch('autosubmit.delete_lock_file') + + exit_from_error(e) + + assert mocked_log.debug.call_count == debug_calls + assert mocked_log.critical.call_count == critical_calls + assert mocked_delete.called == delete_called -- GitLab From 0c98a70a1381d2ddd4f08d9b9931eb687a83cda4 Mon Sep 17 00:00:00 2001 From: "Bruno P. Kinoshita" Date: Mon, 11 Nov 2024 12:52:52 +0100 Subject: [PATCH 2/2] GL-945: Log warnings by default, other a command raises a warning and the user will only ever know if s/he looks at the file on disk, or enable console logging. --- autosubmit/autosubmit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 0b1dda208..f70000216 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -191,7 +191,7 @@ class Autosubmit: default='DEBUG', type=str, help="sets file's log level.") parser.add_argument('-lc', '--logconsole', choices=('NO_LOG', 'INFO', 'WARNING', 'DEBUG'), - default='INFO', type=str, + default='WARNING', type=str, help="sets console's log level") subparsers = parser.add_subparsers(dest='command') -- GitLab