From 04f66c88787abaa854df818cfcbdec8fd358345a Mon Sep 17 00:00:00 2001 From: "Bruno P. Kinoshita" Date: Thu, 16 Feb 2023 22:06:44 +0100 Subject: [PATCH] [rocrate] Add RO-Crate support to Autosubmit. This commit includes work from several other commits, squashed. It started around February 2023, and by July 2023 it was validated by the RO-Crate community, thanks especially to Simone Leo. Unit tests and documentation were added as well. It add support to the following three RO-Crate profiles in Autosubmit: - Process Run Crate - Workflow Run Crate - Workflow RO-Crate profile 1.0 This is available through the Autosubmit commands archive and unarchive. --- autosubmit/autosubmit.py | 163 +++++- autosubmit/provenance/__init__.py | 15 + autosubmit/provenance/rocrate.py | 562 ++++++++++++++++++ docs/source/_static/css/autosubmit.css | 5 + docs/source/conf.py | 4 +- docs/source/ext/runcmd.py | 206 +++++++ docs/source/index.rst | 6 +- docs/source/introduction/index.rst | 23 +- docs/source/userguide/manage/index.rst | 58 +- docs/source/userguide/provenance.rst | 66 +++ requeriments.txt | 1 + setup.py | 2 +- test/unit/helpers/__init__.py | 0 test/unit/provenance/__init__.py | 0 test/unit/provenance/test_rocrate.py | 758 +++++++++++++++++++++++++ 15 files changed, 1786 insertions(+), 83 deletions(-) create mode 100644 autosubmit/provenance/__init__.py create mode 100644 autosubmit/provenance/rocrate.py create mode 100644 docs/source/ext/runcmd.py create mode 100644 docs/source/userguide/provenance.rst create mode 100644 test/unit/helpers/__init__.py create mode 100644 test/unit/provenance/__init__.py create mode 100644 test/unit/provenance/test_rocrate.py diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 1796337f0..fa565a28e 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -75,7 +75,7 @@ import signal import datetime import log.fd_show as fd_show import portalocker -from pkg_resources import require, resource_listdir, resource_string, 
resource_filename +from pkg_resources import require, resource_listdir, resource_exists, resource_string, resource_filename from collections import defaultdict from pyparsing import nestedExpr from .history.experiment_status import ExperimentStatus @@ -602,6 +602,8 @@ class Autosubmit: help='Only does a container without compress') subparser.add_argument('-v', '--update_version', action='store_true', default=False, help='Update experiment version') + subparser.add_argument('--rocrate', action='store_true', default=False, + help='Produce an RO-Crate file') # Unarchive subparser = subparsers.add_parser( 'unarchive', description='unarchives an experiment') @@ -612,6 +614,8 @@ class Autosubmit: help='Untar an uncompressed tar') subparser.add_argument('-v', '--update_version', action='store_true', default=False, help='Update experiment version') + subparser.add_argument('--rocrate', action='store_true', default=False, + help='Unarchive an RO-Crate file') # update proj files subparser = subparsers.add_parser('upgrade', description='Updates autosubmit 3 proj files to autosubmit 4') subparser.add_argument('expid', help='experiment identifier') @@ -715,9 +719,9 @@ class Autosubmit: elif args.command == 'upgrade': return Autosubmit.upgrade_scripts(args.expid,files=args.files) elif args.command == 'archive': - return Autosubmit.archive(args.expid, noclean=args.noclean, uncompress=args.uncompress) + return Autosubmit.archive(args.expid, noclean=args.noclean, uncompress=args.uncompress, rocrate=args.rocrate) elif args.command == 'unarchive': - return Autosubmit.unarchive(args.expid, uncompressed=args.uncompressed) + return Autosubmit.unarchive(args.expid, uncompressed=args.uncompressed, rocrate=args.rocrate) elif args.command == 'readme': if os.path.isfile(Autosubmit.readme_path): @@ -4316,7 +4320,91 @@ class Autosubmit: Log.critical(str(exp)) @staticmethod - def archive(expid, noclean=True, uncompress=True): + def rocrate(expid, path: Path): + """ + Produces an RO-Crate 
archive for an Autosubmit experiment. + + :param expid: experiment ID + :type expid: str + :param path: path to save the RO-Crate in + :type path: Path + :return: ``True`` if successful, ``False`` otherwise + :rtype: bool + """ + from autosubmit.statistics.statistics import Statistics + from textwrap import dedent + + as_conf = AutosubmitConfig(expid) + # ``.reload`` will call the function to unify the YAML configuration. + as_conf.reload(True) + + workflow_configuration = as_conf.experiment_data + + # Load the rocrate prepopulated file, or raise an error and write the template. + # Similar to what COMPSs does. + # See: https://github.com/bsc-wdc/compss/blob/9e79542eef60afa9e288e7246e697bd7ac42db08/compss/runtime/scripts/system/provenance/generate_COMPSs_RO-Crate.py + rocrate_json = workflow_configuration.get('ROCRATE', None) + if not rocrate_json: + Log.error(dedent('''\ + No ROCRATE configuration value provided! Use it to create your + JSON-LD schema, using @id, @type, and other schema.org attributes, + and it will be merged with the values retrieved from the workflow + configuration. Some values are not present in Autosubmit, such as + license, so you must provide it if you want to include in your + RO-Crate data, e.g. create a file $expid/conf/rocrate.yml (or use + an existing one) with a top level ROCRATE key, containing your + JSON-LD data: + + ROCRATE: + INPUTS: + # Add the extra keys to be exported. + - "MHM" + OUTPUTS: + # Relative to the Autosubmit project folder. + - "*/*.gif" + PATCH: | + { + "@graph": [ + { + "@id": "./", + "license": "Apache-2.0", + "creator": { + "@id": "https://orcid.org/0000-0001-8250-4074" + } + }, + { + "@id": "https://orcid.org/0000-0001-8250-4074", + "@type": "Person", + "affiliation": { + "@id": "https://ror.org/05sd8tv96" + } + }, + ... 
+ ] + } + ''').replace('{', '{{').replace('}', '}}')) + raise AutosubmitCritical("You must provide an ROCRATE configuration key when using RO-Crate...", 7014) + + # Read job list (from pickles) to retrieve start and end time. + # Code adapted from ``autosubmit stats``. + job_list = Autosubmit.load_job_list(expid, as_conf, notransitive=False) + jobs = job_list.get_job_list() + exp_stats = Statistics(jobs=jobs, start=None, end=None, queue_time_fix={}) + exp_stats.calculate_statistics() + start_time = None + end_time = None + # N.B.: ``exp_stats.jobs_stat`` is sorted in reverse order. + number_of_jobs = len(exp_stats.jobs_stat) + if number_of_jobs > 0: + start_time = exp_stats.jobs_stat[-1].start_time.replace(microsecond=0).isoformat() + if number_of_jobs > 1: + end_time = exp_stats.jobs_stat[0].finish_time.replace(microsecond=0).isoformat() + + from autosubmit.provenance.rocrate import create_rocrate_archive + return create_rocrate_archive(as_conf, rocrate_json, jobs, start_time, end_time, path) + + @staticmethod + def archive(expid, noclean=True, uncompress=True, rocrate=False): """ Archives an experiment: call clean (if experiment is of version 3 or later), compress folder to tar.gz and moves to year's folder @@ -4327,9 +4415,10 @@ class Autosubmit: :type noclean: bool :param uncompress: flag telling it whether to decompress or not. :type uncompress: bool + :param rocrate: flag to enable RO-Crate + :type rocrate: bool :return: ``True`` if the experiment has been successfully archived. ``False`` otherwise. :rtype: bool - """ exp_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid) @@ -4356,29 +4445,36 @@ class Autosubmit: if year is None: year = time.localtime(os.path.getmtime(exp_folder)).tm_year - Log.info("Archiving in year {0}", year) - - # Creating tar file - Log.info("Creating tar file ... 
") try: year_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, str(year)) if not os.path.exists(year_path): os.mkdir(year_path) os.chmod(year_path, 0o775) - if not uncompress: - compress_type = "w:gz" - output_filepath = '{0}.tar.gz'.format(expid) - else: - compress_type = "w" - output_filepath = '{0}.tar'.format(expid) - with tarfile.open(os.path.join(year_path, output_filepath), compress_type) as tar: - tar.add(exp_folder, arcname='') - tar.close() - os.chmod(os.path.join(year_path, output_filepath), 0o775) except Exception as e: - raise AutosubmitCritical("Can not write tar file", 7012, str(e)) + raise AutosubmitCritical(f"Failed to create year-directory {str(year)} for experiment {expid}", 7012, str(e)) + Log.info(f"Archiving in year {str(year)}") - Log.info("Tar file created!") + if rocrate: + Autosubmit.rocrate(expid, Path(year_path)) + Log.info('RO-Crate ZIP file created!') + else: + # Creating tar file + Log.info("Creating tar file ... ") + try: + if not uncompress: + compress_type = "w:gz" + output_filepath = '{0}.tar.gz'.format(expid) + else: + compress_type = "w" + output_filepath = '{0}.tar'.format(expid) + with tarfile.open(os.path.join(year_path, output_filepath), compress_type) as tar: + tar.add(exp_folder, arcname='') + tar.close() + os.chmod(os.path.join(year_path, output_filepath), 0o775) + except Exception as e: + raise AutosubmitCritical("Can not write tar file", 7012, str(e)) + + Log.info("Tar file created!") try: shutil.rmtree(exp_folder) @@ -4394,7 +4490,7 @@ class Autosubmit: Log.warning("Experiment folder renamed to: {0}".format( exp_folder + "_to_delete ")) except Exception as e: - Autosubmit.unarchive(expid, uncompressed=False) + Autosubmit.unarchive(expid, uncompressed=False, rocrate=rocrate) raise AutosubmitCritical( "Can not remove or rename experiments folder", 7012, str(e)) @@ -4402,7 +4498,7 @@ class Autosubmit: return True @staticmethod - def unarchive(experiment_id, uncompressed=True): + def unarchive(experiment_id, 
uncompressed=True, rocrate=False): """ Unarchives an experiment: uncompress folder from tar.gz and moves to experiment root folder @@ -4410,14 +4506,18 @@ class Autosubmit: :type experiment_id: str :param uncompressed: if True, the tar file is uncompressed :type uncompressed: bool - + :param rocrate: flag to enable RO-Crate + :type rocrate: bool """ exp_folder = os.path.join(BasicConfig.LOCAL_ROOT_DIR, experiment_id) # Searching by year. We will store it on database year = datetime.datetime.today().year archive_path = None - if not uncompressed: + if rocrate: + compress_type = None + output_pathfile = f'{experiment_id}.zip' + elif not uncompressed: compress_type = "r:gz" output_pathfile = '{0}.tar.gz'.format(experiment_id) else: @@ -4440,12 +4540,17 @@ class Autosubmit: if not os.path.isdir(exp_folder): os.mkdir(exp_folder) try: - with tarfile.open(os.path.join(archive_path), compress_type) as tar: - tar.extractall(exp_folder) - tar.close() + if rocrate: + import zipfile + with zipfile.ZipFile(archive_path, 'r') as zip: + zip.extractall(exp_folder) + else: + with tarfile.open(os.path.join(archive_path), compress_type) as tar: + tar.extractall(exp_folder) + tar.close() except Exception as e: shutil.rmtree(exp_folder, ignore_errors=True) - Log.printlog("Can not extract tar file: {0}".format(str(e)), 6012) + Log.printlog("Can not extract file: {0}".format(str(e)), 6012) return False Log.info("Unpacking finished") diff --git a/autosubmit/provenance/__init__.py b/autosubmit/provenance/__init__.py new file mode 100644 index 000000000..9113b0954 --- /dev/null +++ b/autosubmit/provenance/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2015-2023 Earth Sciences Department, BSC-CNS +# +# This file is part of Autosubmit. +# +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+ +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +"""Code for workflow and data provenance.""" diff --git a/autosubmit/provenance/rocrate.py b/autosubmit/provenance/rocrate.py new file mode 100644 index 000000000..de77b3e5b --- /dev/null +++ b/autosubmit/provenance/rocrate.py @@ -0,0 +1,562 @@ +# Copyright 2015-2023 Earth Sciences Department, BSC-CNS +# +# This file is part of Autosubmit. +# +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +"""RO-Crate is a human and machine-readable format, widely used in the +workflow community with a wide variety of tools and use cases, built +focused on reproducibility. 
+ +For more about RO-Crate: https://www.researchobject.org/ro-crate/ +""" + +import datetime +import json +import mimetypes +import os +import subprocess +from pathlib import Path +from textwrap import dedent +from typing import List, Tuple, Union, Dict, Any + +from rocrate.model.contextentity import ContextEntity +from rocrate.rocrate import ROCrate, File +from rocrate.utils import iso_now + +from autosubmit.database.db_common import get_autosubmit_version +from autosubmit.database.db_common import get_experiment_descrip +from autosubmit.job.job import Job +from autosubmit.job.job_common import Status +from autosubmitconfigparser.config.basicconfig import BasicConfig +from autosubmitconfigparser.config.configcommon import AutosubmitConfig +from log.log import Log, AutosubmitCritical + +"""List of profiles used in our RO-Crate implementation, plus the one used +as graph context.""" +PROFILES = [ + { + "@id": "https://w3id.org/ro/wfrun/process/0.1", + "@type": "CreativeWork", + "name": "Process Run Crate", + "version": "0.1" + }, + { + "@id": "https://w3id.org/ro/wfrun/workflow/0.1", + "@type": "CreativeWork", + "name": "Workflow Run Crate", + "version": "0.1" + }, + { + "@id": "https://w3id.org/workflowhub/workflow-ro-crate/1.0", + "@type": "CreativeWork", + "name": "Workflow RO-Crate", + "version": "1.0" + } +] + +# TODO: This could be a useful feature in ro-crate-py? Given a Python type, +# give me the equivalent type in RO-Crate/JSON-LD. +# Some parameters in Autosubmit will contain dictionaries (like CUSTOM_CONFIG.PRE). +# We need to convert those to string in order to serialize into JSON-LD. +PARAMETER_TYPES_MAP = { + 'str': 'Text', + 'int': 'Integer', + 'float': 'Float', + 'bool': 'Boolean', + 'dict': str, + 'list': str +} + +# These are the default keys exported as FormalParameters automatically. +# Others are added depending on the workflow configuration, and what the +# user has requested to export. 
+DEFAULT_EXPORTED_KEYS = [ + 'DEFAULT', + 'EXPERIMENT', + 'CONFIG', + 'PROJECT' +] + + +def _add_dir_and_files(crate: ROCrate, base_path: str, relative_path: str, encoding_format: str = None) -> None: + """Add a directory and its files into the RO-Crate. + + :param crate: the RO-Crate instance. + :param base_path: the base path for the files being added. + :param relative_path: the relative path (to the ``base_path``). + :param encoding_format: the encoding format (if any). + """ + folder = Path(base_path, relative_path) + for root, dirs, files in os.walk(folder, topdown=True): + for file in files: + file_path = Path(root, file) + _add_file(crate, base_path, file_path, encoding_format) + crate.add_dataset( + source=folder, + dest_path=folder.relative_to(base_path) + ) + + +def _add_file(crate: ROCrate, base_path: Union[str, None], file_path: Path, encoding_format: str = None, use_uri: bool = False, **args: Any) -> Any: + """Add a file into the RO-Crate. + + :param crate: the RO-Crate instance. + :param base_path: the base path for the files being added. Optional. + :param file_path: the path for the file being added. + :param encoding_format: the encoding format (if any). + :param use_uri: whether to use the Path as a URI or as a source directly. Defaults to ``False``. + :return: the object returned by ro-crate-py + :rtype: Any + """ + properties = { + "name": file_path.name, + "sdDatePublished": iso_now(), + "dateModified": datetime.datetime.utcfromtimestamp(file_path.stat().st_mtime).replace( + microsecond=0).isoformat(), + "contentSize": file_path.stat().st_size, + **args + } + encoding_format = encoding_format if encoding_format is not None else mimetypes.guess_type(file_path)[0] + if encoding_format is not None: + # N.B.: We must not write ``None``'s or other missing or empty values + # to the encoding format if none found. 
+ properties['encodingFormat'] = encoding_format + + source = file_path if not use_uri else file_path.as_uri() + + dest_path = None + if base_path: + dest_path = file_path.relative_to(base_path) + file = File(crate=crate, + source=source, + dest_path=dest_path, + fetch_remote=False, + validate_url=False, + properties=properties) + # This is to prevent ``metadata/experiment_data.yml`` to be added twice. + # Once as the workflow main file, and twice when scanning the experiment + # ``conf`` folder for YAML files. + # See: https://github.com/ResearchObject/ro-crate-py/issues/165 + if file.id not in [x['@id'] for x in crate.data_entities]: + return crate.add_file( + source=source, + dest_path=dest_path, + fetch_remote=False, + validate_url=False, + properties=properties + ) + return None + + +def _get_action_status(jobs: List[Job]) -> str: + """Get the status of the workflow action. + + :param jobs: list of jobs, used to infer the current workflow/action status. + :type jobs: List[str] + :return: a valid RO-Crate and Schema.org action status. + :rtype: str + """ + if not jobs: + return 'PotentialActionStatus' + if all([job.status == Status.COMPLETED for job in jobs]): + return 'CompletedActionStatus' + failed_statuses = [ + Status.FAILED + ] + if any([job.status in failed_statuses for job in jobs]): + return 'FailedActionStatus' + return 'PotentialActionStatus' + + +def _get_git_branch_and_commit(project_path: str) -> Tuple[str, str]: + """FIXME: workaround for: https://earth.bsc.es/gitlab/ces/autosubmit4-config-parser/-/merge_requests/2/diffs. + + :param project_path: the complete path for the Git project path. 
+ :type project_path: str + :return: a tuple where the first element is the branch, and the second the commit hash + :rtype: Tuple[str, str] + """ + try: + output = subprocess.check_output( + "cd {0}; git rev-parse --abbrev-ref HEAD".format(project_path), + shell=True, text=True) + except subprocess.CalledProcessError as e: + raise AutosubmitCritical("Failed to retrieve project branch...", 7014, str(e)) + + project_branch = output.strip() + Log.debug("Project branch is: " + project_branch) + try: + output = subprocess.check_output("cd {0}; git rev-parse HEAD".format(project_path), shell=True, text=True) + except subprocess.CalledProcessError as e: + raise AutosubmitCritical("Failed to retrieve project commit SHA...", 7014, str(e)) + project_sha = output.strip() + Log.debug("Project commit SHA is: " + project_sha) + return project_branch, project_sha + + +# Add Autosubmit Project to the RO-Crate. +def _get_project_entity(as_configuration: AutosubmitConfig, crate: ROCrate) -> Union[ContextEntity, None]: + """Return a ``SoftwareSourceCode``, a specialized object from + ``CreativeEntity`` that contains a ``codeRepository`` property + that points to the location of files used by the Autosubmit + workflow. Ref: https://schema.org/SoftwareSourceCode + + :param as_configuration: Autosubmit configuration object + :type as_configuration: AutosubmitConfig + :param crate: RO-Crate object + :type crate: ROCrate + :return: an entity that can be added into the RO-Crate. 
+ :rtype: Union[ContextEntity, None] + """ + project = as_configuration.experiment_data['PROJECT'] + project_type = project['PROJECT_TYPE'].upper() + project_values = as_configuration.experiment_data.get(project_type, {}) + project_path = as_configuration.get_project_dir() + + project_url = None + project_version = None # version is the commit/revision/etc., as per schema.org + if project_type == 'NONE': + project_url = '' + project_version = '' + elif project_type == 'SUBVERSION': + # TODO: Maybe AutosubmitConfig needs a function to persist the subversion revision? + raise AutosubmitCritical('Only Git and local projects are supported for RO-Crate.', 7014) + elif project_type == 'GIT': + project_url = project_values['PROJECT_ORIGIN'] + # TBD: Maybe the branch should be archived in the RO-Crate somehow too? + _, project_version = _get_git_branch_and_commit(project_path) + elif project_type == 'LOCAL': + project_url = f'file://{project_values["PROJECT_PATH"]}' + project_version = '' + else: + raise AutosubmitCritical(f'Project type {project_type} is not supported for RO-Crate.', 7014) + + parameter_value = { + '@id': project_url, + '@type': 'SoftwareSourceCode', + 'name': project_url, + 'sdDatePublished': iso_now(), + 'codeRepository': project_url, + 'version': project_version, + 'programmingLanguage': 'Any', + 'codeSampleType': 'template', + 'targetProduct': 'Autosubmit', + 'runtimePlatform': f'Autosubmit {as_configuration.get_version()}', + 'abstract': dedent('''\ +The Autosubmit project. It contains the templates used +by Autosubmit for the scripts used in the workflow, as well as any other +source code used by the scripts (i.e. any files sourced, or other source +code compiled or executed in the workflow).''') + } + + return ContextEntity(crate, properties=parameter_value) + + +def _create_formal_parameter(crate, parameter_name, name=None, **kwargs) -> Any: + """Create a ``FormalParameter``. 
+ + The ID's of ``FormalParameter``s must start with `#` since these + are "internal" contextual entities. + """ + properties = { + '@id': f'#{parameter_name}-param', + '@type': 'FormalParameter', + 'name': name or parameter_name, + **kwargs + } + return crate.add(ContextEntity(crate, properties=properties)) + + +def _create_parameter(crate, parameter_name, parameter_value, formal_parameter, type='PropertyValue', **kwargs) -> Any: + properties = { + '@id': f'#{parameter_name}-pv', + '@type': type, + 'exampleOfWork': { + '@id': formal_parameter['@id'] + }, + 'name': parameter_name, + 'value': parameter_value, + **kwargs + } + return crate.add(ContextEntity(crate, properties=properties)) + + +def create_rocrate_archive( + as_conf: AutosubmitConfig, + rocrate_json: Dict[str, Any], + jobs: List[Job], + start_time: Union[str, None], + end_time: Union[str, None], + path: Path) -> ROCrate: + """Create an RO-Crate archive using the ro-crate-py library. + + It uses the Autosubmit configuration for the prospective provenance, and also + to locate the directories with perspective provenance. 
+ + :param as_conf: Autosubmit configuration + :type as_conf: AutosubmitConfig + :param rocrate_json: RO-Crate JSON patch provided by the user + :type rocrate_json: Dict[str, Any] + :param jobs: List of Autosubmit jobs + :type jobs: List[Job] + :param start_time: Workflow run start time + :type start_time: Union[str, None] + :param end_time: Workflow run end time + :type end_time: Union[str, None] + :param path: path to save the RO-Crate in + :type path: Path + :return: ``True`` is the archive was created successful, ``False`` otherwise + :rtype: object()bool + """ + workflow_configuration = as_conf.experiment_data + expid = workflow_configuration['DEFAULT']['EXPID'] + as_version = get_autosubmit_version(expid) + experiment_path = os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid) + unified_yaml_configuration = Path(experiment_path, "conf/metadata/experiment_data.yml") + + root_profiles = [ + {"@id": profile["@id"]} for profile in PROFILES + ] + rocrate_metadata_json_profiles = [ + # Graph context. + { + "@id": "https://w3id.org/ro/crate/1.1" + }, + { + "@id": "https://w3id.org/workflowhub/workflow-ro-crate/1.0" + } + ] + + mimetypes.init() + + crate = ROCrate() + crate.root_dataset.properties().update({ + 'conformsTo': root_profiles + }) + for profile in PROFILES: + crate.add(ContextEntity(crate, properties=profile)) + + Log.info('Creating RO-Crate archive...') + + # Create workflow configuration (prospective provenance) + main_entity = crate.add_workflow( + source=unified_yaml_configuration, + dest_path=unified_yaml_configuration.relative_to(experiment_path), + main=True, + lang="Autosubmit", + lang_version=as_version, + gen_cwl=False + ) + crate.metadata.properties().update({ + 'conformsTo': rocrate_metadata_json_profiles + }) + + # Fetch the experiment description from the main database + crate.description = get_experiment_descrip(expid)[0][0] + + # Add files generated after its execution (retrospective provenance) + + # Add original YAML configuration. 
+ _add_dir_and_files(crate, experiment_path, "conf") + # Some external files could have been loaded too. That's why we use the + # ``as_conf.current_loaded_files`` dictionary instead (name: mtime). + experiment_configuration_path = Path(experiment_path, "conf") + for config_entry in as_conf.current_loaded_files.keys(): + config_entry_path = Path(config_entry) + # We do not want to add the entries under /conf/ again. + if experiment_configuration_path in config_entry_path.parents: + continue + + # Everything else is added as absolute URI, as it might be + # a file like ``/etc/fstab``, or a private configuration from + # the project. + if config_entry_path.is_dir(): + crate.add_dataset(source=config_entry_path.as_uri()) + else: + _add_file(crate, None, config_entry_path, encoding_format=None, use_uri=True) + # Add log files. + _add_dir_and_files(crate, experiment_path, BasicConfig.LOCAL_TMP_DIR, "text/plain") + # Add plots files. + _add_dir_and_files(crate, experiment_path, "plot") + # Add status files. + _add_dir_and_files(crate, experiment_path, "status") + # Add SQLite DB and pickle files. + _add_dir_and_files(crate, experiment_path, "pkl", "application/binary") + + # Register Workflow Run RO-Crate (WRROC) profile. This code was adapted from COMPSs and StreamFlow. + # + # See: https://gitlab.bsc.es/wdc/compss/framework/-/blob/9cc5a8a5ba76457cf9b71d698bb77b8fa0aa0c9c/compss/runtime/scripts/system/provenance/generate_COMPSs_RO-Crate.py + # https://github.com/alpha-unito/streamflow/blob/c04089b0c16d74f50c4380c8648f271dfd702b9d/streamflow/provenance/run_crate.py + # https://www.researchobject.org/workflow-run-crate/ + # https://about.workflowhub.eu/Workflow-RO-Crate/ + # NOTE: A ``CreateAction`` can have an agent, pointing to the author + # of the RO-Crate or to another user. However, since we do not + # store that information in Autosubmit. Users wanting to use it + # have to add the ``PATCH`` to have an agent with the right + # ``@id``. 
+ create_action_properties = { + "@type": "CreateAction", + "actionStatus": {"@id": f"http://schema.org/{_get_action_status(jobs)}"}, + "description": crate.description + } + if start_time is not None: + create_action_properties['startTime'] = start_time + if end_time is not None: + create_action_properties['endTime'] = end_time + create_action = crate.add( + ContextEntity(crate, '#create-action', create_action_properties) + ) + crate.root_dataset.properties().update({ + 'mentions': {'@id': create_action.id} + }) + + # Here we add the Autosubmit project as ``SoftwareCode``, and as part (``isPartOf``) + # of the RO-Crate main ``SoftwareCode`` entity. + try: + project_entity = _get_project_entity(as_conf, crate) + crate.add(project_entity) + main_entity.append_to('hasPart', {'@id': project_entity['@id']}) + except ValueError as e: + raise AutosubmitCritical("Failed to read the Autosubmit Project for RO-Crate...", 7014, str(e)) + + # inputs and outputs + # FIXME: Blocked by: https://earth.bsc.es/gitlab/es/autosubmit/-/issues/1045 + # TODO: Need to add input and output to ``main_entity``. + # "input": [ { "@id": "#id-param" }, {}, ... ] + # Oh, and "output" in the same way. + # Each input and output has the following format: + # { "@id": "#id-param", "@type": "FormalParameter", "additionalType": "File", + # "name": "input_file", "valueRequired": True } + # (note, outputs won't have valueRequired). 
+ # The actual value of the FormalParameter goes into another entity: + # { "@id": "#id-pv", "@type": "PropertyValue", "exampleOfWork": {"@id": "id-param"}, + # "name": id", "value": 42 } + # + # How the code will look like once we have fixed the issue linked above: + # + # for item in ins: + # formal_parameter = get_formal_parameter(item, type='in') + # property_value = get_parameter_value(item, parameter=formal_parameter) + # crate.add(formal_parameter) + # crate.add(property_value) + # if formal_parameter['@type'] == 'File': + # create_action.append_to('hasPart', {'@id': property_value.id}) + # create_action.append_to('input', {'@id': formal_parameter.id}) + # for item in outs: + # formal_parameter = get_formal_parameter(item, type='out') + # property_value = get_parameter_value(item, parameter=formal_parameter) + # crate.add(formal_parameter) + # crate.add(property_value) + # if formal_parameter['@type'] == 'File': + # create_action.append_to('hasPart', {'@id': property_value.id}) + # create_action.append_to('output', {'@id': formal_parameter.id}) + + project_type = as_conf.experiment_data['PROJECT']['PROJECT_TYPE'].upper() + exported_keys = DEFAULT_EXPORTED_KEYS.copy() + if project_type == 'LOCAL': + exported_keys.append('LOCAL') + elif project_type == 'GIT': + exported_keys.append('GIT') + # N.B.: Subversion is not supported at the moment. See ``_get_project_entity``. + # elif project_type == 'SUBVERSION': + # exported_keys.append('SUBVERSION') + else: + # Dummy? + pass + + ins = [] + outs = [] + # TODO: Modify when we manage to have dicts/objects in YAML, + # https://earth.bsc.es/gitlab/es/autosubmit/-/issues/1045 + if 'INPUTS' in rocrate_json and rocrate_json['INPUTS']: + ins.extend(rocrate_json['INPUTS']) + if 'OUTPUTS' in rocrate_json and rocrate_json['OUTPUTS']: + outs.extend(rocrate_json['OUTPUTS']) + # Add the extra keys defined by the user in the ``ROCRATE.INPUT``. + if ins: + exported_keys.extend(ins) + + # Inputs. 
+ for exported_key in exported_keys: + for e_k, e_v in workflow_configuration[exported_key].items(): + param_name = '.'.join([exported_key, e_k]) + Log.debug(f'Create input parameter for {param_name} = {str(e_v)}'.replace('{', '{{').replace('}', '}}')) + python_type = type(e_v).__name__ + if python_type not in PARAMETER_TYPES_MAP: + raise AutosubmitCritical( + f"Could not locate a type in RO-Crate for parameter {param_name} type {python_type}", 7014) + # The formal parameters are added to the workflow (main entity). + additional_type = PARAMETER_TYPES_MAP[python_type] + if type(additional_type) != str: + additional_type = PARAMETER_TYPES_MAP[python_type](additional_type) + formal_parameter = _create_formal_parameter( + crate, + param_name, + additionalType=additional_type, + valueRequired='True' + ) + main_entity.append_to('input', {'@id': formal_parameter['@id']}) + # The parameter values are added to the CrateAction. + parameter_value = _create_parameter( + crate, + param_name, + e_v, + formal_parameter, + type='PropertyValue' + ) + + create_action.append_to('object', {'@id': parameter_value['@id']}) + + # Outputs. + project_path = Path(workflow_configuration['ROOTDIR'], 'proj', + workflow_configuration['PROJECT']['PROJECT_DESTINATION']) + # NOTE: Do **NOT** pass ``source=project_path`` or ro-crate-py will copy the whole + # proj folder into the exported RO-Crate (which can have several GB's). + crate.add_dataset( + dest_path=project_path.relative_to(experiment_path) + ) + for output_pattern in outs: + for output_file in project_path.rglob(output_pattern): + Log.debug(f'Create output parameter for {output_file}') + # The formal parameters are added to the workflow (main entity). 
+ formal_parameter = _create_formal_parameter( + crate, + output_file.relative_to(experiment_path), + name=output_file.name, + additionalType='File', + valueRequired='True' + ) + main_entity.append_to('output', {'@id': formal_parameter['@id']}) + # The file, added to the ``CreateAction.result``, and an example + # of the file above. + file_entity = _add_file( + crate, + base_path=experiment_path, + file_path=output_file, + encoding_format=None, + exampleOfWork={'@id': formal_parameter['@id']}) + create_action.append_to('result', {'@id': file_entity['@id']}) + + # Merge with user provided values. + # NOTE: It is important that this call happens after the JSON-LD has + # been constructed by ro-crate-py, as methods like ``add`` will + # replace entries (i.e. if we added before ro-crate-py, then we + # could have our values replaced by newly added values). + if 'PATCH' in rocrate_json and '@graph' in rocrate_json['PATCH']: + patch = json.loads(rocrate_json['PATCH']) + for jsonld_node in patch['@graph']: + crate.add_or_update_jsonld(jsonld_node) + + # Write RO-Crate ZIP. + crate.write_zip(Path(path, f"{expid}.zip")) + Log.info(f'RO-Crate archive written to {experiment_path}') + return crate diff --git a/docs/source/_static/css/autosubmit.css b/docs/source/_static/css/autosubmit.css index fca5cf8ac..ee71e0c21 100644 --- a/docs/source/_static/css/autosubmit.css +++ b/docs/source/_static/css/autosubmit.css @@ -10,3 +10,8 @@ td, th { figure { margin-bottom: 2rem !important; } + +/* For code block caption */ +.code-block-caption { + padding: 0 0 1rem 0 !important; +} diff --git a/docs/source/conf.py b/docs/source/conf.py index 3312c09c2..317a9fe30 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -14,7 +14,6 @@ import sys import os -# import shlex # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. 
If the directory is relative to the @@ -36,8 +35,9 @@ extensions = [ 'sphinx.ext.autosectionlabel', 'sphinx_rtd_theme', 'sphinx_reredirects', + 'sphinx.ext.graphviz', 'autosubmit_variables', - 'sphinx.ext.graphviz' + 'runcmd' ] # Set .svg output fot the graphs generated by GraphViz diff --git a/docs/source/ext/runcmd.py b/docs/source/ext/runcmd.py new file mode 100644 index 000000000..c36da4c6e --- /dev/null +++ b/docs/source/ext/runcmd.py @@ -0,0 +1,206 @@ +import csv +import os +import re +import shlex +import subprocess +import sys + +from pathlib import Path + +from docutils.parsers.rst import directives +from sphinx.directives import code + +# This code is adapted from CWL User Guide, licensed under +# the CC BY 4.0 license, quoting their license: +# +# Attribution---You must give appropriate credit (mentioning +# that your work is derived from work that is Copyright © +# the Common Workflow Language project, and, where practical, +# linking to https://www.commonwl.org/ ),... +# Ref: https://github.com/common-workflow-language/user_guide/blob/8abf537144d7b63c3561c1ff2b660543effd0eb0/LICENSE.md + +"""" +Patched version of https://github.com/sphinx-contrib/sphinxcontrib-runcmd +with default values to avoid having to re-type in every page. Also +prepends commands with a value (``$``), see https://github.com/invenia/sphinxcontrib-runcmd/issues/1. +Finally, it also checks if the command is ``cwltool``, and if then +tries to remove any paths from the command-line (not the logs). +""" + +__version__ = "0.2.0" + +# CONSTANTS +RE_SPLIT = re.compile(r"(?P<pattern>.*)(?<!\\)/(?P<replacement>.*)") + + +# These classes were in the .util module of the original directive. 
+class _Singleton(type): + _instances = {} + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + cls._instances[cls] = super(_Singleton, cls).__call__(*args, **kwargs) + return cls._instances[cls] + + +class Singleton(_Singleton("SingletonMeta", (object,), {})): + pass + + +class CMDCache(Singleton): + cache = {} + exclude_cache_cmd = {hash("cat output.txt")} + + def get(self, cmd, working_directory): + h = hash(cmd) + if h in self.exclude_cache_cmd: + return run_command(cmd, working_directory) + elif h in self.cache: + return self.cache[h] + else: + result = run_command(cmd, working_directory) + self.cache[h] = result + return result + + +def run_command(command, working_directory): + true_cmd = shlex.split(command) + try: + # The subprocess Popen function takes a ``cwd`` argument that + # conveniently changes the working directory to run the command. + # + # We also patched the stderr to redirect to STDOUT, + # so that stderr and stdout appear in order, as you would see in + # a terminal. + # + # Finally, note that ``cwltool`` by default emits ANSI colors in the + # terminal, which are harder to be parsed and/or rendered in Sphinx. + # For that reason, we define --disable-color in the CWLTOOL_OPTIONS + # environment variable, which is used by ``cwltool``. + env = os.environ + # cwl_options = set(env.get('CWLTOOL_OPTIONS', '').split(' ')) + # cwl_options.add('--disable-color') + # env['CWLTOOL_OPTIONS'] = ' '.join(cwl_options) + subp = subprocess.Popen( + true_cmd, + cwd=working_directory, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT + ) + except Exception as e: + out = "" + err = e + else: + out, err = subp.communicate() + encoding = sys.getfilesystemencoding() + out = out.decode(encoding, "replace").rstrip() + # The stderr is now combined with stdout. 
+ # err = err.decode(encoding, "replace").rstrip() + + if err and err != "": + print("Error in runcmd: {}".format(err)) + out = "{}\n{}".format(out, err) + + return out + + +class RunCmdDirective(code.CodeBlock): + has_content = False + final_argument_whitespace = False + required_arguments = 1 + optional_arguments = 99 + + option_spec = { + # code.CodeBlock option_spec + "linenos": directives.flag, + "dedent": int, + "lineno-start": int, + "emphasize-lines": directives.unchanged_required, + "caption": directives.unchanged_required, + "class": directives.class_option, + "name": directives.unchanged, + # RunCmdDirective option_spec + "syntax": directives.unchanged, + "replace": directives.unchanged, + "prompt": directives.flag, + "dedent-output": int, + "working-directory": directives.unchanged + } + + def run(self): + # Grab a cache singleton instance + cache = CMDCache() + + # The examples in our User Guide are stored in ``src/_includes/cwl``. + # For convenience, instead of including that in every command, we + # allow the directive to receive a working directory, so that we + # change to that working directory before running the desired command. + # The working directory is omitted from the final output. + working_directory = self.options.get('working-directory', 'source/') + if working_directory == '': + # subprocess default value, so that we can disable it if needed. + working_directory = None + else: + # You can run Sphinx from the root directory, with `make watch` + # for instance, or from the src directory (RTD does that). 
+ working_directory_path = Path(working_directory) + if not working_directory_path.exists() and str(working_directory_path).startswith('src/'): + working_directory = Path(working_directory[4:]) + + # Get the command output + command = " ".join(self.arguments) + output = cache.get(command, working_directory) + + # Grab our custom commands + syntax = self.options.get("syntax", "bash") + replace = self.options.get("replace", '') + reader = csv.reader([replace], delimiter=",", escapechar="\\") + # prompt = "prompt" in self.options + # We patched this so that the prompt is displayed by default, similar + # to how ``{code-block} console`` works. + prompt = True + dedent_output = self.options.get("dedent-output", 0) + + # Dedent the output if required + if dedent_output > 0: + output = "\n".join([x[dedent_output:] for x in output.split("\n")]) + + # Add the prompt to our output if required + if prompt: + output = "$ {}\n{}".format(command, output) + + # Do our "replace" syntax on the command output + for items in reader: + for regex in items: + if regex != "": + match = RE_SPLIT.match(regex) + p = match.group("pattern") + # Let's unescape the escape chars here as we don't need them to be + # escaped in the replacement at this point + r = match.group("replacement").replace("\\", "") + output = re.sub(p, r, output) + + # Note: Sphinx's CodeBlock directive expects an array of command-line + # output lines: https://github.com/sphinx-doc/sphinx/blob/c51a88da8b7b40e8d8cbdb1fce85ca2346b2b59a/sphinx/directives/code.py#L114 + # But the runcmd original code was simply wrapping a string + # containing \n in the text as a one-element array, e.g. + # ["cwltool --debug ...\ncwltool Version..."]. + # That caused the output to be correctly rendered, but the + # emphasize-lines directive parameter to fail if the lines were + # anything greater than 0 (as the self.content array had 1 elem). 
+ # See: https://github.com/common-workflow-language/user_guide/issues/269 + output = output.split("\n") + + # Set up our arguments to run the CodeBlock parent run function + self.arguments[0] = syntax + self.content = output + node = super(RunCmdDirective, self).run() + + return node + + +def setup(app): + app.add_directive("runcmd", RunCmdDirective) + + return {"version": __version__} diff --git a/docs/source/index.rst b/docs/source/index.rst index 06a425cfd..c26993703 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -44,6 +44,7 @@ Welcome to autosubmit's documentation! /userguide/set_and_share_the_configuration/index /userguide/variables /userguide/expids + /userguide/provenance .. toctree:: :caption: Database Documentation @@ -81,8 +82,9 @@ Autosubmit is a Python software to manage complicated workflows on HPC platforms Automatization Autosubmit manages job submission and dependencies without user intervention Data Provenance. - Autosubmit keeps tracks of data generated by each experiment by assigning to them - unique ids. + Autosubmit assigns unique IDs to experiments, uses open standards, and + applies other techniques to enable :doc:`data provenance </userguide/provenance>` + in the experiments and workflows. Failure Tolerance Autosubmit manages automatic retrials and has the ability to rerun specific parts of the experiment in case of failure diff --git a/docs/source/introduction/index.rst b/docs/source/introduction/index.rst index 3b0fd62de..63c46ff5b 100644 --- a/docs/source/introduction/index.rst +++ b/docs/source/introduction/index.rst @@ -5,8 +5,6 @@ Introduction What is Autosubmit ? ==================== - - Autosubmit is a lightweight workflow manager designed to meet climate research necessities. Unlike other workflow solutions in the domain, it integrates the capabilities of an experiment manager, workflow orchestrator and monitor in a self-contained application. 
The experiment manager allows for defining and configuring experiments, supported by a hierarchical database that ensures reproducibility and traceability. The orchestrator is designed to run complex workflows in research and operational mode by managing their dependencies and interfacing with local and remote hosts. These multi-scale workflows can involve from a few to thousands of steps and from one to multiple platforms. Autosubmit facilitates easy and fast integration and relocation on new platforms. On the one hand, users can rapidly execute general scripts and progressively parametrize them by reading Autosubmit variables. On the other hand, it is a self-contained desktop application capable of submitting jobs to remote platforms without any external deployment. @@ -32,13 +30,20 @@ Why is Autosubmit needed ? Autosubmit is the only existing tool that satisfies the following requirements from the weather and climate community: -- **Automatization** Job submission to machines and dependencies between jobs are managed by Autosubmit. No user intervention is needed. -- **Data provenance** Assigns unique identifiers for each experiment and stores information about model version, experiment configuration and computing facilities used in the whole process. -- **Failure tolerance** Automatic retrials and ability to rerun chunks in case of corrupted or missing data. -- **Resource management** Autosubmit manages supercomputer particularities, allowing users to run their experiments in the available machine without having to adapt the code. Autosubmit also allows to submit tasks from the same experiment to different platforms. - - - +- **Automatization** Job submission to machines and dependencies between + jobs are managed by Autosubmit. No user intervention is needed. +- **Data provenance** Assigns unique identifiers for each experiment + and stores information about model version, experiment configuration + and computing facilities used in the whole process. 
Read more in + the user guide section about :doc:`/userguide/provenance`. +- **Failure tolerance** Automatic retrials and ability to rerun chunks + in case of corrupted or missing data. +- **Resource management** Autosubmit manages supercomputer particularities, + allowing users to run their experiments in the available machine without + having to adapt the code. Autosubmit also allows users to submit tasks from + the same experiment to different platforms. + +.. _RO-Crate: https://w3id.org/ro/crate How does Autosubmit work ? ========================== diff --git a/docs/source/userguide/manage/index.rst b/docs/source/userguide/manage/index.rst index 56144d479..b168399d4 100644 --- a/docs/source/userguide/manage/index.rst +++ b/docs/source/userguide/manage/index.rst @@ -1,6 +1,8 @@ Manage Experiments =================== +.. _clean: + How to clean the experiment --------------------------- @@ -51,62 +53,38 @@ A bare copy (which occupies less space on disk) will be automatically made. How to archive an experiment ---------------------------- -To archive the experiment, use the command: -:: - - autosubmit archive EXPID +When you archive an experiment in Autosubmit, it automatically :ref:`cleans <clean>` +the experiment as well. This means the experiment will not be available for +use, unless it is unarchived. -*EXPID* is the experiment identifier. - -.. warning:: this command calls implicitly the clean command. Check clean command documentation. - -.. warning:: experiment will be unusable after archiving. If you want to use it, you will need to call first the - unarchive command +.. code-block:: + autosubmit archive <EXPID> Options: -:: - - usage: autosubmit archive [-h] expid - - expid experiment identifier - - -h, --help show this help message and exit - - -Example: -:: - autosubmit archive cxxx +.. runcmd:: autosubmit archive -h + :caption: ``autosubmit archive`` options -.. 
hint:: Archived experiment will be stored as a tar.gz file on a folder named after the year of the last - COMPLETED file date. If not COMPLETED file is present, it will be stored in the folder matching the - date at the time the archive command was run. +The archived experiment will be stored as a ``tar.gz`` file, under +a directory named after the year of the last ``_COMPLETED`` file +date or, if no ``_COMPLETED`` job is present, it will use the year of +the date the ``autosubmit archive`` command was run (e.g. for the selected +year ``2023``, the location will be ``$HOME/autosubmit/2023/<EXPID>.tar.gz``). How to unarchive an experiment ------------------------------ To unarchive an experiment, use the command: -:: - autosubmit unarchive EXPID +.. code-block:: -*EXPID* is the experiment identifier. + autosubmit unarchive <EXPID> Options: -:: - - usage: autosubmit unarchive [-h] expid - - expid experiment identifier - - -h, --help show this help message and exit - - -Example: -:: - autosubmit unarchive cxxx +.. runcmd:: autosubmit unarchive -h + :caption: ``autosubmit unarchive`` options How to delete the experiment ---------------------------- diff --git a/docs/source/userguide/provenance.rst b/docs/source/userguide/provenance.rst new file mode 100644 index 000000000..073fc6a70 --- /dev/null +++ b/docs/source/userguide/provenance.rst @@ -0,0 +1,66 @@ +########## +Provenance +########## + +Autosubmit manages experiments following the `FAIR data`_ principles, +findability, accessibility, interoperability, and reusability. It +supports and uses open standards such as YAML, RO-Crate, as well as +other standards such as ISO-8601. + +Each Autosubmit experiment is assigned a :doc:`unique experiment ID </userguide/expids>` +(also called expid). It also provides a central database and utilities +that permit experiments to be referenced. + +Every Autosubmit command issued by a user generates a timestamped log +file in ``/tmp/ASLOGS/``. 
For example, when the user runs +``autosubmit create <EXPID>`` and ``autosubmit run <EXPID>``, these +commands should create files like ``/tmp/ASLOGS/20230808_092350_create.log`` +and ``/tmp/ASLOGS/20230808_092400_run.log``, with the same content +that was displayed in the console output to the user running it. + +Users can :ref:`archive Autosubmit experiments `. These archives contain the complete +logs and other files in the experiment directory, and can be later unarchived +and executed again. Supported archival formats are ZIP and **RO-Crate**. + +RO-Crate +-------- + +RO-Crate is a community standard adopted by other workflow managers +to package research data with their metadata. It is extensible, and contains +profiles to package computational workflows. From the `RO-Crate`_ website, +“What is RO-Crate?”: + +.. pull-quote:: + RO-Crate is a community effort to establish a lightweight approach to + packaging research data with their metadata. It is based on schema.org + annotations in JSON-LD, and aims to make best-practice in formal + metadata description accessible and practical for use in a wider variety + of situations, from an individual researcher working with a folder of + data, to large data-intensive computational research environments. + +Autosubmit `conforms`_ to the following RO-Crate profiles: + +* Process Run Crate + +* Workflow Run Crate + +* Workflow RO-Crate + +Experiments archived as RO-Crate can also be uploaded to `Zenodo`_ and +to `WorkflowHub`_. The Autosubmit team worked with the WorkflowHub team +to add Autosubmit as a supported language for workflows. Both Zenodo +and WorkflowHub are issuers of `DOI`_'s (digital object identifiers), +which can be used as persistent identifiers to resolve Autosubmit +experiments referenced in papers and other documents. + +.. _FAIR data: https://en.wikipedia.org/wiki/FAIR_data + +.. _RO-Crate: https://www.researchobject.org/ro-crate/ + +.. _conforms: https://github.com/ResearchObject/workflow-run-crate/pull/61 + +.. 
_Zenodo: https://zenodo.org/ + +.. _WorkflowHub: https://workflowhub.eu/ + +.. _DOI: https://en.wikipedia.org/wiki/Digital_object_identifier diff --git a/requeriments.txt b/requeriments.txt index 937b6a284..d357f39dd 100644 --- a/requeriments.txt +++ b/requeriments.txt @@ -30,3 +30,4 @@ packaging==20 typing>=3.7 wheel psutil +rocrate==0.* diff --git a/setup.py b/setup.py index 16fdb0b4f..7ad4b3409 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ setup( url='http://www.bsc.es/projects/earthscience/autosubmit/', download_url='https://earth.bsc.es/wiki/doku.php?id=tools:autosubmit', keywords=['climate', 'weather', 'workflow', 'HPC'], - install_requires=['ruamel.yaml==0.17.21','cython','autosubmitconfigparser','bcrypt>=3.2','packaging>19','six>=1.10.0','configobj>=5.0.6','argparse>=1.4.0','python-dateutil>=2.8.2','matplotlib<3.6','py3dotplus>=1.1.0','pyparsing>=3.0.7','paramiko>=2.9.2','mock>=4.0.3','portalocker>=2.3.2,<=2.7.0','networkx==2.6.3','requests>=2.27.1','bscearth.utils>=0.5.2','cryptography>=36.0.1','setuptools>=60.8.2','xlib>=0.21','pip>=22.0.3','pythondialog','pytest','nose','coverage','PyNaCl>=1.5.0','Pygments','psutil'], + install_requires=['ruamel.yaml==0.17.21','cython','autosubmitconfigparser','bcrypt>=3.2','packaging>19','six>=1.10.0','configobj>=5.0.6','argparse>=1.4.0','python-dateutil>=2.8.2','matplotlib<3.6','py3dotplus>=1.1.0','pyparsing>=3.0.7','paramiko>=2.9.2','mock>=4.0.3','portalocker>=2.3.2,<=2.7.0','networkx==2.6.3','requests>=2.27.1','bscearth.utils>=0.5.2','cryptography>=36.0.1','setuptools>=60.8.2','xlib>=0.21','pip>=22.0.3','pythondialog','pytest','nose','coverage','PyNaCl>=1.5.0','Pygments','psutil','rocrate==0.*'], classifiers=[ "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.9", diff --git a/test/unit/helpers/__init__.py b/test/unit/helpers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/unit/provenance/__init__.py b/test/unit/provenance/__init__.py new file mode 
100644 index 000000000..e69de29bb diff --git a/test/unit/provenance/test_rocrate.py b/test/unit/provenance/test_rocrate.py new file mode 100644 index 000000000..da12e8e4f --- /dev/null +++ b/test/unit/provenance/test_rocrate.py @@ -0,0 +1,758 @@ +import datetime +import json +import tempfile +from pathlib import Path +from subprocess import CalledProcessError +from tempfile import TemporaryDirectory +from unittest import TestCase + +import yaml +from mock import Mock, patch +from rocrate.rocrate import File +from rocrate.rocrate import ROCrate + +from autosubmit.autosubmit import Autosubmit +from autosubmit.job.job import Job +from autosubmit.job.job_common import Status +from autosubmit.provenance.rocrate import ( + _add_dir_and_files, + _get_action_status, + _create_formal_parameter, + _create_parameter, + _get_project_entity, + _get_git_branch_and_commit, + create_rocrate_archive +) +from autosubmitconfigparser.config.configcommon import AutosubmitConfig +from log.log import AutosubmitCritical + + +class TestRoCrate(TestCase): + + def setUp(self): + self.empty_rocrate = ROCrate() + self.as_conf = Mock(spec=AutosubmitConfig) + self.expid = 'zzzz' + self.project_path = str(Path(__file__).parent.joinpath('../../../')) + self.project_url = 'https://earth.bsc.es/gitlab/es/autosubmit.git' + self.as_conf.get_project_dir = Mock(return_value=self.project_path) + + def tearDown(self) -> None: + self.empty_rocrate = None + + def _create_conf_dir(self, parent, as_conf=None): + if not as_conf: + as_conf = self.as_conf + conf_dir = Path(parent, 'conf') + conf_dir.mkdir(exist_ok=True) + Path(conf_dir, 'metadata').mkdir() + unified_config = Path(conf_dir, 'metadata/experiment_data.yml') + unified_config.touch() + unified_config.write_text(yaml.dump(as_conf.experiment_data)) + as_conf.current_loaded_files = {unified_config: 0} + + def test_add_dir_and_files_empty_folder(self): + with TemporaryDirectory() as d: + _add_dir_and_files( + crate=self.empty_rocrate, + base_path=d, + 
relative_path=d, + encoding_format=None + ) + self.assertEquals(1, len(self.empty_rocrate.data_entities)) + + def test_add_dir_and_files(self): + with TemporaryDirectory() as d: + sub_path = Path(d, 'files') + sub_path.mkdir(parents=True) + with open(sub_path / 'file.txt', 'w+') as f: + f.write('hello') + f.flush() + + _add_dir_and_files( + crate=self.empty_rocrate, + base_path=d, + relative_path=str(sub_path), + encoding_format=None + ) + self.assertEquals(2, len(self.empty_rocrate.data_entities)) + for entity in self.empty_rocrate.data_entities: + if entity.source.name == 'file.txt': + properties = entity.properties() + self.assertTrue(properties['sdDatePublished']) + self.assertTrue(properties['dateModified']) + self.assertEquals(properties['encodingFormat'], 'text/plain') + break + else: + self.fail('Failed to locate the entity for files/file.txt') + + def test_add_dir_and_files_set_encoding(self): + encoding = 'image/jpeg' + with TemporaryDirectory() as d: + with TemporaryDirectory() as d: + sub_path = Path(d, 'files') + sub_path.mkdir(parents=True) + with open(sub_path / 'file.txt', 'w+') as f: + f.write('hello') + f.flush() + + _add_dir_and_files( + crate=self.empty_rocrate, + base_path=d, + relative_path=str(sub_path), + encoding_format=encoding + ) + self.assertEquals(2, len(self.empty_rocrate.data_entities)) + for entity in self.empty_rocrate.data_entities: + if entity.source.name == 'file.txt': + properties = entity.properties() + self.assertTrue(properties['sdDatePublished']) + self.assertTrue(properties['dateModified']) + self.assertEquals(properties['encodingFormat'], encoding) + break + else: + self.fail('Failed to locate the entity for files/file.txt') + + def test_get_action_status(self): + for tests in [ + ([], 'PotentialActionStatus'), + ([Job('a', 'a', Status.FAILED, 1), Job('b', 'b', Status.COMPLETED, 1)], 'FailedActionStatus'), + ([Job('a', 'a', Status.COMPLETED, 1), Job('b', 'b', Status.COMPLETED, 1)], 'CompletedActionStatus'), + ([Job('a', 
'a', Status.DELAYED, 1)], 'PotentialActionStatus') + ]: + jobs = tests[0] + expected = tests[1] + result = _get_action_status(jobs) + self.assertEquals(expected, result) + + def test_create_formal_parameter(self): + formal_parameter = _create_formal_parameter(self.empty_rocrate, 'Name') + properties = formal_parameter.properties() + self.assertEquals('#Name-param', properties['@id']) + self.assertEquals('FormalParameter', properties['@type']) + self.assertEquals('Name', properties['name']) + + def test_create_parameter(self): + formal_parameter = _create_formal_parameter(self.empty_rocrate, 'Answer') + parameter = _create_parameter( + self.empty_rocrate, + 'Answer', + 42, + formal_parameter, + 'PropertyValue', + extra='test' + ) + properties = parameter.properties() + self.assertEquals(42, properties['value']) + self.assertEquals('test', properties['extra']) + + def test_get_local_project_entity(self): + project_path = '/tmp/project' + project_url = f'file://{project_path}' + self.as_conf.experiment_data = { + 'PROJECT': { + 'PROJECT_TYPE': 'LOCAL' + }, + 'LOCAL': { + 'PROJECT_PATH': project_path + } + } + project_entity = _get_project_entity( + self.as_conf, + self.empty_rocrate + ) + + self.assertEquals(project_entity['@id'], project_url) + self.assertEquals(project_entity['targetProduct'], 'Autosubmit') + self.assertEquals(project_entity['codeRepository'], project_url) + self.assertEquals(project_entity['version'], '') + + def test_get_dummy_project_entity(self): + project_url = '' + self.as_conf.experiment_data = { + 'PROJECT': { + 'PROJECT_TYPE': 'NONE' + } + } + project_entity = _get_project_entity( + self.as_conf, + self.empty_rocrate + ) + + self.assertEquals(project_entity['@id'], project_url) + self.assertEquals(project_entity['targetProduct'], 'Autosubmit') + self.assertEquals(project_entity['codeRepository'], project_url) + self.assertEquals(project_entity['version'], '') + + def test_get_subversion_or_other_project_entity(self): + for key in ['SVN', 
'SUBVERSION', 'MERCURY', '', ' ']: + self.as_conf.experiment_data = { + 'PROJECT': { + 'PROJECT_TYPE': key + }, + key: { + 'PROJECT_PATH': '' + } + } + with self.assertRaises(AutosubmitCritical): + _get_project_entity( + self.as_conf, + self.empty_rocrate + ) + + def test_get_git_project_entity(self): + self.as_conf.experiment_data = { + 'PROJECT': { + 'PROJECT_TYPE': 'GIT' + }, + 'GIT': { + 'PROJECT_PATH': self.project_path, + 'PROJECT_ORIGIN': self.project_url + } + } + project_entity = _get_project_entity( + self.as_conf, + self.empty_rocrate + ) + self.assertEquals(project_entity['@id'], self.project_url) + self.assertEquals(project_entity['targetProduct'], 'Autosubmit') + self.assertEquals(project_entity['codeRepository'], self.project_url) + self.assertTrue(len(project_entity['version']) > 0) + + @patch('subprocess.check_output') + def test_get_git_branch_and_commit(self, mocked_check_output: Mock): + error = CalledProcessError(1, '') + mocked_check_output.side_effect = [error] + with self.assertRaises(AutosubmitCritical) as cm: + _get_git_branch_and_commit(project_path='') + + self.assertEquals(cm.exception.message, 'Failed to retrieve project branch...') + + mocked_check_output.reset_mock() + mocked_check_output.side_effect = ['master', error] + with self.assertRaises(AutosubmitCritical) as cm: + _get_git_branch_and_commit(project_path='') + + self.assertEquals(cm.exception.message, 'Failed to retrieve project commit SHA...') + + @patch('autosubmit.provenance.rocrate.BasicConfig') + @patch('autosubmit.provenance.rocrate.get_experiment_descrip') + @patch('autosubmit.provenance.rocrate.get_autosubmit_version') + def test_rocrate( + self, + mocked_get_autosubmit_version: Mock, + mocked_get_experiment_descrip: Mock, + mocked_BasicConfig: Mock): + with tempfile.TemporaryDirectory() as temp_dir: + mocked_BasicConfig.LOCAL_ROOT_DIR = temp_dir + mocked_BasicConfig.LOCAL_TMP_DIR.mkdir() + experiment_path = Path(mocked_BasicConfig.LOCAL_ROOT_DIR, self.expid) + 
experiment_path.mkdir() + mocked_BasicConfig.LOCAL_TMP_DIR = Path(experiment_path, 'tmp') + mocked_BasicConfig.LOCAL_TMP_DIR.mkdir() + project_path = Path(experiment_path, 'proj') + project_path.mkdir() + # some outputs + for output_file in ['graph_1.png', 'graph_2.gif', 'graph_3.gif', 'graph.jpg']: + Path(project_path, output_file).touch() + # required paths for AS + for other_required_path in ['conf', 'pkl', 'plot', 'status']: + Path(experiment_path, other_required_path).mkdir() + self.as_conf.experiment_data = { + 'DEFAULT': { + 'EXPID': self.expid + }, + 'EXPERIMENT': {}, + 'CONFIG': { + 'PRE': [ + '%PROJ%/conf/bootstrap/include.yml' + ] + }, + 'ROOTDIR': str(experiment_path), + 'PROJECT': { + 'PROJECT_DESTINATION': '', + 'PROJECT_TYPE': 'LOCAL' + }, + 'LOCAL': { + 'PROJECT_PATH': str(project_path) + }, + 'APP': { + 'INPUT_1': 1, + 'INPUT_2': 2 + } + } + rocrate_json = { + 'INPUTS': ['APP'], + 'OUTPUTS': [ + 'graph_*.gif' + ], + 'PATCH': json.dumps({ + '@graph': [ + { + '@id': './', + "license": "Apache-2.0" + } + ] + }) + } + self._create_conf_dir(experiment_path) + jobs = [] + start_time = '' + end_time = '' + + mocked_get_autosubmit_version.return_value = '4.0.0b0' + mocked_get_experiment_descrip.return_value = [ + ['mocked test project'] + ] + + crate = create_rocrate_archive( + as_conf=self.as_conf, + rocrate_json=rocrate_json, + jobs=jobs, + start_time=start_time, + end_time=end_time, + path=Path(temp_dir) + ) + self.assertIsNotNone(crate) + + @patch('autosubmit.provenance.rocrate._get_project_entity') + @patch('autosubmit.provenance.rocrate.BasicConfig') + @patch('autosubmit.provenance.rocrate.get_experiment_descrip') + @patch('autosubmit.provenance.rocrate.get_autosubmit_version') + def test_rocrate_invalid_project( + self, + mocked_get_autosubmit_version: Mock, + mocked_get_experiment_descrip: Mock, + mocked_BasicConfig: Mock, + mocked_get_project_entity: Mock): + mocked_get_project_entity.side_effect = ValueError + with tempfile.TemporaryDirectory() 
as temp_dir: + mocked_BasicConfig.LOCAL_ROOT_DIR = temp_dir + mocked_BasicConfig.LOCAL_TMP_DIR.mkdir() + experiment_path = Path(mocked_BasicConfig.LOCAL_ROOT_DIR, self.expid) + experiment_path.mkdir() + mocked_BasicConfig.LOCAL_TMP_DIR = Path(experiment_path, 'tmp') + mocked_BasicConfig.LOCAL_TMP_DIR.mkdir() + project_path = Path(experiment_path, 'proj') + project_path.mkdir() + # some outputs + for output_file in ['graph_1.png', 'graph_2.gif', 'graph_3.gif', 'graph.jpg']: + Path(project_path, output_file).touch() + # required paths for AS + for other_required_path in ['conf', 'pkl', 'plot', 'status']: + Path(experiment_path, other_required_path).mkdir() + self.as_conf.experiment_data = { + 'DEFAULT': { + 'EXPID': self.expid + }, + 'EXPERIMENT': {}, + 'CONFIG': {}, + 'ROOTDIR': str(experiment_path), + 'PROJECT': { + 'PROJECT_DESTINATION': '', + 'PROJECT_TYPE': 'GIT' + }, + 'GIT': { + 'PROJECT_PATH': str(project_path), + 'PROJECT_ORIGIN': self.project_url + } + } + rocrate_json = {} + self._create_conf_dir(experiment_path) + jobs = [] + + mocked_get_autosubmit_version.return_value = '4.0.0b0' + mocked_get_experiment_descrip.return_value = [ + ['mocked test project'] + ] + + with self.assertRaises(AutosubmitCritical) as cm: + create_rocrate_archive( + as_conf=self.as_conf, + rocrate_json=rocrate_json, + jobs=jobs, + start_time=None, + end_time=None, + path=Path(temp_dir) + ) + + self.assertEquals(cm.exception.message, 'Failed to read the Autosubmit Project for RO-Crate...') + + @patch('autosubmit.provenance.rocrate.BasicConfig') + @patch('autosubmit.provenance.rocrate.get_experiment_descrip') + @patch('autosubmit.provenance.rocrate.get_autosubmit_version') + def test_rocrate_invalid_parameter_type( + self, + mocked_get_autosubmit_version: Mock, + mocked_get_experiment_descrip: Mock, + mocked_BasicConfig: Mock): + """NOTE: This is not possible at the moment, as we are using ruamel.yaml + to parse the YAML, and we are not supporting objects. 
But you never know + what the code will do in the future, so we just make sure we fail nicely.""" + with tempfile.TemporaryDirectory() as temp_dir: + mocked_BasicConfig.LOCAL_ROOT_DIR = temp_dir + mocked_BasicConfig.LOCAL_TMP_DIR.mkdir() + experiment_path = Path(mocked_BasicConfig.LOCAL_ROOT_DIR, self.expid) + experiment_path.mkdir() + mocked_BasicConfig.LOCAL_TMP_DIR = Path(experiment_path, 'tmp') + mocked_BasicConfig.LOCAL_TMP_DIR.mkdir() + project_path = Path(experiment_path, 'proj') + project_path.mkdir() + # some outputs + for output_file in ['graph_1.png', 'graph_2.gif', 'graph_3.gif', 'graph.jpg']: + Path(project_path, output_file).touch() + # required paths for AS + for other_required_path in ['conf', 'pkl', 'plot', 'status']: + Path(experiment_path, other_required_path).mkdir() + self.as_conf.experiment_data = { + 'DEFAULT': { + 'EXPID': self.expid + }, + 'EXPERIMENT': {}, + 'CONFIG': {}, + 'ROOTDIR': str(experiment_path), + 'PROJECT': { + 'PROJECT_DESTINATION': '', + 'PROJECT_TYPE': 'GIT' + }, + 'GIT': { + 'PROJECT_PATH': str(project_path), + 'PROJECT_ORIGIN': self.project_url + }, + 'APP': { + 'OBJ': object() + } + } + rocrate_json = { + 'INPUTS': [ + 'APP' + ] + } + self._create_conf_dir(experiment_path) + jobs = [] + + mocked_get_autosubmit_version.return_value = '4.0.0b0' + mocked_get_experiment_descrip.return_value = [ + ['mocked test project'] + ] + + with self.assertRaises(AutosubmitCritical) as cm: + create_rocrate_archive( + as_conf=self.as_conf, + rocrate_json=rocrate_json, + jobs=jobs, + start_time=None, + end_time=None, + path=Path(temp_dir) + ) + + self.assertEquals(cm.exception.message, + 'Could not locate a type in RO-Crate for parameter APP.OBJ type object') + + @patch('autosubmit.autosubmit.Log') + @patch('autosubmit.autosubmit.AutosubmitConfig') + def test_rocrate_main_fail_missing_rocrate( + self, + mocked_AutosubmitConfig: Mock, + mocked_Log: Mock): + mocked_as_conf = Mock(autospec=AutosubmitConfig) + mocked_as_conf.experiment_data = 
{} + mocked_AutosubmitConfig.return_value = mocked_as_conf + + mocked_Log.error = Mock() + mocked_Log.error.return_value = '' + + autosubmit = Autosubmit() + with self.assertRaises(AutosubmitCritical) as cm, tempfile.TemporaryDirectory() as temp_dir: + autosubmit.rocrate(self.expid, path=Path(temp_dir)) + + self.assertEqual(cm.exception.message, 'You must provide an ROCRATE configuration key when using RO-Crate...') + self.assertEqual(mocked_Log.error.call_count, 1) + + @patch('autosubmit.autosubmit.JobList') + @patch('autosubmit.autosubmit.AutosubmitConfig') + @patch('autosubmit.provenance.rocrate.BasicConfig') + @patch('autosubmit.provenance.rocrate.get_experiment_descrip') + @patch('autosubmit.provenance.rocrate.get_autosubmit_version') + def test_rocrate_main( + self, + mocked_get_autosubmit_version: Mock, + mocked_get_experiment_descrip: Mock, + mocked_BasicConfig: Mock, + mocked_AutosubmitConfig: Mock, + mocked_JobList: Mock): + with tempfile.TemporaryDirectory() as temp_dir: + mocked_BasicConfig.LOCAL_ROOT_DIR = temp_dir + mocked_BasicConfig.LOCAL_TMP_DIR.mkdir() + experiment_path = Path(mocked_BasicConfig.LOCAL_ROOT_DIR, self.expid) + experiment_path.mkdir() + mocked_BasicConfig.LOCAL_TMP_DIR = Path(experiment_path, 'tmp') + mocked_BasicConfig.LOCAL_TMP_DIR.mkdir() + project_path = Path(experiment_path, 'proj') + project_path.mkdir() + # some outputs + for output_file in ['graph_1.png', 'graph_2.gif', 'graph_3.gif', 'graph.jpg']: + Path(project_path, output_file).touch() + # required paths for AS + for other_required_path in ['conf', 'pkl', 'plot', 'status']: + Path(experiment_path, other_required_path).mkdir() + mocked_as_conf = Mock(autospec=AutosubmitConfig) + mocked_AutosubmitConfig.return_value = mocked_as_conf + mocked_as_conf.experiment_data = { + 'DEFAULT': { + 'EXPID': self.expid + }, + 'EXPERIMENT': {}, + 'CONFIG': {}, + 'ROOTDIR': str(experiment_path), + 'PROJECT': { + 'PROJECT_DESTINATION': '', + 'PROJECT_TYPE': 'LOCAL' + }, + 
'LOCAL': { + 'PROJECT_PATH': str(project_path) + }, + 'APP': { + 'INPUT_1': 1, + 'INPUT_2': 2 + }, + 'ROCRATE': { + 'INPUTS': ['APP'], + 'OUTPUTS': [ + 'graph_*.gif' + ], + 'PATCH': json.dumps({ + '@graph': [ + { + '@id': './', + "license": "Apache-2.0" + } + ] + }) + } + } + self._create_conf_dir(experiment_path, as_conf=mocked_as_conf) + mocked_as_conf.get_storage_type.return_value = 'pkl' + mocked_as_conf.get_date_list.return_value = [] + + mocked_get_autosubmit_version.return_value = '4.0.0b0' + mocked_get_experiment_descrip.return_value = [ + ['mocked test project'] + ] + + mocked_job_list = Mock() + mocked_JobList.return_value = mocked_job_list + + job1 = Mock(autospec=Job) + job1_submit_time = datetime.datetime.strptime("21/11/06 16:30", "%d/%m/%y %H:%M") + job1_start_time = datetime.datetime.strptime("21/11/06 16:40", "%d/%m/%y %H:%M") + job1_finished_time = datetime.datetime.strptime("21/11/06 16:50", "%d/%m/%y %H:%M") + job1.get_last_retrials.return_value = [ + [job1_submit_time, job1_start_time, job1_finished_time, 'COMPLETED']] + job1.name = 'job1' + job1.date = '2006' + job1.member = 'fc0' + job1.section = 'JOB' + job1.chunk = '1' + job1.processors = '1' + + job2 = Mock(autospec=Job) + job2_submit_time = datetime.datetime.strptime("21/11/06 16:40", "%d/%m/%y %H:%M") + job2_start_time = datetime.datetime.strptime("21/11/06 16:50", "%d/%m/%y %H:%M") + job2_finished_time = datetime.datetime.strptime("21/11/06 17:00", "%d/%m/%y %H:%M") + job2.get_last_retrials.return_value = [ + [job2_submit_time, job2_start_time, job2_finished_time, 'COMPLETED']] + job2.name = 'job2' + job2.date = '2006' + job2.member = 'fc1' + job2.section = 'JOB' + job2.chunk = '1' + job2.processors = '1' + + mocked_job_list.get_job_list.return_value = [job1, job2] + + autosubmit = Autosubmit() + r = autosubmit.rocrate(self.expid, path=Path(temp_dir)) + self.assertTrue(r) + + @patch('autosubmit.provenance.rocrate.BasicConfig') + 
@patch('autosubmit.provenance.rocrate.get_experiment_descrip') + @patch('autosubmit.provenance.rocrate.get_autosubmit_version') + def test_custom_config_loaded_file( + self, + mocked_get_autosubmit_version: Mock, + mocked_get_experiment_descrip: Mock, + mocked_BasicConfig: Mock): + with tempfile.TemporaryDirectory() as temp_dir: + mocked_BasicConfig.LOCAL_ROOT_DIR = temp_dir + mocked_BasicConfig.LOCAL_TMP_DIR.mkdir() + experiment_path = Path(mocked_BasicConfig.LOCAL_ROOT_DIR, self.expid) + experiment_path.mkdir() + mocked_BasicConfig.LOCAL_TMP_DIR = Path(experiment_path, 'tmp') + mocked_BasicConfig.LOCAL_TMP_DIR.mkdir() + project_path = Path(experiment_path, 'proj') + project_path.mkdir() + # required paths for AS + for other_required_path in ['conf', 'pkl', 'plot', 'status']: + Path(experiment_path, other_required_path).mkdir() + + # custom config file + project_conf = Path(project_path, 'conf') + project_conf.mkdir() + custom_config = Path(project_conf, 'include.yml') + custom_config.touch() + custom_config.write_text('CUSTOM_CONFIG_LOADED: True') + + self.as_conf.experiment_data = { + 'DEFAULT': { + 'EXPID': self.expid + }, + 'EXPERIMENT': {}, + 'CONFIG': { + 'PRE': [ + str(project_conf) + ] + }, + 'ROOTDIR': str(experiment_path), + 'PROJECT': { + 'PROJECT_DESTINATION': '', + 'PROJECT_TYPE': 'LOCAL' + }, + 'LOCAL': { + 'PROJECT_PATH': str(project_path) + }, + 'APP': { + 'INPUT_1': 1, + 'INPUT_2': 2 + } + } + rocrate_json = { + 'INPUTS': ['APP'], + 'OUTPUTS': [ + 'graph_*.gif' + ], + 'PATCH': json.dumps({ + '@graph': [ + { + '@id': './', + "license": "Apache-2.0" + } + ] + }) + } + self._create_conf_dir(experiment_path) + # adding both directory and file to the list of loaded files + self.as_conf.current_loaded_files[str(project_conf)] = 0 + self.as_conf.current_loaded_files[str(custom_config)] = 0 + jobs = [] + start_time = '' + end_time = '' + + mocked_get_autosubmit_version.return_value = '4.0.0b0' + mocked_get_experiment_descrip.return_value = [ + ['mocked 
test project'] + ] + + crate = create_rocrate_archive( + as_conf=self.as_conf, + rocrate_json=rocrate_json, + jobs=jobs, + start_time=start_time, + end_time=end_time, + path=Path(temp_dir) + ) + self.assertIsNotNone(crate) + data_entities_ids = [data_entity['@id'] for data_entity in crate.data_entities] + self.assertTrue(File(crate, f'file://{str(project_conf)}/').id in data_entities_ids) + self.assertTrue(File(crate, f'file://{str(custom_config)}').id in data_entities_ids) + + @patch('autosubmit.provenance.rocrate.BasicConfig') + @patch('autosubmit.provenance.rocrate.get_experiment_descrip') + @patch('autosubmit.provenance.rocrate.get_autosubmit_version') + def test_no_duplicate_ids( + self, + mocked_get_autosubmit_version: Mock, + mocked_get_experiment_descrip: Mock, + mocked_BasicConfig: Mock): + with tempfile.TemporaryDirectory() as temp_dir: + mocked_BasicConfig.LOCAL_ROOT_DIR = temp_dir + mocked_BasicConfig.LOCAL_TMP_DIR.mkdir() + experiment_path = Path(mocked_BasicConfig.LOCAL_ROOT_DIR, self.expid) + experiment_path.mkdir() + mocked_BasicConfig.LOCAL_TMP_DIR = Path(experiment_path, 'tmp') + mocked_BasicConfig.LOCAL_TMP_DIR.mkdir() + project_path = Path(experiment_path, 'proj') + project_path.mkdir() + # required paths for AS + for other_required_path in ['conf', 'pkl', 'plot', 'status']: + Path(experiment_path, other_required_path).mkdir() + + # custom config file + project_conf = Path(project_path, 'conf') + project_conf.mkdir() + custom_config = Path(project_conf, 'include.yml') + custom_config.touch() + custom_config.write_text('CUSTOM_CONFIG_LOADED: True') + + self.as_conf.experiment_data = { + 'DEFAULT': { + 'EXPID': self.expid + }, + 'EXPERIMENT': {}, + 'CONFIG': { + 'PRE': [ + str(project_conf) + ] + }, + 'ROOTDIR': str(experiment_path), + 'PROJECT': { + 'PROJECT_DESTINATION': '', + 'PROJECT_TYPE': 'LOCAL' + }, + 'LOCAL': { + 'PROJECT_PATH': str(project_path) + }, + 'APP': { + 'INPUT_1': 1, + 'INPUT_2': 2 + } + } + rocrate_json = { + 'INPUTS': 
['APP'], + 'OUTPUTS': [ + 'graph_*.gif' + ], + 'PATCH': json.dumps({ + '@graph': [ + { + '@id': './', + "license": "Apache-2.0" + } + ] + }) + } + self._create_conf_dir(experiment_path) + # adding both directory and file to the list of loaded files + self.as_conf.current_loaded_files[str(project_conf)] = 0 + self.as_conf.current_loaded_files[str(custom_config)] = 0 + jobs = [] + start_time = '' + end_time = '' + + mocked_get_autosubmit_version.return_value = '4.0.0b0' + mocked_get_experiment_descrip.return_value = [ + ['mocked test project'] + ] + + crate = create_rocrate_archive( + as_conf=self.as_conf, + rocrate_json=rocrate_json, + jobs=jobs, + start_time=start_time, + end_time=end_time, + path=Path(temp_dir) + ) + self.assertIsNotNone(crate) + data_entities_ids = [data_entity['@id'] for data_entity in crate.data_entities] + self.assertEqual(len(data_entities_ids), len(set(data_entities_ids)), f'Duplicate IDs found in the RO-Crate data entities: {str(data_entities_ids)}') + -- GitLab