diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 001a2fe9777de2f2f386f0564c7e8fa34f2221ca..6b59ffa95381c6a560164e6773f4ecb66f856e57 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -10,6 +10,8 @@ build: os: ubuntu-22.04 tools: python: "3.9" + apt_packages: + - graphviz jobs: post_install: # ruamel.yaml.clib appears to cause the build to fail when diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 78e8a7facf073f5c7deee20658f11fcf01928686..34f671680d7a6741da4fa76b875f4634ddf0222e 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -50,6 +50,7 @@ from .job.job_list_persistence import JobListPersistencePkl from .job.job_package_persistence import JobPackagePersistence from .job.job_packager import JobPackager from .job.job_utils import SubJob, SubJobManager +from .profiler.profiler import Profiler from .monitor.monitor import Monitor from .notifications.mail_notifier import MailNotifier from .notifications.notifier import Notifier @@ -190,6 +191,8 @@ class Autosubmit: help='Sets a experiment expid which completion will trigger the start of this experiment.') subparser.add_argument('-rom', '--run_only_members', required=False, help='Sets members allowed on this run.') + subparser.add_argument('-p', '--profile', action='store_true', default=False, required=False, + help='Prints performance parameters of the execution of this experiment.') # Expid @@ -647,7 +650,7 @@ class Autosubmit: Autosubmit._init_logs(args, args.logconsole, args.logfile, expid) if args.command == 'run': - return Autosubmit.run_experiment(args.expid, args.notransitive,args.start_time,args.start_after, args.run_only_members) + return Autosubmit.run_experiment(args.expid, args.notransitive,args.start_time,args.start_after, args.run_only_members, args.profile) elif args.command == 'expid': return Autosubmit.expid(args.description,args.HPC,args.copy, 
args.dummy,args.minimal_configuration,args.git_repo,args.git_branch,args.git_as_conf,args.operational,args.testcase,args.use_local_minimal) != '' elif args.command == 'delete': @@ -1999,8 +2002,9 @@ class Autosubmit: Log.debug("Sleep: {0}", safetysleeptime) Log.debug("Number of retrials: {0}", default_retrials) return total_jobs, safetysleeptime, default_retrials, check_wrapper_jobs_sleeptime + @staticmethod - def run_experiment(expid, notransitive=False, start_time=None, start_after=None,run_only_members=None): + def run_experiment(expid, notransitive=False, start_time=None, start_after=None, run_only_members=None, profile=False): """ Runs and experiment (submitting all the jobs properly and repeating its execution in case of failure). :param expid: the experiment id @@ -2008,6 +2012,7 @@ class Autosubmit: :param start_time: the time at which the experiment should start :param start_after: the expid after which the experiment should start :param run_only_members: the members to run + :param profile: if True, the whole experiment will be profiled :return: None """ @@ -2018,6 +2023,11 @@ class Autosubmit: except BaseException as e: raise AutosubmitCritical("Failure during the loading of the experiment configuration, check file paths", 7014, str(e)) + + # If the profile flag has been used, activate the profiler and start taking measures + if profile: + profiler = Profiler(expid) + profiler.start() # checking if there is a lock file to avoid multiple running on the same expid try: @@ -2250,6 +2260,9 @@ class Autosubmit: raise except BaseException as e: raise AutosubmitCritical("This seems like a bug in the code, please contact AS developers", 7070, str(e)) + finally: + if profile: + profiler.stop() @staticmethod def restore_platforms(platform_to_test, mail_notify=False, as_conf=None, expid=None): diff --git a/autosubmit/profiler/__init__.py b/autosubmit/profiler/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/autosubmit/profiler/profiler.py b/autosubmit/profiler/profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..789402b4d9130faf67addbd3a179f2078e9a6788 --- /dev/null +++ b/autosubmit/profiler/profiler.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 + +# Copyright 2015-2023 Earth Sciences Department, BSC-CNS + +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . 
+ +import cProfile +import io +import os +import pstats +from datetime import datetime +from pathlib import Path +from pstats import SortKey + +import psutil +from autosubmitconfigparser.config.basicconfig import BasicConfig + +from log.log import Log, AutosubmitCritical + + +_UNITS = ["B", "KiB", "MiB", "GiB", "TiB", "PiB"] + +class Profiler: + """Class to profile the execution of experiments.""" + + def __init__(self, expid:str): + self._profiler = cProfile.Profile() + self._expid = expid + + # Memory profiling variables + self._mem_init = 0 + self._mem_final = 0 + + # Error handling flags + self._started = False + + def start(self) -> None: + """Function to start the profiling process.""" + if self._started: + raise AutosubmitCritical('The profiling process was already started.', 7074) + + self._profiler.enable() + self._mem_init += _get_current_memory() + self._started = True + + def stop(self) -> None: + """Function to finish the profiling process.""" + if not self._started: + raise AutosubmitCritical('Cannot stop the profiler because was not running.', 7074) + + self._profiler.disable() + self._mem_final += _get_current_memory() + self._report() + + def _report(self) -> None: + """Function to print the final report into the stdout, log and filesystem.""" + + # Create the profiler path if it does not exist + report_path = Path(BasicConfig.LOCAL_ROOT_DIR, self._expid, "tmp", "profile") + report_path.mkdir(parents=True, exist_ok=True) + report_path.chmod(0o755) + if not os.access(report_path, os.W_OK): # Check for write access + raise AutosubmitCritical( + f'Directory {report_path} not writable. 
Please check permissions.', 7012) + + + stream = io.StringIO() + date_time = datetime.now().strftime('%Y%m%d-%H%M%S') + + # Generate function-by-function profiling results + sort_by = SortKey.CUMULATIVE + stats = pstats.Stats(self._profiler, stream=stream) # generate statistics + stats.strip_dirs().sort_stats(sort_by).print_stats() # format and save in the stream + + # Generate memory profiling results + mem_total : float = self._mem_final - self._mem_init # memory in Bytes + unit = 0 + # reduces the value to its most suitable unit + while mem_total >= 1024 and unit <= len(_UNITS) - 1: + unit += 1 + mem_total /= 1024 + + # Create and save report + report = "\n".join([ + _generate_title("Time & Calls Profiling"), + "", + stream.getvalue(), + _generate_title("Memory Profiling"), + f"MEMORY CONSUMPTION: {mem_total} {_UNITS[unit]}.", + "" + ]).replace('{', '{{').replace('}', '}}') # escape {} so Log can call str.format + + Log.info(report) + + stats.dump_stats(Path(report_path, f"{self._expid}_profile_{date_time}.prof")) + with open(Path(report_path, f"{self._expid}_profile_{date_time}.txt"), + 'w', encoding='UTF-8') as report_file: + report_file.write(report) + + Log.info(f"[INFO] You can also find report and prof files at {report_path}\n") + +def _generate_title(title="") -> str: + """ + Generates a title banner with the specified text. + + :param title: The title that will be shown in the banner. + :type title: str + :return: The banner with the specified title. + :rtype: str + """ + max_len = 80 + separator = "=" * max_len + message = title.center(max_len) + return "\n".join([separator, message, separator]) + + +def _get_current_memory() -> int: + """ + Return the current memory consumption of the process in Bytes. + + :return: The current memory used by the process (Bytes). 
+ :rtype: int     + """ + return psutil.Process(os.getpid()).memory_info().rss diff --git a/docs/source/conf.py b/docs/source/conf.py index 5c7b344ad5c3bf66f4f409c44560bed1c17f039b..86c56818432b57e790a2d2b85eb9fb3cb1fb7c34 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -36,9 +36,13 @@ extensions = [ 'sphinx.ext.autosectionlabel', 'sphinx_rtd_theme', 'sphinx_reredirects', - 'autosubmit_variables' + 'autosubmit_variables', + 'sphinx.ext.graphviz' ] +# Set .svg output for the graphs generated by GraphViz +graphviz_output_format='svg' + autosectionlabel_prefix_document = True numfig = True diff --git a/docs/source/troubleshooting/error-codes.rst b/docs/source/troubleshooting/error-codes.rst index fbba9514e465d43514c30b08f7f022cacff60dad..9ed70aa32b7beaee5ce468fcaced9e67090e2051 100644 --- a/docs/source/troubleshooting/error-codes.rst +++ b/docs/source/troubleshooting/error-codes.rst @@ -123,6 +123,8 @@ Uncatalogued codes - Critical Error codes [7060+] | 7073 | Private key is encrypted | Each Session: Add your key into the ssh agent. ex. ssh-add $HOME/.ssh/id_rsa, then launch autosubmit. | | | | Alternative: You can use a non-encrypted key, just make sure that nobody except you has access to the file. | +------+-----------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 7074 | Profiling process failed | You can find descriptive explanations in the log, as well as hints to solve the problem. 
| ++------+-----------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ **Default Solution** diff --git a/docs/source/userguide/run/index.rst b/docs/source/userguide/run/index.rst index f38ede902bf9f3d72e008b16db01243e5af85090..767d3cce9af03e46c6a5303d97ae247e02a8f6b1 100644 --- a/docs/source/userguide/run/index.rst +++ b/docs/source/userguide/run/index.rst @@ -187,6 +187,42 @@ This functionality can be used together with other options supplied by the `run` The `-sa` command has a long version `--start_after`. +.. _basic_profiling: + + +How to profile Autosubmit while running an experiment +----------------------------------------------------- + +Autosubmit offers the possibility to profile an experiment execution. To enable the profiler, just +add the ``--profile`` (or ``-p``) flag to your ``autosubmit run`` command, as in the following example: + +.. code-block:: bash + + autosubmit run --profile EXPID + +.. note:: Remember that the purpose of this profiler is to measure the performance of Autosubmit, + not the jobs it runs. + +This profiler uses Python's ``cProfile`` and ``psutil`` modules to generate a report with simple CPU and +memory metrics for the experiment execution. These metrics will be displayed in your console after +the experiment finishes, as in the example below: + +.. figure:: profiler_output.png + :name: profiler_head_output + :align: center + :alt: Screenshot of the header of the profiler's output + +The profiler output is also saved in ``/tmp/profile``. There you will find two files, the +report in plain text format and a ``.prof`` binary which contains the CPU metrics. We highly recommend +using `SnakeViz `_ to visualize this file, as follows: + +.. 
figure:: profiler_snakeviz.png + :name: profiler_snakeviz + :align: center + :alt: The .prof file represented by the graphical library SnakeViz + +For more detailed documentation about the profiler, please visit this :ref:`page`. + .. _run_modes: How to prepare an experiment to run in two independent job_list. (Priority jobs, Two-step-run) (OLD METHOD) diff --git a/docs/source/userguide/run/profiler.rst b/docs/source/userguide/run/profiler.rst new file mode 100644 index 0000000000000000000000000000000000000000..0a20cbd831f6700e8054fe3eb1716f29a6a3ca97 --- /dev/null +++ b/docs/source/userguide/run/profiler.rst @@ -0,0 +1,113 @@ +:orphan: + +.. + The :orphan: section tells Sphinx not to include this page in any contents list + +.. _advanced_profiling: + +autosubmit.profiler +=================== + +.. important:: If you are interested in measuring only the performance of the execution of an + experiment in Autosubmit, a simpler user-oriented guide is available at this + :ref:`link`. + +###################################### +The autosubmit's profiler +###################################### + +Autosubmit integrates a profiler that allows developers to easily measure the performance of entire +functions or specific code fragments. + +The profiler generates a comprehensive report with enough information to detect bottleneck functions +during the execution of experiments, as well as information about the total memory consumed. + +It mainly uses the ``cProfile`` library to make the report, so this module inherits its deterministic +profiling and reasonable overhead features. However, it also limits profiling to a single thread, so +please, do not use it on concurrent code. For memory profiling, it uses ``psutil``. + +.. caution:: + This profiler was originally designed to be used in the ``autosubmit run`` command, so using + it in other functions may produce unexpected results or errors. 
+ + The profiler instantiation requires an ``EXPID``, and not all the functions in Autosubmit use it. + This can be bypassed using another string, but keep in mind that there is no error handling in + this case. + +How to profile a function or a specific code fragment? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Depending on the Autosubmit function you want to profile, you must add a ``--profile`` argument to the +parser. The ``autosubmit run`` subcommand supports it. It is recommended that the default value of +this flag always be ``False``, to ensure that the profiler does not interfere with the normal execution +in an unwanted way. You will need to add something like this to your parser: + +.. code-block:: python + + subparser.add_argument('-p', '--profile', action='store_true', default=False, required=False, + help='Prints performance parameters of the execution of this experiment.') + +The function must receive the flag as argument to control the execution of the profiler. If the flag +has value ``True``, you should proceed as follows: + +1. Instantiate a **Profiler(EXPID)** object. Specifying the ``EXPID`` is mandatory. + +2. Run the profiler by calling the **start()** function of the instantiated object, at the beginning + of the function or code fragment you want to evaluate. The measurement will start instantly. + +3. Execute the **stop()** function of the profiler at the end of the function or code fragment to be + evaluated. The process of taking measurements will stop instantly. The report will be generated + automatically and the files will be stored in the ``/tmp/profile`` directory. + +.. important:: Make sure, if necessary, that the call to `stop()` is always made, even if the + Autosubmit code fails, in order to get the performance report. + + +The most relevant functions of the profiler, in detail +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
digraph:: foo + :name: status_diagram + :align: center + :alt: Status diagram of the profiler + + bgcolor="transparent" + graph [nodesep=0.5] + + // Extreme nodes (start/end) + node [label="",color="black",style=filled,fixedsize=true]; + + node [shape=circle,height=0.25,width=0.25] ENTRY; + node [shape=doublecircle,height=0.2,width=0.2] EXIT; + + // Status nodes + node [shape=rect,style=rounded,height=0.5,width=1.2,fixedsize=true,fontsize=12,fontname=arial]; + + node [label="__init__"] init; + node [label="start"] start; + node [label="stop"] stop; + node [label="report"] report; + + // Relations + ENTRY -> init; + init -> start [label="when\nstarted = False",fontsize=7,fontname=arial]; + start -> stop [label=" when\n started = True",fontsize=7,fontname=arial]; + stop -> report [label="automatically",fontsize=7,fontname=arial]; + report -> EXIT; + + { rank = same; ENTRY init start } + { rank = same; stop report EXIT } +* The **start()** function: Starts taking measures, both of execution times thanks to ``cProfile``, and + memory thanks to ``psutil``. It also manages errors to avoid illegal transitions between states. + +* The **stop()** function: Same as the previous function, but terminating the taking of measurements. + It will call the report function automatically. + +* The **_report()** function: It is private, and its purpose is to generate the final performance + report and store it properly. It will print the report to the console output and log it at the same time. + In addition, it will generate two files in the directory chosen when instantiating the Profiler + object, a ``.txt`` file with the same report shown on screen, and a ``.prof`` file with the report + generated by ``pstats``. The ``.prof`` file can be manipulated with the appropriate tools. Our + recommendation is to open it with `SnakeViz `_, a graphical + library that will interpret the data for you and display it in an interactive web interface. 
diff --git a/docs/source/userguide/run/profiler_output.png b/docs/source/userguide/run/profiler_output.png new file mode 100644 index 0000000000000000000000000000000000000000..2a4808ed8d170bb7717d125f69853a616475e32c Binary files /dev/null and b/docs/source/userguide/run/profiler_output.png differ diff --git a/docs/source/userguide/run/profiler_snakeviz.png b/docs/source/userguide/run/profiler_snakeviz.png new file mode 100644 index 0000000000000000000000000000000000000000..8eadb90e21aa7c9f72e6bfca1a1eb240c1955c24 Binary files /dev/null and b/docs/source/userguide/run/profiler_snakeviz.png differ diff --git a/environment.yml b/environment.yml index 60e4d95c864dcd0e2b7e2e51282cac33d4cea018..e0ce1aded1e3adf495e84055a7dc1cac4136e3fa 100644 --- a/environment.yml +++ b/environment.yml @@ -44,4 +44,5 @@ dependencies: - coverage - requests - configobj + - psutil diff --git a/requeriments.txt b/requeriments.txt index 64e023c1ac6ed6b1890519f8ef6bdc150bf87dd7..95d521a9b17e2aa1e4f2809d058c650559708e2f 100644 --- a/requeriments.txt +++ b/requeriments.txt @@ -30,3 +30,4 @@ Pygments packaging==20 typing>=3.7 wheel +psutil diff --git a/setup.py b/setup.py index 5761d7ac2b4b879e80a624e3c035bebc66c92020..f96a9a0c9f7159cd9285994bb3c6acbe0baec28b 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ setup( url='http://www.bsc.es/projects/earthscience/autosubmit/', download_url='https://earth.bsc.es/wiki/doku.php?id=tools:autosubmit', keywords=['climate', 'weather', 'workflow', 'HPC'], - install_requires=['ruamel.yaml==0.17.21','cython','autosubmitconfigparser','bcrypt>=3.2','packaging>19','six>=1.10.0','configobj>=5.0.6','argparse>=1.4.0','python-dateutil>=2.8.2','matplotlib<3.6','numpy<1.22','py3dotplus>=1.1.0','pyparsing>=3.0.7','paramiko>=2.9.2','mock>=4.0.3','portalocker>=2.3.2','networkx==2.6.3','requests>=2.27.1','bscearth.utils>=0.5.2','cryptography>=36.0.1','setuptools>=60.8.2','xlib>=0.21','pip>=22.0.3','pythondialog','pytest','nose','coverage','PyNaCl>=1.4.0','Pygments'], 
+ install_requires=['ruamel.yaml==0.17.21','cython','autosubmitconfigparser','bcrypt>=3.2','packaging>19','six>=1.10.0','configobj>=5.0.6','argparse>=1.4.0','python-dateutil>=2.8.2','matplotlib<3.6','numpy<1.22','py3dotplus>=1.1.0','pyparsing>=3.0.7','paramiko>=2.9.2','mock>=4.0.3','portalocker>=2.3.2','networkx==2.6.3','requests>=2.27.1','bscearth.utils>=0.5.2','cryptography>=36.0.1','setuptools>=60.8.2','xlib>=0.21','pip>=22.0.3','pythondialog','pytest','nose','coverage','PyNaCl>=1.4.0','Pygments','psutil'], classifiers=[ "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.9", diff --git a/test/unit/test_profiler.py b/test/unit/test_profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..4d474d572a23a029be09fe099565fd9edbf40f13 --- /dev/null +++ b/test/unit/test_profiler.py @@ -0,0 +1,50 @@ +from unittest import TestCase, mock +from autosubmit.profiler.profiler import Profiler +from log.log import AutosubmitCritical +from pathlib import Path + + +class TestProfiler(TestCase): + def setUp(self): + self.profiler = Profiler("a000") + + # Black box techniques for status machine based software + # + # O---->__init__------> start + # | + # | + # stop ----> report --->0 + + # Status coverage + def test_status_machine(self): + self.profiler.start() + self.profiler.stop() + + # Transition coverage + def test_transitions(self): + # __init__ -> start + self.profiler.start() + + # start -> stop + self.profiler.stop() + + def test_transitions_fail_cases(self): + # __init__ -> stop + self.assertRaises(AutosubmitCritical, self.profiler.stop) + + # start -> start + self.profiler.start() + self.assertRaises(AutosubmitCritical, self.profiler.start) + + # White box tests + @mock.patch("os.access") + def test_writing_permission_check_fails(self, mock_response): + mock_response.return_value = False + + self.profiler.start() + self.assertRaises(AutosubmitCritical, self.profiler.stop) + + def test_memory_profiling_loop(self): + 
self.profiler.start() + bytearray(1024*1024) + self.profiler.stop()