From 2a638c72fec4b096456757deea4f1e26422df5b7 Mon Sep 17 00:00:00 2001 From: Wilmer Uruchi Ticona Date: Tue, 5 Oct 2021 19:42:30 +0200 Subject: [PATCH 1/8] Refactor started. Implementing tests. Main logic is pending. --- autosubmit/autosubmit.py | 6 +- autosubmit/database/db_jobdata.py | 1 - autosubmit/history/__init__.py | 0 autosubmit/history/data_classes/__init__.py | 0 .../history/data_classes/experiment_run.py | 62 ++++ autosubmit/history/data_classes/job_data.py | 281 +++++++++++++++++ .../history/database_managers/__init__.py | 0 .../database_managers/database_manager.py | 153 ++++++++++ .../database_managers/database_models.py | 55 ++++ .../experiment_history_db_manager.py | 283 ++++++++++++++++++ .../experiment_status_db_manager.py | 137 +++++++++ autosubmit/history/database_managers/tests.py | 106 +++++++ autosubmit/history/experiment_history.py | 31 ++ autosubmit/history/experiment_status.py | 42 +++ autosubmit/history/tests.py | 37 +++ autosubmit/history/utils.py | 57 ++++ 16 files changed, 1247 insertions(+), 4 deletions(-) create mode 100644 autosubmit/history/__init__.py create mode 100644 autosubmit/history/data_classes/__init__.py create mode 100644 autosubmit/history/data_classes/experiment_run.py create mode 100644 autosubmit/history/data_classes/job_data.py create mode 100644 autosubmit/history/database_managers/__init__.py create mode 100644 autosubmit/history/database_managers/database_manager.py create mode 100644 autosubmit/history/database_managers/database_models.py create mode 100644 autosubmit/history/database_managers/experiment_history_db_manager.py create mode 100644 autosubmit/history/database_managers/experiment_status_db_manager.py create mode 100644 autosubmit/history/database_managers/tests.py create mode 100644 autosubmit/history/experiment_history.py create mode 100644 autosubmit/history/experiment_status.py create mode 100644 autosubmit/history/tests.py create mode 100644 autosubmit/history/utils.py diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 279ab3255..11cb1efbc 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -71,8 +71,8 @@ import portalocker from pkg_resources import require, resource_listdir, resource_exists, resource_string from collections import defaultdict from pyparsing import nestedExpr -from database.db_jobdata import ExperimentStatus, JobDataStructure - +from database.db_jobdata import JobDataStructure +from history.experiment_status import ExperimentStatus """ Main module for autosubmit. 
Only contains an interface class to all functionality implemented on autosubmit """ @@ -1553,7 +1553,7 @@ class Autosubmit: job_data_structure.validate_current_run(job_list.get_job_list( ), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) - ExperimentStatus(expid).update_running_status() + ExperimentStatus(expid).set_running() except Exception as e: raise AutosubmitCritical( "Error while processing job_data_structure", 7067, str(e)) diff --git a/autosubmit/database/db_jobdata.py b/autosubmit/database/db_jobdata.py index 9e17eb319..d9e9e8d4f 100644 --- a/autosubmit/database/db_jobdata.py +++ b/autosubmit/database/db_jobdata.py @@ -2042,7 +2042,6 @@ class JobDataStructure(MainDataBase): def _get_job_data(self, job_name): """ Returns rows belonging to a job_name - """ try: if self.conn: diff --git a/autosubmit/history/__init__.py b/autosubmit/history/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autosubmit/history/data_classes/__init__.py b/autosubmit/history/data_classes/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autosubmit/history/data_classes/experiment_run.py b/autosubmit/history/data_classes/experiment_run.py new file mode 100644 index 000000000..9aaa107c3 --- /dev/null +++ b/autosubmit/history/data_classes/experiment_run.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . 
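+
+# A minimal usage sketch of the class defined below (illustrative, assuming a
+# Models.ExperimentRunRow named tuple `row` already fetched by a database
+# manager):
+#
+#   run = ExperimentRun.from_model(row)
+#   print(run.run_id, run.chunk_unit, run.completed)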
+ +from autosubmit.history.utils import get_current_datetime_if_none + +class ExperimentRun(): + """ + Class that represents an experiment run + """ + def __init__(self, run_id, created=None, modified=None, start=0, finish=0, chunk_unit="NA", chunk_size=0, completed=0, total=0, failed=0, queuing=0, running=0, submitted=0, suspended=0, metadata=""): + self.run_id = run_id + self.created = get_current_datetime_if_none(created) + self.modified = get_current_datetime_if_none(modified) # Added on DB 16 + self.start = start + self.finish = finish + self.chunk_unit = chunk_unit + self.chunk_size = chunk_size + self.submitted = submitted + self.queuing = queuing + self.running = running + self.completed = completed + self.failed = failed + self.total = total + self.suspended = suspended + self.metadata = metadata + + @classmethod + def from_model(cls, row): + """ Build ExperimentRun from ExperimentRunRow """ + experiment_run = cls(0) + experiment_run.run_id = row.run_id + experiment_run.created = get_current_datetime_if_none(row.created) + experiment_run.modified = get_current_datetime_if_none(row.modified) + experiment_run.start = row.start + experiment_run.finish = row.finish + experiment_run.chunk_unit = row.chunk_unit + experiment_run.chunk_size = row.chunk_size + experiment_run.completed = row.completed + experiment_run.total = row.total + experiment_run.failed = row.failed + experiment_run.queuing = row.queuing + experiment_run.running = row.running + experiment_run.submitted = row.submitted + experiment_run.suspended = row.suspended + experiment_run.metadata = row.metadata + return experiment_run + \ No newline at end of file diff --git a/autosubmit/history/data_classes/job_data.py b/autosubmit/history/data_classes/job_data.py new file mode 100644 index 000000000..ef1628865 --- /dev/null +++ b/autosubmit/history/data_classes/job_data.py @@ -0,0 +1,281 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . + +import collections +import time +import autosubmit.history.utils as HUtils +import autosubmit.history.database_managers.database_models as Models +from datetime import datetime, timedelta +from json import dumps, loads + +class JobData(object): + """ + Robust representation of a row in the job_data table of the experiment history database. 
+ """ + + def __init__(self, _id, counter=1, job_name="None", created=None, modified=None, submit=0, start=0, finish=0, status="UNKNOWN", rowtype=0, ncpus=0, wallclock="00:00", qos="debug", energy=0, date="", section="", member="", chunk=0, last=1, platform="NA", job_id=0, extra_data="", nnodes=0, run_id=None, MaxRSS=0.0, AveRSS=0.0, out="", err="", rowstatus=Models.RowStatus.INITIAL): + """ + """ + self._id = _id + self.counter = counter + self.job_name = job_name + self.created = HUtils.get_current_datetime_if_none(created) + self.modified = HUtils.get_current_datetime_if_none(modified) + self._submit = int(submit) + self._start = int(start) + self._finish = int(finish) + self.status = status + self.rowtype = rowtype + self.ncpus = ncpus + self.wallclock = wallclock + self.qos = qos if qos else "debug" + self._energy = energy if energy else 0 + self.date = date if date else "" + self.section = section if section else "" + self.member = member if member else "" + self.chunk = chunk if chunk else 0 + self.last = last + self._platform = platform if platform and len( + platform) > 0 else "NA" + self.job_id = job_id if job_id else 0 + try: + self.extra_data_parsed = loads(extra_data) + except Exception as exp: + self.extra_data_parsed = {} # Fail fast + self.extra_data = extra_data + self.nnodes = nnodes + self.run_id = run_id + self.require_update = False + # DB VERSION 15 attributes + self.MaxRSS = MaxRSS + self.AveRSS = AveRSS + self.out = out + self.err = err + self.rowstatus = rowstatus + + @classmethod + def from_model(cls, row): + """ Build JobData from JobDataRow. """ + job_data = cls(row.id, + row.counter, + row.job_name, + row.created, + row.modified, + row.submit, + row.start, + row.finish, + row.status, + row.rowtype, + row.ncpus, + row.wallclock, + row.qos, + row.energy, + row.date, + row.section, + row.member, + row.chunk, + row.last, + row.platform, + row.job_id, + row.extra_data, + row.nnodes, + row.run_id, + row.MaxRSS, + row.AveRSS, + row.out, + row.err, + row.rowstatus) + return job_data + + + @property + def submit(self): + """ + Returns the submit time timestamp as an integer. + """ + return int(self._submit) + + @property + def start(self): + """ + Returns the start time timestamp as an integer. + """ + return int(self._start) + + @property + def finish(self): + """ + Returns the finish time timestamp as an integer. + """ + return int(self._finish) + + @property + def platform(self): + """ + Returns the name of the platform, "NA" if no platform is set. + """ + return self._platform + + @property + def energy(self): + """ + Returns the energy spent value (JOULES) as an integer. + """ + return self._energy + + @submit.setter + def submit(self, submit): + self._submit = int(submit) + + @start.setter + def start(self, start): + self._start = int(start) + + @finish.setter + def finish(self, finish): + self._finish = int(finish) + + @platform.setter + def platform(self, platform): + self._platform = platform if platform and len(platform) > 0 else "NA" + + @energy.setter + def energy(self, energy): + """ + Set the energy value. If it is different than the current energy value, a update flag will be activated. + """ + if energy > 0: + if (energy != self._energy): + # print("Updating energy to {0} from {1}.".format( + # energy, self._energy)) + self.require_update = True + self._energy = energy if energy else 0 + + def delta_queue_time(self): + """ + Returns queuing time as a timedelta object. 
+ """ + return str(timedelta(seconds=self.queuing_time())) + + def delta_running_time(self): + """ + Returns running time as a timedelta object. + """ + return str(timedelta(seconds=self.running_time())) + + def submit_datetime(self): + """ + Return the submit time as a datetime object, None if submit time equal 0. + """ + if self.submit > 0: + return datetime.fromtimestamp(self.submit) + return None + + def start_datetime(self): + """ + Return the start time as a datetime object, None if start time equal 0. + """ + if self.start > 0: + return datetime.fromtimestamp(self.start) + return None + + def finish_datetime(self): + """ + Return the finish time as a datetime object, None if start time equal 0. + """ + if self.finish > 0: + return datetime.fromtimestamp(self.finish) + return None + + def submit_datetime_str(self): + """ + Returns the submit datetime as a string with format %Y-%m-%d-%H:%M:%S + """ + o_datetime = self.submit_datetime() + if o_datetime: + return o_datetime.strftime(DATETIME_FORMAT) + else: + return None + + def start_datetime_str(self): + """ + Returns the start datetime as a string with format %Y-%m-%d-%H:%M:%S + """ + o_datetime = self.start_datetime() + if o_datetime: + return o_datetime.strftime(DATETIME_FORMAT) + else: + return None + + def finish_datetime_str(self): + """ + Returns the finish datetime as a string with format %Y-%m-%d-%H:%M:%S + """ + o_datetime = self.finish_datetime() + if o_datetime: + return o_datetime.strftime(DATETIME_FORMAT) + else: + return None + + def running_time(self): + """ + Calculates and returns the running time of the job, in seconds. + + :return: Running time in seconds. + :rtype: int + """ + if self.status in ["RUNNING", "COMPLETED", "FAILED"]: + return HUtils.calculate_run_time_in_seconds(self.start, self.finish) + return 0 + + def queuing_time(self): + """ + Calculates and returns the queuing time of the job, in seconds. + + :return: Queueing time in seconds. + :rtype: int + """ + if self.status in ["SUBMITTED", "QUEUING", "RUNNING", "COMPLETED", "HELD", "PREPARED", "FAILED", "SKIPPED"]: + return HUtils.calculate_queue_time_in_seconds(self.submit, self.start) + return 0 + + def get_hdata(self): + """ + Get the job data as an ordered dict into a JSON object. + :return: Job data as an ordered dict into a JSON object. + :rtype: JSON object. 
+ """ + hdata = collections.OrderedDict() + hdata["name"] = self.job_name + hdata["date"] = self.date + hdata["section"] = self.section + hdata["member"] = self.member + hdata["chunk"] = self.chunk + hdata["submit"] = self.submit_datetime_str() + hdata["start"] = self.start_datetime_str() + hdata["finish"] = self.finish_datetime_str() + hdata["queue_time"] = self.delta_queue_time() + hdata["run_time"] = self.delta_running_time() + hdata["wallclock"] = self.wallclock + hdata["ncpus"] = self.ncpus + hdata["nnodes"] = self.nnodes + hdata["energy"] = self.energy + hdata["platform"] = self.platform + hdata["MaxRSS"] = self.MaxRSS + hdata["AveRSS"] = self.AveRSS + return dumps(hdata) \ No newline at end of file diff --git a/autosubmit/history/database_managers/__init__.py b/autosubmit/history/database_managers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autosubmit/history/database_managers/database_manager.py b/autosubmit/history/database_managers/database_manager.py new file mode 100644 index 000000000..57dee59c0 --- /dev/null +++ b/autosubmit/history/database_managers/database_manager.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . + +import sqlite3 +import os +import traceback +import autosubmit.history.utils as HUtils +import autosubmit.history.database_managers.database_models as Models +from abc import ABCMeta, abstractmethod +from log.log import Log, AutosubmitCritical, AutosubmitError +from autosubmit.config.basicConfig import BasicConfig + + +class DatabaseManager(): + """ Simple database manager. Needs expid. """ + __metaclass__ = ABCMeta + AS_TIMES_DB_NAME = "as_times.db" # default AS_TIMES location + ECEARTH_DB_NAME = "ecearth.db" # default EC_EARTH_DB_NAME location + + def __init__(self, expid): + self.expid = expid + self._basic_configuration = BasicConfig + self._basic_configuration.read() + + def get_connection(self, path): + # type : (str) -> Sqlite3Connection + """ + Create a database connection to the SQLite database specified by path. + :param path: database file name + :return: Connection object or None + """ + if not os.path.exists(path): + self._create_database_file(path) + return sqlite3.connect(path) + + def _create_database_file(self, path): + # type : (str) -> None + """ creates a database files with full permissions """ + os.umask(0) + os.open(path, os.O_WRONLY | os.O_CREAT, 0o777) + + def execute_statement_on_dbfile(self, path, statement): + # type : (str, str) -> None + """ Executes a statement on a database file specified by path. 
""" + conn = self.get_connection(path) + cursor = conn.cursor() + cursor.execute(statement) + conn.commit() + conn.close() + + def execute_statement_with_arguments_on_dbfile(self, path, statement, arguments): + # type : (str, str, Tuple) -> None + """ Executes an statement with arguments on a database file specified by path. """ + conn = self.get_connection(path) + cursor = conn.cursor() + cursor.execute(statement, arguments) + conn.commit() + conn.close() + + def execute_many_statement_with_arguments_on_dbfile(self, path, statement, arguments_list): + # type : (str, str, List[Tuple]) -> None + """ Executes many statements from a list of arguments specified by a path. """ + conn = self.get_connection(path) + cursor = conn.cursor() + cursor.executemany(statement, arguments_list) + conn.commit() + conn.close() + + def execute_many_statements_on_dbfile(self, path, statements): + # type : (str, List[str]) -> None + """ + Updates the table schema using a **small** list of statements. No Exception raised. + Can be used to execute a list of schema updates that might have been already applied. + """ + for statement in statements: + try: + self.execute_statement_on_dbfile(path, statement) + except Exception as exp: + Log.info(traceback.format_exc()) + Log.debug(str(exp)) + Log.warning("Error on updating {0}. Statement: {1}. You can ignore this message.".format(path, statement)) + + def get_from_statement(self, path, statement): + # type : (str, str) -> List[Tuple] + """ Get the rows from a statement with no arguments """ + conn = self.get_connection(path) + conn.text_factory = str + cursor = conn.cursor() + cursor.execute(statement) + statement_rows = cursor.fetchall() + conn.close() + return statement_rows + + def get_from_statement_with_arguments(self, path, statement, arguments): + # type : (str, str, Tuple) -> List[Tuple] + """ Get the rows from a statement with arguments """ + conn = self.get_connection(path) + conn.text_factory = str + cursor = conn.cursor() + cursor.execute(statement, arguments) + statement_rows = cursor.fetchall() + conn.close() + return statement_rows + + def insert_statement(self, path, statement): + # type : (str, str) -> int + """ Insert statement into path """ + conn = self.get_connection(path) + conn.text_factory = str + cursor = conn.cursor() + cursor.execute(statement) + lastrow_id = cursor.lastrowid + conn.commit() + conn.close() + return lastrow_id + + def insert_statement_with_arguments(self, path, statement, arguments): + # type : (str, str, Tuple) -> int + """ Insert statement with arguments into path """ + conn = self.get_connection(path) + conn.text_factory = str + cursor = conn.cursor() + cursor.execute(statement, arguments) + lastrow_id = cursor.lastrowid + conn.commit() + conn.close() + return lastrow_id + + def get_built_select_statement(self, table_name, conditions=None): + # type : (str, namedtuple, str) -> str + """ Build and return a SELECT statement with the same fields as the model. Requires that the table is associated with a model (namedtuple). 
""" + model = Models.table_name_to_model[table_name] + if conditions: + return "SELECT {0} FROM {1} WHERE {2}".format(HUtils.get_fields_as_comma_str(model), table_name, conditions) + else: + return "SELECT {0} FROM {1}".format(HUtils.get_fields_as_comma_str(model), table_name) + + diff --git a/autosubmit/history/database_managers/database_models.py b/autosubmit/history/database_managers/database_models.py new file mode 100644 index 000000000..c017e5d98 --- /dev/null +++ b/autosubmit/history/database_managers/database_models.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . + +import collections + +JobDataRow = collections.namedtuple('JobDataRow', ['id', 'counter', 'job_name', 'created', 'modified', 'submit', 'start', 'finish', 'status', 'rowtype', 'ncpus', 'wallclock', 'qos', 'energy', 'date', 'section', 'member', 'chunk', 'last', 'platform', 'job_id', 'extra_data', 'nnodes', 'run_id', 'MaxRSS', 'AveRSS', 'out', 'err', 'rowstatus']) + +ExperimentRunRow = collections.namedtuple('ExperimentRunRow', [ + 'run_id', 'created', 'modified', 'start', 'finish', 'chunk_unit', 'chunk_size', 'completed', 'total', 'failed', 'queuing', 'running', 'submitted', 'suspended', 'metadata']) + +ExperimentStatusRow = collections.namedtuple( + 'ExperimentStatusRow', ['exp_id', 'name', 'status', 'seconds_diff', 'modified']) + +ExperimentRow = collections.namedtuple('ExperimentRow', ["id", "name", "autosubmit_version", "description"]) + +PragmaVersion = collections.namedtuple('PragmaVersion', ['version']) + +class RunningStatus: + RUNNING = "RUNNING" + NOT_RUNNING = "NOT RUNNING" + +class RowType: + NORMAL = 2 + #PACKED = 2 + +class RowStatus: + INITIAL = 0 + COMPLETED = 1 + PROCESSED = 2 + FAULTY = 3 + CHANGED = 4 + +table_name_to_model = { + "experiment" : ExperimentRow, + "experiment_status" : ExperimentStatusRow, + "job_data" : JobDataRow, + "experiment_run" : ExperimentRunRow, + "pragma_version" : PragmaVersion +} + diff --git a/autosubmit/history/database_managers/experiment_history_db_manager.py b/autosubmit/history/database_managers/experiment_history_db_manager.py new file mode 100644 index 000000000..66ee1aabb --- /dev/null +++ b/autosubmit/history/database_managers/experiment_history_db_manager.py @@ -0,0 +1,283 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+ +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . +import sqlite3 +import os +import traceback +import textwrap +import autosubmit.history.utils as HUtils +import database_models as Models +from abc import ABCMeta, abstractmethod +from log.log import Log, AutosubmitCritical, AutosubmitError +from database_manager import DatabaseManager +from datetime import datetime + +CURRENT_DB_VERSION = 16 +DB_EXPERIMENT_HEADER_SCHEMA_CHANGES = 14 +DB_VERSION_SCHEMA_CHANGES = 12 +DEFAULT_DB_VERSION = 10 + +class ExperimentHistoryDatabaseManager(DatabaseManager): + """ Manages actions directly on the database. + """ + def __init__(self, expid): + super(ExperimentHistoryDatabaseManager, self).__init__(expid) + self.db_version = DEFAULT_DB_VERSION # type : int + self._set_schema_changes() + self._set_table_queries() + self.historicaldb_file_path = os.path.join(self._basic_configuration.JOBDATA_DIR, "job_data_{0}.db".format(self.expid)) # type : str + + def _set_table_queries(self): + """ Sets basic table queries. """ + self.create_table_header_query = textwrap.dedent( + '''CREATE TABLE + IF NOT EXISTS experiment_run ( + run_id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, + created TEXT NOT NULL, + modified TEXT NOT NULL, + start INTEGER NOT NULL, + finish INTEGER, + chunk_unit TEXT NOT NULL, + chunk_size INTEGER NOT NULL, + completed INTEGER NOT NULL, + total INTEGER NOT NULL, + failed INTEGER NOT NULL, + queuing INTEGER NOT NULL, + running INTEGER NOT NULL, + submitted INTEGER NOT NULL, + suspended INTEGER NOT NULL DEFAULT 0, + metadata TEXT + ); + ''') + self.create_table_query = textwrap.dedent( + '''CREATE TABLE + IF NOT EXISTS job_data ( + id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, + counter INTEGER NOT NULL, + job_name TEXT NOT NULL, + created TEXT NOT NULL, + modified TEXT NOT NULL, + submit INTEGER NOT NULL, + start INTEGER NOT NULL, + finish INTEGER NOT NULL, + status TEXT NOT NULL, + rowtype INTEGER NOT NULL, + ncpus INTEGER NOT NULL, + wallclock TEXT NOT NULL, + qos TEXT NOT NULL, + energy INTEGER NOT NULL, + date TEXT NOT NULL, + section TEXT NOT NULL, + member TEXT NOT NULL, + chunk INTEGER NOT NULL, + last INTEGER NOT NULL, + platform TEXT NOT NULL, + job_id INTEGER NOT NULL, + extra_data TEXT NOT NULL, + nnodes INTEGER NOT NULL DEFAULT 0, + run_id INTEGER, + MaxRSS REAL NOT NULL DEFAULT 0.0, + AveRSS REAL NOT NULL DEFAULT 0.0, + out TEXT NOT NULL, + err TEXT NOT NULL, + rowstatus INTEGER NOT NULL DEFAULT 0, + UNIQUE(counter,job_name) + ); + ''') + self.create_index_query = textwrap.dedent(''' + CREATE INDEX IF NOT EXISTS ID_JOB_NAME ON job_data(job_name); + ''') + + def _set_schema_changes(self): + # type : () -> None + """ Creates the list of schema changes""" + self.version_schema_changes = [ + "ALTER TABLE job_data ADD COLUMN nnodes INTEGER NOT NULL DEFAULT 0", + "ALTER TABLE job_data ADD COLUMN run_id INTEGER" + ] + # Version 15 + self.version_schema_changes.extend([ + "ALTER TABLE job_data ADD COLUMN MaxRSS REAL NOT NULL DEFAULT 0.0", + "ALTER TABLE job_data ADD COLUMN AveRSS REAL NOT NULL DEFAULT 0.0", + "ALTER TABLE job_data ADD COLUMN out TEXT NOT NULL DEFAULT ''", + "ALTER TABLE job_data ADD COLUMN err TEXT NOT NULL DEFAULT ''", + "ALTER TABLE job_data ADD COLUMN rowstatus INTEGER NOT NULL DEFAULT 0", + "ALTER TABLE experiment_run ADD COLUMN suspended INTEGER NOT NULL DEFAULT 0", + "ALTER TABLE experiment_run ADD COLUMN metadata TEXT" + ]) + # Version 16 + self.version_schema_changes.extend([ + "ALTER TABLE 
experiment_run ADD COLUMN modified TEXT" + ]) + + def create_historical_database(self): + """ Creates the historical database with the latest changes. """ + self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_table_header_query) + self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_table_query) + self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_index_query) + self._set_historical_pragma_version(CURRENT_DB_VERSION) + self.db_version = CURRENT_DB_VERSION + + def update_historical_database(self): + """ Updates the historical database with the latest changes. """ + self.execute_many_statements_on_dbfile(self.historicaldb_file_path, self.version_schema_changes) + self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_index_query) + self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_table_header_query) + self._set_historical_pragma_version(CURRENT_DB_VERSION) + self.db_version = CURRENT_DB_VERSION + + def get_experiment_run_with_max_id(self): + """ Get Models.ExperimentRunRow for the maximum id run. """ + statement = self.get_built_select_statement("experiment_run", "run_id > 0 ORDER BY run_id DESC LIMIT 0, 1") + max_experiment_run = self.get_from_statement(self.historicaldb_file_path, statement) + if len(max_experiment_run) <= 0: + raise Exception("Error on experiment run retrieval") + return Models.ExperimentRunRow(*max_experiment_run[0]) + + def get_job_data_all(self): + """ Gets List of Models.JobDataRow from database. """ + statement = self.get_built_select_statement("job_data") + job_data_rows = self.get_from_statement(self.historicaldb_file_path, statement) + return [Models.JobDataRow(*row) for row in job_data_rows] + + def update_job_data_last(self, job_data): + """ + Updates job_data table with data class JobData. + Updates last = 0, modified by id + """ + statement = ''' UPDATE job_data SET last=0, modified = ? WHERE id = ?''' + arguments = (HUtils.get_current_datetime(), job_data._id) + self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) + + def update_job_data_start(self, job_data): + """ + Updates job_data table with data class JobData. + Updates start, modified, job_id, status, rowtype by id. + """ + statement = ''' UPDATE job_data SET start=?, modified=?, job_id=?, status=?, rowtype=? WHERE id=? ''' + arguments = (int(job_data.start), HUtils.get_current_datetime(), job_data.job_id, job_data.status, job_data.rowtype, job_data._id) + self.execute_many_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) + + def update_job_data_finish_plus(self, job_data): + """ + Update job_data table with data class JobData. + Updates submit, start, finish, modified, job_id, status, energy, extra_data, nnodes, ncpus, rowstatus, out, err by id. + """ + statement = ''' UPDATE job_data SET submit=?, start=?, finish=?, modified=?, job_id=?, status=?, energy=?, extra_data=?, nnodes=?, ncpus=?, rowstatus=?, out=?, err=? WHERE id=? 
''' + arguments = (job_data.submit, job_data.start, job_data.finish, HUtils.get_current_datetime(), job_data.job_id,job_data.status, job_data.energy, job_data.extra_data, job_data.nnodes, job_data.ncpus, Models.RowStatus.COMPLETED, job_data.out, job_data.err, job_data._id) + self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) + + def update_many_job_data_change_status(self, changes): + # type : (List[Tuple]) -> None + """ Update many job_data rows in bulk. Requires a changes list of argument tuples. """ + statement = ''' UPDATE job_data SET finish=?, modified=?, status=?, rowstatus=? WHERE id=? ''' + self.execute_many_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, changes) + + def update_job_data_finish(self, job_data): + """ + Update job_data table with data class JobData. + Update finish, modified, job_id, status, energy, extra_data, nnodes, ncpus, rowstatus, out, err by id. + """ + statement = ''' UPDATE job_data SET finish=?, modified=?, job_id=?, status=?, energy=?, extra_data=?, nnodes=?, ncpus=?, rowstatus=?, out=?, err=? WHERE id=? ''' + arguments = (job_data.finish, HUtils.get_current_datetime(), job_data.job_id, job_data.status, job_data.energy, job_data.extra_data, job_data.nnodes, job_data.ncpus, Models.RowStatus.COMPLETED, job_data.out, job_data.err, job_data._id) + self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) + + def update_job_data_processed(self, job_data): + """ + Update job_data table with data class JobData. + Updates energy, modified, MaxRSS, AveRSS, rowstatus by + """ + statement = ''' UPDATE job_data SET energy=?, modified=?, MaxRSS=?, AveRSS=?, rowstatus=? WHERE id=? ''' + arguments = (job_data.energy, HUtils.get_current_datetime(), job_data.MaxRSS, job_data.AveRSS, Models.RowStatus.PROCESSED, job_data._id) + self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) + + def update_experiment_run(self, experiment_run): + """ + Update experiment_run table with data class ExperimentRun. + Updates by run_id (finish, chunk_unit, chunk_size, completed, total, failed, queuing, running, submitted, suspended) + """ + statement = ''' UPDATE experiment_run SET finish=?, chunk_unit=?, chunk_size=?, completed=?, total=?, failed=?, queuing=?, running=?, submitted=?, suspended=?, modified=? WHERE run_id=? ''' + arguments = (experiment_run.finish, experiment_run.chunk_unit, experiment_run.chunk_size, experiment_run.completed, experiment_run.total, experiment_run.failed, experiment_run.queuing, experiment_run.running, experiment_run.submitted, experiment_run.suspended, HUtils.get_current_datetime(), experiment_run.run_id) + self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) + + def insert_job_data(self, job_data): + # type : (JobData) -> int + """ Insert data class JobData into database """ + statement = ''' INSERT INTO job_data(counter, job_name, created, modified, submit, start, finish, status, rowtype, ncpus, wallclock, qos, energy, date, section, member, chunk, last, platform, job_id, extra_data, nnodes, run_id, MaxRSS, AveRSS, out, err, rowstatus) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) 
''' + arguments = (job_data.counter, job_data.job_name, HUtils.get_current_datetime(), HUtils.get_current_datetime(), job_data.submit, job_data.start, job_data.finish, job_data.status, job_data.rowtype, job_data.ncpus, job_data.wallclock, job_data.qos, job_data.energy, job_data.date, job_data.section, job_data.member, job_data.chunk, job_data.last, job_data.platform, job_data.job_id, job_data.extra_data, job_data.nnodes, job_data.run_id, job_data.MaxRSS, job_data.AveRSS, job_data.out, job_data.err, job_data.rowstatus) + return self.insert_statement_with_arguments(self.historicaldb_file_path, statement, arguments) + + def insert_experiment_run(self, experiment_run): + """ Insert data class ExperimentRun into database """ + statement = ''' INSERT INTO experiment_run(created, modified, start, finish, chunk_unit, chunk_size, completed, total, failed, queuing, running, submitted, suspended, metadata) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?) ''' + arguments = (HUtils.get_current_datetime(), HUtils.get_current_datetime(), experiment_run.start, experiment_run.finish, experiment_run.chunk_unit, experiment_run.chunk_size, experiment_run.completed, + experiment_run.total, experiment_run.failed, experiment_run.queuing, experiment_run.running, experiment_run.submitted, experiment_run.suspended, experiment_run.metadata) + return self.insert_statement_with_arguments(self.historicaldb_file_path, statement, arguments) + + def get_job_data_last_by_run_id_and_finished(self, run_id): + """ Get List of Models.JobDataRow for last=1, finished > 0 and run_id """ + statement = self.get_built_select_statement("job_data", "run_id=? and last=1 and finish > 0 and rowtype >= 2 ORDER BY id") + arguments = (run_id,) + job_data_rows = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) + return [Models.JobDataRow(*row) for row in job_data_rows] + + def get_job_data_last_by_run_id(self, run_id): + """ Get List of Models.JobDataRow for last=1 and run_id """ + statement = self.get_built_select_statement("job_data", "run_id=? and last=1 and rowtype >= 2 ORDER BY id") + arguments = (run_id,) + job_data_rows = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) + return [Models.JobDataRow(*row) for row in job_data_rows] + + def get_job_data_by_name(self, job_name): + """ Get List of Models.JobDataRow for job_name """ + statement = self.get_built_select_statement("job_data", "job_name=? ORDER BY counter DESC") + arguments = (job_name,) + job_data_rows = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) + return [Models.JobDataRow(*row) for row in job_data_rows] + + def get_job_data_last_by_name(self, job_name): + """ Get List of Models.JobDataRow for job_name and last=1 """ + statement = self.get_built_select_statement("job_data", "last=1 and job_name=? ORDER BY counter DESC") + arguments = (job_name,) + job_data_rows_last = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) + return [Models.JobDataRow(*row) for row in job_data_rows_last] + + def delete_job_data(self, _id): + """ Deletes row in job_data by id. Useful for testing. """ + statement = ''' DELETE FROM job_data WHERE id=? ''' + arguments = (_id, ) + self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) + + def delete_experiment_run(self, run_id): + """ Deletes row in experiment_run by run_id. Useful for testing. """ + statement = ''' DELETE FROM experiment_run where run_id=? 
''' + arguments = (run_id,) + self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) + + def _set_historical_pragma_version(self, version=10): + """ Sets the pragma version. """ + statement = "pragma user_version={v:d};".format(v=version) + self.execute_statement_on_dbfile(self.historicaldb_file_path, statement) + + def _get_pragma_version(self): + """ Gets current pragma version """ + statement = "pragma user_version;" + pragma_result = self.get_from_statement(self.historicaldb_file_path, statement) + if len(pragma_result) <= 0: + raise Exception("Error while getting the pragma version. This might be a signal of a deeper problem. Review previous errors.") + return Models.PragmaVersion(*pragma_result[0]).version + diff --git a/autosubmit/history/database_managers/experiment_status_db_manager.py b/autosubmit/history/database_managers/experiment_status_db_manager.py new file mode 100644 index 000000000..2d4ac7127 --- /dev/null +++ b/autosubmit/history/database_managers/experiment_status_db_manager.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python + + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . 
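+
+# A minimal usage sketch, mirroring the commented-out driver at the bottom of
+# this file; it assumes the given expid already exists as an experiment row in
+# ecearth.db:
+#
+#   manager = ExperimentStatusDbManager("a2h6")
+#   manager.set_experiment_as_running()
+#   manager.print_current_table()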
+ +import os +import sqlite3 +import traceback +import textwrap +import time +from database_manager import DatabaseManager +import autosubmit.history.utils as HUtils +import database_models as Models + +class ExperimentStatusDbManager(DatabaseManager): + """ Manages the actions on the status database """ + def __init__(self, expid): + super(ExperimentStatusDbManager, self).__init__(expid) + self._as_times_file_path = os.path.join(self._basic_configuration.LOCAL_ROOT_DIR, self.AS_TIMES_DB_NAME) + self._ecearth_file_path = os.path.join(self._basic_configuration.LOCAL_ROOT_DIR, self.ECEARTH_DB_NAME) + self._pkl_file_path = os.path.join(self._basic_configuration.LOCAL_ROOT_DIR, "pkl", "job_list_{0}.pkl".format(self.expid)) + self._validate_status_database() + self.current_experiment_row = self._get_current_experiment_row(self.expid) + self.current_experiment_status_row =self._get_current_experiment_status_row(self.current_experiment_row.id) + + def _validate_status_database(self): + """ Creates experiment_status table if it does not exist """ + create_table_query = textwrap.dedent( + '''CREATE TABLE + IF NOT EXISTS experiment_status ( + exp_id integer PRIMARY KEY, + name text NOT NULL, + status text NOT NULL, + seconds_diff integer NOT NULL, + modified text NOT NULL, + FOREIGN KEY (exp_id) REFERENCES experiment (id) + );''' + ) + self.execute_statement_on_dbfile(self._as_times_file_path, create_table_query) + + def print_current_table(self): + for experiment in self._get_experiment_status_content(): + print(experiment) + if self.current_experiment_status_row: + print("Current Row:\n\t" + self.current_experiment_status_row.name + "\n\t" + + str(self.current_experiment_status_row.exp_id) + "\n\t" + self.current_experiment_status_row.status) + + def is_running(self, time_condition=600): + # type : (int) -> bool + """ True if experiment is running, False otherwise. """ + if (os.path.exists(self._pkl_file_path)): + current_stat = os.stat(self._pkl_file_path) + timest = int(current_stat.st_mtime) + timesys = int(time.time()) + time_diff = int(timesys - timest) + if (time_diff < time_condition): + return True + else: + return False + return False + + def set_experiment_as_running(self, status="RUNNING"): + if self.current_experiment_status_row: + # Row exists + self._update_exp_status(status) + else: + # New Row + self._create_exp_status() + + def _get_current_experiment_row(self, expid): + # type : (str) -> Models.ExperimentRow + """ + Get the experiment from ecearth.db by expid as Models.ExperimentRow. 
+ """ + statement = self.get_built_select_statement("experiment", "name=?") + current_rows = self.get_from_statement_with_arguments(self._ecearth_file_path, statement, (expid,)) + if len(current_rows) <= 0: + raise ValueError("Experiment {0} not found in {1}".format(expid, self._ecearth_file_path)) + return Models.ExperimentRow(*current_rows[0]) + + def _get_experiment_status_content(self): + # type : () -> List[Models.ExperimentStatusRow] + """ + Get all registers from experiment_status as List of Models.ExperimentStatusRow.\n + """ + statement = self.get_built_select_statement("experiment_status") + current_rows = self.get_from_statement(self._as_times_file_path, statement) + return [Models.ExperimentStatusRow(*row) for row in current_rows] + + def _get_current_experiment_status_row(self, exp_id): + # type : (int) -> Models.ExperimentStatusRow + """ Get Models.ExperimentStatusRow from as_times.db by exp_id (int)""" + statement = self.get_built_select_statement("experiment_status", "exp_id=?") + arguments = (exp_id,) + current_rows = self.get_from_statement_with_arguments(self._as_times_file_path, statement, arguments) + if len(current_rows) <= 0: + return None + return Models.ExperimentStatusRow(*current_rows[0]) + + + def _create_exp_status(self): + # type : () -> None + """ + Create experiment status + """ + statement = ''' INSERT INTO experiment_status(exp_id, name, status, seconds_diff, modified) VALUES(?,?,?,?,?) ''' + arguments = (self.current_experiment_row.id, self.expid, Models.RunningStatus.RUNNING, 0, HUtils.get_current_datetime()) + return self.insert_statement_with_arguments(self._as_times_file_path, statement, arguments) + + def _update_exp_status(self, status="RUNNING"): + # type : (str) -> None + """ + Update status, seconds_diff, modified in experiment_status. + """ + statement = ''' UPDATE experiment_status SET status = ?, seconds_diff = ?, modified = ? WHERE name = ? ''' + arguments = (status, 0, HUtils.get_current_datetime(), self.current_experiment_row.name) + self.execute_statement_with_arguments_on_dbfile(self._as_times_file_path, statement, arguments) + + +# if __name__ == "__main__": +# exp = ExperimentStatusDbManager("a2h6") +# exp.set_experiment_as_running() +# exp.print_current_table() \ No newline at end of file diff --git a/autosubmit/history/database_managers/tests.py b/autosubmit/history/database_managers/tests.py new file mode 100644 index 000000000..0e7977805 --- /dev/null +++ b/autosubmit/history/database_managers/tests.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . 
+ +import unittest +import time +import random +from experiment_history_db_manager import ExperimentHistoryDatabaseManager +from autosubmit.history.data_classes.experiment_run import ExperimentRun +from autosubmit.history.data_classes.job_data import JobData +EXPID = "tt00" + +class TestExperimentHistoryDatabaseManager(unittest.TestCase): + + def setUp(self): + self.experiment_database = ExperimentHistoryDatabaseManager(EXPID) + + def tearDown(self): + pass + + def test_get_max_id(self): + max_item = self.experiment_database.get_experiment_run_with_max_id() + self.assertTrue(len(max_item) > 0) + self.assertTrue(max_item.run_id >= 18) # Max is 18 + + def test_pragma(self): + self.assertTrue(self.experiment_database._get_pragma_version() == 16) + + def test_get_job_data(self): + job_data = self.experiment_database.get_job_data_last_by_name("a29z_20000101_fc0_1_SIM") + self.assertTrue(len(job_data) > 0) + self.assertTrue(job_data[0].last == 1) + self.assertTrue(job_data[0].job_name == "a29z_20000101_fc0_1_SIM") + + job_data = self.experiment_database.get_job_data_by_name("a29z_20000101_fc0_1_SIM") + self.assertTrue(job_data[0].job_name == "a29z_20000101_fc0_1_SIM") + + job_data = self.experiment_database.get_job_data_last_by_run_id(18) # Latest + self.assertTrue(len(job_data) > 0) + + job_data = self.experiment_database.get_job_data_last_by_run_id_and_finished(18) + self.assertTrue(len(job_data) > 0) + + job_data = self.experiment_database.get_job_data_all() + self.assertTrue(len(job_data) > 0) + + def test_insert_and_delete_experiment_run(self): + new_run = ExperimentRun(19) + new_id = self.experiment_database.insert_experiment_run(new_run) + self.assertIsNotNone(new_id) + last_experiment_run = self.experiment_database.get_experiment_run_with_max_id() + self.assertTrue(new_id == last_experiment_run.run_id) + self.experiment_database.delete_experiment_run(new_id) + last_experiment_run = self.experiment_database.get_experiment_run_with_max_id() + self.assertTrue(new_id != last_experiment_run.run_id) + + def test_insert_and_delete_job_data(self): + max_run_id = self.experiment_database.get_experiment_run_with_max_id().run_id + new_job_data_name = "test_001_name_{0}".format(int(time.time())) + new_job_data = JobData(_id=1, job_name=new_job_data_name, run_id=max_run_id) + new_job_data_id = self.experiment_database.insert_job_data(new_job_data) + self.assertIsNotNone(new_job_data_id) + self.experiment_database.delete_job_data(new_job_data_id) + job_data = self.experiment_database.get_job_data_by_name(new_job_data_name) + self.assertTrue(len(job_data) == 0) + + + def test_update_experiment_run(self): + last_experiment_run = self.experiment_database.get_experiment_run_with_max_id() # 18 + experiment_run_data_class = ExperimentRun.from_model(last_experiment_run) + backup_run = ExperimentRun.from_model(last_experiment_run) + experiment_run_data_class.chunk_unit = "unouno" + experiment_run_data_class.running = random.randint(1, 100) + experiment_run_data_class.queuing = random.randint(1, 100) + experiment_run_data_class.suspended = random.randint(1, 100) + self.experiment_database.update_experiment_run(experiment_run_data_class) + last_experiment_run = self.experiment_database.get_experiment_run_with_max_id() # 18 + self.assertTrue(last_experiment_run.chunk_unit == experiment_run_data_class.chunk_unit) + self.assertTrue(last_experiment_run.running == experiment_run_data_class.running) + self.assertTrue(last_experiment_run.queuing == experiment_run_data_class.queuing) + 
self.assertTrue(last_experiment_run.suspended == experiment_run_data_class.suspended) + self.experiment_database.update_experiment_run(backup_run) + + def test_job_data_from_model(self): + job_data_rows = self.experiment_database.get_job_data_last_by_name("a29z_20000101_fc0_1_SIM") + job_data_row_first = job_data_rows[0] + job_data_data_class = JobData.from_model(job_data_row_first) + print(job_data_data_class.extra_data_parsed) + self.assertTrue(job_data_row_first.job_name == job_data_data_class.job_name) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py new file mode 100644 index 000000000..f1bfa1c59 --- /dev/null +++ b/autosubmit/history/experiment_history.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . +from database_managers.experiment_history_db_manager import ExperimentHistoryDatabaseManager + +class ExperimentHistory(): + def __init__(self, expid): + self.expid = expid + self.manager = ExperimentHistoryDatabaseManager(self.expid) + + def get_all_job_data_row(self): + return self.manager.get_job_data_all() + +# if __name__ == "__main__": +# exp = ExperimentHistory("tt00") +# for job_data_row in exp.get_all_job_data_row(): +# print(job_data_row) \ No newline at end of file diff --git a/autosubmit/history/experiment_status.py b/autosubmit/history/experiment_status.py new file mode 100644 index 000000000..871a76071 --- /dev/null +++ b/autosubmit/history/experiment_status.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . 
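+
+# A minimal usage sketch, mirroring the call site in autosubmit.py from the
+# first patch in this series. Construction failures are logged and swallowed,
+# so the call degrades to a log message instead of raising:
+#
+#   ExperimentStatus(expid).set_running()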
+ +import traceback +from experiment_status_db_manager import ExperimentStatusDbManager +from log.log import Log + +class ExperimentStatus(): + """ Represents the Experiment Status Mechanism that keeps track of currently active experiments """ + def __init__(self, expid): + # type : (str) -> None + self.expid = expid # type : str + try: + self._manager = ExperimentStatusDbManager(self.expid) + except Exception as exp: + Log.warning("Error while trying to update {0} in experiment_status.".format(str(self.expid))) + Log.debug(traceback.format_exc()) + Log.info(traceback.format_exc()) + self._manager = None + + def set_running(self): + # type : () -> None + """ Set the status of the experiment in experiment_status of as_times.db as RUNNING. Creates the database, table and row if necessary.""" + if self._manager: + self._manager.set_experiment_as_running() + else: + Log.info("It's not possible to set the experiment as RUNNING in this moment. If it is not automatically set as RUNNING in a few minutes, look for previous errors.") \ No newline at end of file diff --git a/autosubmit/history/tests.py b/autosubmit/history/tests.py new file mode 100644 index 000000000..5d39c6c79 --- /dev/null +++ b/autosubmit/history/tests.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . + +import unittest +from experiment_history import ExperimentHistory + +class TestExperimentHistory(unittest.TestCase): + # @classmethod + # def setUpClass(cls): + # cls.exp = ExperimentHistory("tt00") # example database + def test_select_job_data_by_run_id(self): + result = ExperimentHistory("tt00").manager.get_job_data_last_by_run_id(17) + print(result) + self.assertIsNotNone(result) + + def test_get_all_job_data(self): + result = ExperimentHistory("tt00").get_all_job_data_row() + print(result) + self.assertTrue(result) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/autosubmit/history/utils.py b/autosubmit/history/utils.py new file mode 100644 index 000000000..314d7a0da --- /dev/null +++ b/autosubmit/history/utils.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS + +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . 
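+
+# Quick illustrations of the helpers below (values chosen for the example):
+#
+#   calculate_queue_time_in_seconds(100, 160)   # -> 60
+#   calculate_run_time_in_seconds(160, 100)     # -> 0, negative spans are clamped
+#   get_current_datetime_if_none(None)          # -> current time as '%Y-%m-%d-%H:%M:%S'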
+
+import time
+from datetime import datetime
+
+DATETIME_FORMAT = '%Y-%m-%d-%H:%M:%S'
+
+def get_fields_as_comma_str(model):
+    """ Get the fields of a namedtuple as a comma-separated string. """
+    return ",".join(model._fields)
+
+def calculate_queue_time_in_seconds(submit_time, start_time):
+    # type : (float, float) -> int
+    """ Calculates queue time in seconds based on submit and start timestamps. """
+    if submit_time > 0 and start_time > 0 and (start_time - submit_time) > 0:
+        return int(start_time - submit_time)
+    return 0
+
+def calculate_run_time_in_seconds(start_time, finish_time):
+    # type : (float, float) -> int
+    """ Calculates run time in seconds based on start and finish timestamps. """
+    if finish_time > 0 and start_time > 0 and (finish_time - start_time) > 0:
+        return int(finish_time - start_time)
+    return 0
+
+def get_current_datetime():
+    # type : () -> str
+    """ Returns the current time in format '%Y-%m-%d-%H:%M:%S' """
+    return datetime.today().strftime(DATETIME_FORMAT)
+
+def get_current_datetime_if_none(argument):
+    # type : (Any) -> Union[Any, str]
+    """ Returns the current time in format '%Y-%m-%d-%H:%M:%S' if the supplied argument is None, else returns the argument. """
+    if argument is None:
+        return get_current_datetime()
+    else:
+        return argument
+
+# if __name__ == "__main__":
+#     print(get_fields_as_comma_str())
\ No newline at end of file
--
GitLab


From 17211e2ccb826b168a4544c311870f8b018f68c7 Mon Sep 17 00:00:00 2001
From: Wilmer Uruchi Ticona
Date: Thu, 7 Oct 2021 14:55:21 +0200
Subject: [PATCH 2/8] Added tests and logging

---
 autosubmit/autosubmit.py                      |  2 +-
 autosubmit/history/data_classes/job_data.py   |  6 +-
 .../database_managers/database_manager.py     | 14 ++-
 .../database_managers/database_models.py      |  1 +
 .../experiment_history_db_manager.py          | 84 ++++++----------
 .../experiment_status_db_manager.py           | 68 +++++++------
 autosubmit/history/database_managers/tests.py | 98 ++++++++++++++++++-
 autosubmit/history/experiment_history.py      | 54 ++++++++--
 autosubmit/history/experiment_status.py       | 29 +++---
 autosubmit/history/logging.py                 | 41 ++++++++
 autosubmit/history/tests.py                   | 32 ++++--
 autosubmit/history/utils.py                   |  9 ++
 autosubmit/job/job_utils.py                   | 27 +++++
 13 files changed, 337 insertions(+), 128 deletions(-)
 create mode 100644 autosubmit/history/logging.py

diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py
index 11cb1efbc..4714fea09 100644
--- a/autosubmit/autosubmit.py
+++ b/autosubmit/autosubmit.py
@@ -1553,7 +1553,7 @@ class Autosubmit:
                 job_data_structure.validate_current_run(job_list.get_job_list(
                 ), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json())
-                ExperimentStatus(expid).set_running()
+                ExperimentStatus(expid).set_as_running()
             except Exception as e:
                 raise AutosubmitCritical(
                     "Error while processing job_data_structure", 7067, str(e))
diff --git a/autosubmit/history/data_classes/job_data.py b/autosubmit/history/data_classes/job_data.py
index ef1628865..5f2cc782f 100644
--- a/autosubmit/history/data_classes/job_data.py
+++ b/autosubmit/history/data_classes/job_data.py
@@ -208,7 +208,7 @@ class JobData(object):
         """
         o_datetime = self.submit_datetime()
         if o_datetime:
-            return o_datetime.strftime(DATETIME_FORMAT)
+            return o_datetime.strftime(HUtils.DATETIME_FORMAT)
         else:
             return None

@@ -218,7 +218,7 @@ class JobData(object):
         """
         o_datetime = self.start_datetime()
         if o_datetime:
-            return o_datetime.strftime(DATETIME_FORMAT)
+            return o_datetime.strftime(HUtils.DATETIME_FORMAT)
         else:
             return None

@@
-228,7 +228,7 @@ class JobData(object): """ o_datetime = self.finish_datetime() if o_datetime: - return o_datetime.strftime(DATETIME_FORMAT) + return o_datetime.strftime(HUtils.DATETIME_FORMAT) else: return None diff --git a/autosubmit/history/database_managers/database_manager.py b/autosubmit/history/database_managers/database_manager.py index 57dee59c0..5f7ecb5ea 100644 --- a/autosubmit/history/database_managers/database_manager.py +++ b/autosubmit/history/database_managers/database_manager.py @@ -22,20 +22,18 @@ import traceback import autosubmit.history.utils as HUtils import autosubmit.history.database_managers.database_models as Models from abc import ABCMeta, abstractmethod -from log.log import Log, AutosubmitCritical, AutosubmitError -from autosubmit.config.basicConfig import BasicConfig - +DEFAULT_JOBDATA_DIR = os.path.join('/esarchive', 'autosubmit', 'as_metadata', 'data') +DEFAULT_LOCAL_ROOT_DIR = os.path.join('/esarchive', 'autosubmit') class DatabaseManager(): """ Simple database manager. Needs expid. """ __metaclass__ = ABCMeta AS_TIMES_DB_NAME = "as_times.db" # default AS_TIMES location ECEARTH_DB_NAME = "ecearth.db" # default EC_EARTH_DB_NAME location - - def __init__(self, expid): - self.expid = expid - self._basic_configuration = BasicConfig - self._basic_configuration.read() + def __init__(self, expid, jobdata_dir_path=DEFAULT_JOBDATA_DIR, local_root_dir_path=DEFAULT_LOCAL_ROOT_DIR): + self.expid = expid + self.JOBDATA_DIR = jobdata_dir_path + self.LOCAL_ROOT_DIR = local_root_dir_path def get_connection(self, path): # type : (str) -> Sqlite3Connection diff --git a/autosubmit/history/database_managers/database_models.py b/autosubmit/history/database_managers/database_models.py index c017e5d98..e9609ce09 100644 --- a/autosubmit/history/database_managers/database_models.py +++ b/autosubmit/history/database_managers/database_models.py @@ -29,6 +29,7 @@ ExperimentStatusRow = collections.namedtuple( ExperimentRow = collections.namedtuple('ExperimentRow', ["id", "name", "autosubmit_version", "description"]) PragmaVersion = collections.namedtuple('PragmaVersion', ['version']) +MaxCounterRow = collections.namedtuple('MaxCounter', ['maxcounter']) class RunningStatus: RUNNING = "RUNNING" diff --git a/autosubmit/history/database_managers/experiment_history_db_manager.py b/autosubmit/history/database_managers/experiment_history_db_manager.py index 66ee1aabb..62816ad8a 100644 --- a/autosubmit/history/database_managers/experiment_history_db_manager.py +++ b/autosubmit/history/database_managers/experiment_history_db_manager.py @@ -22,24 +22,25 @@ import textwrap import autosubmit.history.utils as HUtils import database_models as Models from abc import ABCMeta, abstractmethod -from log.log import Log, AutosubmitCritical, AutosubmitError -from database_manager import DatabaseManager +from database_manager import DatabaseManager, DEFAULT_JOBDATA_DIR from datetime import datetime CURRENT_DB_VERSION = 16 DB_EXPERIMENT_HEADER_SCHEMA_CHANGES = 14 DB_VERSION_SCHEMA_CHANGES = 12 DEFAULT_DB_VERSION = 10 +DEFAULT_MAX_COUNTER = 0 -class ExperimentHistoryDatabaseManager(DatabaseManager): +class ExperimentHistoryDbManager(DatabaseManager): """ Manages actions directly on the database. """ - def __init__(self, expid): - super(ExperimentHistoryDatabaseManager, self).__init__(expid) + def __init__(self, expid, jobdata_dir_path=DEFAULT_JOBDATA_DIR): + """ Requires expid and jobdata_dir_path. 
""" + super(ExperimentHistoryDbManager, self).__init__(expid, jobdata_dir_path=jobdata_dir_path) self.db_version = DEFAULT_DB_VERSION # type : int self._set_schema_changes() self._set_table_queries() - self.historicaldb_file_path = os.path.join(self._basic_configuration.JOBDATA_DIR, "job_data_{0}.db".format(self.expid)) # type : str + self.historicaldb_file_path = os.path.join(self.JOBDATA_DIR, "job_data_{0}.db".format(self.expid)) # type : str def _set_table_queries(self): """ Sets basic table queries. """ @@ -133,12 +134,13 @@ class ExperimentHistoryDatabaseManager(DatabaseManager): self.db_version = CURRENT_DB_VERSION def update_historical_database(self): - """ Updates the historical database with the latest changes. """ - self.execute_many_statements_on_dbfile(self.historicaldb_file_path, self.version_schema_changes) - self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_index_query) - self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_table_header_query) - self._set_historical_pragma_version(CURRENT_DB_VERSION) - self.db_version = CURRENT_DB_VERSION + """ Updates the historical database with the latest changes IF necessary. """ + if self._get_pragma_version() == CURRENT_DB_VERSION: + self.execute_many_statements_on_dbfile(self.historicaldb_file_path, self.version_schema_changes) + self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_index_query) + self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_table_header_query) + self._set_historical_pragma_version(CURRENT_DB_VERSION) + self.db_version = CURRENT_DB_VERSION def get_experiment_run_with_max_id(self): """ Get Models.ExperimentRunRow for the maximum id run. """ @@ -154,56 +156,24 @@ class ExperimentHistoryDatabaseManager(DatabaseManager): job_data_rows = self.get_from_statement(self.historicaldb_file_path, statement) return [Models.JobDataRow(*row) for row in job_data_rows] - def update_job_data_last(self, job_data): - """ - Updates job_data table with data class JobData. - Updates last = 0, modified by id - """ - statement = ''' UPDATE job_data SET last=0, modified = ? WHERE id = ?''' - arguments = (HUtils.get_current_datetime(), job_data._id) - self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) - - def update_job_data_start(self, job_data): - """ - Updates job_data table with data class JobData. - Updates start, modified, job_id, status, rowtype by id. - """ - statement = ''' UPDATE job_data SET start=?, modified=?, job_id=?, status=?, rowtype=? WHERE id=? ''' - arguments = (int(job_data.start), HUtils.get_current_datetime(), job_data.job_id, job_data.status, job_data.rowtype, job_data._id) - self.execute_many_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) - - def update_job_data_finish_plus(self, job_data): - """ - Update job_data table with data class JobData. - Updates submit, start, finish, modified, job_id, status, energy, extra_data, nnodes, ncpus, rowstatus, out, err by id. - """ - statement = ''' UPDATE job_data SET submit=?, start=?, finish=?, modified=?, job_id=?, status=?, energy=?, extra_data=?, nnodes=?, ncpus=?, rowstatus=?, out=?, err=? WHERE id=? 
''' - arguments = (job_data.submit, job_data.start, job_data.finish, HUtils.get_current_datetime(), job_data.job_id,job_data.status, job_data.energy, job_data.extra_data, job_data.nnodes, job_data.ncpus, Models.RowStatus.COMPLETED, job_data.out, job_data.err, job_data._id) - self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) - def update_many_job_data_change_status(self, changes): # type : (List[Tuple]) -> None - """ Update many job_data rows in bulk. Requires a changes list of argument tuples. """ + """ + Update many job_data rows in bulk. Requires a changes list of argument tuples. + Only updates finish, modified, status, and rowstatus by id. + """ statement = ''' UPDATE job_data SET finish=?, modified=?, status=?, rowstatus=? WHERE id=? ''' self.execute_many_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, changes) - def update_job_data_finish(self, job_data): + def update_job_data_by_id(self, job_data): """ Update job_data table with data class JobData. Update finish, modified, job_id, status, energy, extra_data, nnodes, ncpus, rowstatus, out, err by id. """ - statement = ''' UPDATE job_data SET finish=?, modified=?, job_id=?, status=?, energy=?, extra_data=?, nnodes=?, ncpus=?, rowstatus=?, out=?, err=? WHERE id=? ''' - arguments = (job_data.finish, HUtils.get_current_datetime(), job_data.job_id, job_data.status, job_data.energy, job_data.extra_data, job_data.nnodes, job_data.ncpus, Models.RowStatus.COMPLETED, job_data.out, job_data.err, job_data._id) + statement = ''' UPDATE job_data SET last=?, submit=?, start=?, finish=?, modified=?, job_id=?, status=?, energy=?, extra_data=?, nnodes=?, ncpus=?, rowstatus=?, out=?, err=? WHERE id=? ''' + arguments = (job_data.last, job_data.submit, job_data.start, job_data.finish, HUtils.get_current_datetime(), job_data.job_id, job_data.status, job_data.energy, job_data.extra_data, job_data.nnodes, job_data.ncpus, job_data.rowstatus, job_data.out, job_data.err, job_data._id) self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) - def update_job_data_processed(self, job_data): - """ - Update job_data table with data class JobData. - Updates energy, modified, MaxRSS, AveRSS, rowstatus by - """ - statement = ''' UPDATE job_data SET energy=?, modified=?, MaxRSS=?, AveRSS=?, rowstatus=? WHERE id=? ''' - arguments = (job_data.energy, HUtils.get_current_datetime(), job_data.MaxRSS, job_data.AveRSS, Models.RowStatus.PROCESSED, job_data._id) - self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) def update_experiment_run(self, experiment_run): """ @@ -256,6 +226,15 @@ class ExperimentHistoryDatabaseManager(DatabaseManager): job_data_rows_last = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) return [Models.JobDataRow(*row) for row in job_data_rows_last] + def get_job_data_max_counter(self): + statement = "SELECT MAX(counter) as maxcounter FROM job_data" + counter_result = self.get_from_statement(self.historicaldb_file_path, statement) + if len(counter_result) <= 0: + return DEFAULT_MAX_COUNTER + else: + max_counter = Models.MaxCounterRow(*counter_result[0]).maxcounter + return max_counter if max_counter else DEFAULT_MAX_COUNTER + def delete_job_data(self, _id): """ Deletes row in job_data by id. Useful for testing. """ statement = ''' DELETE FROM job_data WHERE id=? 
''' @@ -274,10 +253,9 @@ class ExperimentHistoryDatabaseManager(DatabaseManager): self.execute_statement_on_dbfile(self.historicaldb_file_path, statement) def _get_pragma_version(self): - """ Gets current pragma version """ + """ Gets current pragma version as int. """ statement = "pragma user_version;" pragma_result = self.get_from_statement(self.historicaldb_file_path, statement) if len(pragma_result) <= 0: raise Exception("Error while getting the pragma version. This might be a signal of a deeper problem. Review previous errors.") return Models.PragmaVersion(*pragma_result[0]).version - diff --git a/autosubmit/history/database_managers/experiment_status_db_manager.py b/autosubmit/history/database_managers/experiment_status_db_manager.py index 2d4ac7127..26402cd71 100644 --- a/autosubmit/history/database_managers/experiment_status_db_manager.py +++ b/autosubmit/history/database_managers/experiment_status_db_manager.py @@ -22,20 +22,20 @@ import sqlite3 import traceback import textwrap import time -from database_manager import DatabaseManager +from database_manager import DatabaseManager, DEFAULT_LOCAL_ROOT_DIR import autosubmit.history.utils as HUtils import database_models as Models class ExperimentStatusDbManager(DatabaseManager): """ Manages the actions on the status database """ - def __init__(self, expid): - super(ExperimentStatusDbManager, self).__init__(expid) - self._as_times_file_path = os.path.join(self._basic_configuration.LOCAL_ROOT_DIR, self.AS_TIMES_DB_NAME) - self._ecearth_file_path = os.path.join(self._basic_configuration.LOCAL_ROOT_DIR, self.ECEARTH_DB_NAME) - self._pkl_file_path = os.path.join(self._basic_configuration.LOCAL_ROOT_DIR, "pkl", "job_list_{0}.pkl".format(self.expid)) + def __init__(self, expid, local_root_dir_path=DEFAULT_LOCAL_ROOT_DIR): + super(ExperimentStatusDbManager, self).__init__(expid, local_root_dir_path=local_root_dir_path) + self._as_times_file_path = os.path.join(self.LOCAL_ROOT_DIR, self.AS_TIMES_DB_NAME) + self._ecearth_file_path = os.path.join(self.LOCAL_ROOT_DIR, self.ECEARTH_DB_NAME) + self._pkl_file_path = os.path.join(self.LOCAL_ROOT_DIR, "pkl", "job_list_{0}.pkl".format(self.expid)) self._validate_status_database() - self.current_experiment_row = self._get_current_experiment_row(self.expid) - self.current_experiment_status_row =self._get_current_experiment_status_row(self.current_experiment_row.id) + # self.current_experiment_row = self._get_current_experiment_row(self.expid) + # self.current_experiment_status_row =self._get_current_experiment_status_row(self.current_experiment_row.id) def _validate_status_database(self): """ Creates experiment_status table if it does not exist """ @@ -73,15 +73,24 @@ class ExperimentStatusDbManager(DatabaseManager): return False return False - def set_experiment_as_running(self, status="RUNNING"): - if self.current_experiment_status_row: - # Row exists - self._update_exp_status(status) - else: - # New Row - self._create_exp_status() - - def _get_current_experiment_row(self, expid): + def set_existing_experiment_status_as_running(self, expid): + """ Set the experiment_status row as running. 
""" + self.update_exp_status(expid, Models.RunningStatus.RUNNING) + + def create_experiment_status_as_running(self, experiment): + """ Create a new experiment_status row for the Models.Experiment item.""" + self.create_exp_status(experiment.id, experiment.name, Models.RunningStatus.RUNNING) + + + def get_experiment_status_row_by_expid(self, expid): + # type : (str) -> Models.ExperimentRow + """ + Get Models.ExperimentRow by expid. + """ + experiment_row = self.get_experiment_row_by_expid(expid) + return self.get_experiment_status_row_by_exp_id(experiment_row.id) + + def get_experiment_row_by_expid(self, expid): # type : (str) -> Models.ExperimentRow """ Get the experiment from ecearth.db by expid as Models.ExperimentRow. @@ -101,7 +110,7 @@ class ExperimentStatusDbManager(DatabaseManager): current_rows = self.get_from_statement(self._as_times_file_path, statement) return [Models.ExperimentStatusRow(*row) for row in current_rows] - def _get_current_experiment_status_row(self, exp_id): + def get_experiment_status_row_by_exp_id(self, exp_id): # type : (int) -> Models.ExperimentStatusRow """ Get Models.ExperimentStatusRow from as_times.db by exp_id (int)""" statement = self.get_built_select_statement("experiment_status", "exp_id=?") @@ -112,26 +121,27 @@ class ExperimentStatusDbManager(DatabaseManager): return Models.ExperimentStatusRow(*current_rows[0]) - def _create_exp_status(self): - # type : () -> None + def create_exp_status(self, exp_id, expid, status): + # type : (int, str) -> None """ Create experiment status """ statement = ''' INSERT INTO experiment_status(exp_id, name, status, seconds_diff, modified) VALUES(?,?,?,?,?) ''' - arguments = (self.current_experiment_row.id, self.expid, Models.RunningStatus.RUNNING, 0, HUtils.get_current_datetime()) + arguments = (exp_id, expid, status, 0, HUtils.get_current_datetime()) return self.insert_statement_with_arguments(self._as_times_file_path, statement, arguments) - def _update_exp_status(self, status="RUNNING"): - # type : (str) -> None + def update_exp_status(self, expid, status="RUNNING"): + # type : (str, str) -> None """ Update status, seconds_diff, modified in experiment_status. """ statement = ''' UPDATE experiment_status SET status = ?, seconds_diff = ?, modified = ? WHERE name = ? ''' - arguments = (status, 0, HUtils.get_current_datetime(), self.current_experiment_row.name) + arguments = (status, 0, HUtils.get_current_datetime(), expid) self.execute_statement_with_arguments_on_dbfile(self._as_times_file_path, statement, arguments) - -# if __name__ == "__main__": -# exp = ExperimentStatusDbManager("a2h6") -# exp.set_experiment_as_running() -# exp.print_current_table() \ No newline at end of file + def delete_exp_status(self, expid): + # type : (str) -> None + """ Deletes experiment_status row by expid. Useful for testing purposes. """ + statement = ''' DELETE FROM experiment_status where name = ? 
''' + arguments = (expid,) + self.execute_statement_with_arguments_on_dbfile(self._as_times_file_path, statement, arguments) \ No newline at end of file diff --git a/autosubmit/history/database_managers/tests.py b/autosubmit/history/database_managers/tests.py index 0e7977805..afc1caeb4 100644 --- a/autosubmit/history/database_managers/tests.py +++ b/autosubmit/history/database_managers/tests.py @@ -19,15 +19,57 @@ import unittest import time import random -from experiment_history_db_manager import ExperimentHistoryDatabaseManager +from experiment_history_db_manager import ExperimentHistoryDbManager +from experiment_status_db_manager import ExperimentStatusDbManager from autosubmit.history.data_classes.experiment_run import ExperimentRun from autosubmit.history.data_classes.job_data import JobData +from autosubmit.config.basicConfig import BasicConfig +import autosubmit.history.utils as HUtils EXPID = "tt00" +EXPID_NONE = "tt01" +BasicConfig.read() +JOBDATA_DIR = BasicConfig.JOBDATA_DIR +LOCAL_ROOT_DIR = BasicConfig.LOCAL_ROOT_DIR -class TestExperimentHistoryDatabaseManager(unittest.TestCase): +class TestExperimentStatusDatabaseManager(unittest.TestCase): + """ Covers Experiment Status Database Manager """ + def setUp(self): + self.exp_status_db = ExperimentStatusDbManager(EXPID, LOCAL_ROOT_DIR) + + def tearDown(self): + pass + def test_get_current_experiment_status_row(self): + exp_status_row = self.exp_status_db.get_experiment_status_row_by_expid(EXPID) + self.assertIsNotNone(exp_status_row) + exp_status_row_none = self.exp_status_db.get_experiment_status_row_by_expid(EXPID_NONE) + self.assertIsNone(exp_status_row_none) + exp_row_direct = self.exp_status_db.get_experiment_status_row_by_exp_id(exp_status_row.exp_id) + self.assertTrue(exp_status_row.exp_id == exp_row_direct.exp_id) + + + def test_update_exp_status(self): + self.exp_status_db.update_exp_status(EXPID, "RUNNING") + exp_status_row_current = self.exp_status_db.get_experiment_status_row_by_expid(EXPID) + self.assertTrue(exp_status_row_current.status == "RUNNING") + self.exp_status_db.update_exp_status(EXPID, "NOT RUNNING") + exp_status_row_current = self.exp_status_db.get_experiment_status_row_by_expid(EXPID) + self.assertTrue(exp_status_row_current.status == "NOT RUNNING") + + def test_create_exp_status(self): + experiment = self.exp_status_db.get_experiment_row_by_expid(EXPID_NONE) + self.exp_status_db.create_experiment_status_as_running(experiment) + experiment_status = self.exp_status_db.get_experiment_status_row_by_expid(EXPID_NONE) + self.assertIsNotNone(experiment_status) + self.exp_status_db.delete_exp_status(EXPID_NONE) + experiment_status = self.exp_status_db.get_experiment_status_row_by_expid(EXPID_NONE) + self.assertIsNone(experiment_status) + + +class TestExperimentHistoryDbManager(unittest.TestCase): + """ Covers Experiment History Database Manager and Data Models """ def setUp(self): - self.experiment_database = ExperimentHistoryDatabaseManager(EXPID) + self.experiment_database = ExperimentHistoryDbManager(EXPID, JOBDATA_DIR) def tearDown(self): pass @@ -98,9 +140,55 @@ class TestExperimentHistoryDatabaseManager(unittest.TestCase): def test_job_data_from_model(self): job_data_rows = self.experiment_database.get_job_data_last_by_name("a29z_20000101_fc0_1_SIM") job_data_row_first = job_data_rows[0] - job_data_data_class = JobData.from_model(job_data_row_first) - print(job_data_data_class.extra_data_parsed) + job_data_data_class = JobData.from_model(job_data_row_first) self.assertTrue(job_data_row_first.job_name == 
job_data_data_class.job_name) + def test_update_job_data_processed(self): + current_time = time.time() + job_data_rows = self.experiment_database.get_job_data_last_by_name("a29z_20000101_fc0_1_SIM") + job_data_row_first = job_data_rows[0] + job_data_data_class = JobData.from_model(job_data_row_first) + backup_job_dc = JobData.from_model(job_data_row_first) + job_data_data_class.nnodes = random.randint(1, 1000) + job_data_data_class.ncpus = random.randint(1, 1000) + job_data_data_class.status = "DELAYED" + job_data_data_class.finish = current_time + self.experiment_database.update_job_data_by_id(job_data_data_class) + job_data_rows_current = self.experiment_database.get_job_data_last_by_name("a29z_20000101_fc0_1_SIM") + job_data_row_first = job_data_rows_current[0] + self.assertTrue(job_data_row_first.nnodes == job_data_data_class.nnodes) + self.assertTrue(job_data_row_first.ncpus == job_data_data_class.ncpus) + self.assertTrue(job_data_row_first.status == job_data_data_class.status) + self.assertTrue(job_data_row_first.finish == job_data_data_class.finish) + self.experiment_database.update_job_data_by_id(backup_job_dc) + + def test_bulk_update(self): + current_time = time.time() + all_job_data_rows = self.experiment_database.get_job_data_all() + job_data_rows_test = [job for job in all_job_data_rows if job.run_id == 3] + backup = [JobData.from_model(job) for job in job_data_rows_test] + list_job_data_class = [JobData.from_model(job) for job in job_data_rows_test] + backup_changes = [(job.finish, HUtils.get_current_datetime(), job.status, job.rowstatus, job._id) for job in list_job_data_class] + changes = [(current_time, HUtils.get_current_datetime(), "DELAYED", job.rowstatus, job._id) for job in list_job_data_class] + self.experiment_database.update_many_job_data_change_status(changes) + all_job_data_rows = self.experiment_database.get_job_data_all() + job_data_rows_validate = [job for job in all_job_data_rows if job.run_id == 3] + for (job_val, change_item) in zip(job_data_rows_validate, changes): + finish, modified, status, rowstatus, _id = change_item + self.assertTrue(job_val.finish == finish) + self.assertTrue(job_val.modified == modified) + self.assertTrue(job_val.status == status) + self.assertTrue(job_val.rowstatus == rowstatus) + self.assertTrue(job_val.id == _id) + self.experiment_database.update_many_job_data_change_status(backup_changes) + + def test_job_data_maxcounter(self): + new_job_data = ExperimentHistoryDbManager(EXPID_NONE, JOBDATA_DIR) + max_empty_table_counter = new_job_data.get_job_data_max_counter() + self.assertTrue(max_empty_table_counter == 0) + max_existing_counter = self.experiment_database.get_job_data_max_counter() + self.assertTrue(max_existing_counter > 0) + + if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py index f1bfa1c59..384c1e0c6 100644 --- a/autosubmit/history/experiment_history.py +++ b/autosubmit/history/experiment_history.py @@ -15,17 +15,51 @@ # You should have received a copy of the GNU General Public License # along with Autosubmit. If not, see . 
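The bulk-update test above depends on the exact tuple layout consumed by update_many_job_data_change_status; a sketch of the expected shape (ids and timestamps are illustrative, Models and HUtils as imported in the history package):

    # Each tuple maps in order onto:
    # UPDATE job_data SET finish=?, modified=?, status=?, rowstatus=? WHERE id=?
    changes = [(1633615200, HUtils.get_current_datetime(), "DELAYED", Models.RowStatus.CHANGED, 42),
               (1633615201, HUtils.get_current_datetime(), "DELAYED", Models.RowStatus.CHANGED, 43)]
    experiment_database.update_many_job_data_change_status(changes)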
-from database_managers.experiment_history_db_manager import ExperimentHistoryDatabaseManager +import os +import traceback +from database_managers.experiment_history_db_manager import ExperimentHistoryDbManager, DEFAULT_JOBDATA_DIR +from database_managers.database_models import RowType +from data_classes.job_data import JobData +from logging import Logging class ExperimentHistory(): - def __init__(self, expid): + def __init__(self, expid, jobdata_dir_path=DEFAULT_JOBDATA_DIR): self.expid = expid - self.manager = ExperimentHistoryDatabaseManager(self.expid) - - def get_all_job_data_row(self): - return self.manager.get_job_data_all() + self._log = Logging(expid) + try: + self.manager = ExperimentHistoryDbManager(self.expid, jobdata_dir_path=jobdata_dir_path) + if os.path.exists(self.manager.historicaldb_file_path): + self.manager.update_historical_database() + else: + self.manager.create_historical_database() + except Exception as exp: + self._log.log(str(exp), traceback.format_exc()) + self.manager = None -# if __name__ == "__main__": -# exp = ExperimentHistory("tt00") -# for job_data_row in exp.get_all_job_data_row(): -# print(job_data_row) \ No newline at end of file + + def write_submit_time(self, job_name, submit=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", member="", section="", chunk=0, platform="NA", job_id=0, packed=False, wrapper_queue=None, wrapper_code=None): + try: + next_counter = self._get_next_counter_by_job_name(job_name) + job_data_dc = JobData(0, next_counter, job_name, None, None, submit, 0, 0, status, self.determine_rowtype(wrapper_code), ncpus, wallclock, self.get_defined_queue_name(wrapper_queue, wrapper_code, qos)) + except Exception as exp: + self._log.log(str(exp), traceback.format_exc()) + + def _get_next_counter_by_job_name(self, job_name): + job_data_row = self.manager.get_job_data_last_by_name(job_name) + max_counter = self.manager.get_job_data_max_counter() + if len(job_data_row) > 0: + job_max_counter = max(job.counter for job in job_data_row) + return max(max_counter, job_max_counter) + else: + return max_counter + + def determine_rowtype(self, code): + if code: + return code + else: + return RowType.NORMAL + + def get_defined_queue_name(self, wrapper_queue, wrapper_code, qos): + if wrapper_code and wrapper_code > 2 and wrapper_queue is not None: + return wrapper_queue + return qos \ No newline at end of file diff --git a/autosubmit/history/experiment_status.py b/autosubmit/history/experiment_status.py index 871a76071..000e73294 100644 --- a/autosubmit/history/experiment_status.py +++ b/autosubmit/history/experiment_status.py @@ -17,26 +17,29 @@ # along with Autosubmit. If not, see .
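The wrapper/queue precedence above is easy to misread, so a worked example of get_defined_queue_name under that rule (all values are made up):

    eh = ExperimentHistory("a000")                      # placeholder expid
    eh.get_defined_queue_name("wrapper_q", 3, "debug")  # -> "wrapper_q" (wrapper code > 2 and a wrapper queue is set)
    eh.get_defined_queue_name("wrapper_q", 2, "debug")  # -> "debug" (wrapper code not > 2)
    eh.get_defined_queue_name(None, 3, "debug")         # -> "debug" (no wrapper queue given)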
import traceback -from experiment_status_db_manager import ExperimentStatusDbManager -from log.log import Log +from database_managers.experiment_status_db_manager import ExperimentStatusDbManager, DEFAULT_LOCAL_ROOT_DIR +from logging import Logging class ExperimentStatus(): """ Represents the Experiment Status Mechanism that keeps track of currently active experiments """ - def __init__(self, expid): + def __init__(self, expid, local_root_dir_path=DEFAULT_LOCAL_ROOT_DIR): # type : (str) -> None self.expid = expid # type : str try: - self._manager = ExperimentStatusDbManager(self.expid) + self.manager = ExperimentStatusDbManager(self.expid, local_root_dir_path=local_root_dir_path) except Exception as exp: - Log.warning("Error while trying to update {0} in experiment_status.".format(str(self.expid))) - Log.debug(traceback.format_exc()) - Log.info(traceback.format_exc()) - self._manager = None + message = "Error while trying to update {0} in experiment_status.".format(str(self.expid)) + print(message) + Logging(self.expid).log(message, traceback.format_exc()) + self.manager = None - def set_running(self): + def set_as_running(self): # type : () -> None """ Set the status of the experiment in experiment_status of as_times.db as RUNNING. Creates the database, table and row if necessary.""" - if self._manager: - self._manager.set_experiment_as_running() - else: - Log.info("It's not possible to set the experiment as RUNNING in this moment. If it is not automatically set as RUNNING in a few minutes, look for previous errors.") \ No newline at end of file + if self.manager: + exp_status_row = self.manager.get_experiment_status_row_by_expid(self.expid) + if exp_status_row: + self.manager.set_existing_experiment_status_as_running(exp_status_row.name) + else: + exp_row = self.manager.get_experiment_row_by_expid(self.expid) + self.manager.create_experiment_status_as_running(exp_row) diff --git a/autosubmit/history/logging.py b/autosubmit/history/logging.py new file mode 100644 index 000000000..8a41f41da --- /dev/null +++ b/autosubmit/history/logging.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . +import os +import utils as HUtils + +class Logging: + def __init__(self, expid): + self.expid = expid + + def log(self, exception_msg, traceback_msg): + try: + log_path = self.get_default_log_path(self.expid) + HUtils.get_current_datetime() + if not os.path.exists(log_path): + HUtils.create_file_with_full_permissions(log_path) + with open(log_path, "a") as exp_log: + exp_log.write(self.build_message(exception_msg, traceback_msg)) + except Exception as exp: + print(exp) + print("Logging failed. 
Please report it to the developers.") + + def build_message(self, exception_msg, traceback_msg): + return "{0} :: {1} :: {2}".format(HUtils.get_current_datetime(), exception_msg, traceback_msg) + + def get_default_log_path(self, expid): + return os.path.join("/esarchive","autosubmit", "as_metadata", "logs","{}_log.txt".format(expid)) \ No newline at end of file diff --git a/autosubmit/history/tests.py b/autosubmit/history/tests.py index 5d39c6c79..9ffa3dad0 100644 --- a/autosubmit/history/tests.py +++ b/autosubmit/history/tests.py @@ -17,21 +17,41 @@ # along with Autosubmit. If not, see . import unittest +import traceback from experiment_history import ExperimentHistory +from logging import Logging class TestExperimentHistory(unittest.TestCase): # @classmethod # def setUpClass(cls): # cls.exp = ExperimentHistory("tt00") # example database def test_select_job_data_by_run_id(self): - result = ExperimentHistory("tt00").manager.get_job_data_last_by_run_id(17) - print(result) - self.assertIsNotNone(result) + pass def test_get_all_job_data(self): - result = ExperimentHistory("tt00").get_all_job_data_row() - print(result) - self.assertTrue(result) + pass + +class TestLogging(unittest.TestCase): + + def setUp(self): + message = "No Message" + try: + raise Exception("Setup test exception") + except: + message = traceback.format_exc() + self.log = Logging("tt00") + self.exp_message = "Exception message" + self.trace_message = message + + def test_build_message(self): + message = self.log.build_message(self.exp_message, self.trace_message) + print(message) + self.assertIsNotNone(message) + self.assertTrue(len(message) > 0) + + def test_log(self): + self.log.log(self.exp_message, self.trace_message) + if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/autosubmit/history/utils.py b/autosubmit/history/utils.py index 314d7a0da..7ef3e36d4 100644 --- a/autosubmit/history/utils.py +++ b/autosubmit/history/utils.py @@ -18,10 +18,13 @@ # along with Autosubmit. If not, see . import time +import os from datetime import datetime DATETIME_FORMAT = '%Y-%m-%d-%H:%M:%S' + + def get_fields_as_comma_str(model): """ Get the fields of a namedtuple as a comma separated string. """ return ",".join(model._fields) @@ -53,5 +56,11 @@ def get_current_datetime_if_none(argument): else: return argument +def create_file_with_full_permissions(path): + # type : (str) -> None + """ Creates a file with full permissions. """ + os.umask(0) + os.open(path, os.O_WRONLY | os.O_CREAT, 0o777) + # if __name__ == "__main__": # print(get_fields_as_comma_str()) \ No newline at end of file diff --git a/autosubmit/job/job_utils.py b/autosubmit/job/job_utils.py index 0d9076690..e0da9d2a2 100644 --- a/autosubmit/job/job_utils.py +++ b/autosubmit/job/job_utils.py @@ -18,11 +18,13 @@ # along with Autosubmit. If not, see . import networkx +import os from networkx.algorithms.dag import is_directed_acyclic_graph from networkx import DiGraph from networkx import dfs_edges from networkx import NetworkXError +from autosubmit.job.job_package_persistence import JobPackagePersistence def transitive_reduction(graph): @@ -41,6 +43,31 @@ def transitive_reduction(graph): reduced_graph.add_edges_from((u, v) for v in u_edges) return reduced_graph + def get_job_package_code(self, job_name): + """ + Finds the package code and retrieves it. None if no package.
+ + :param job_name: Name of the job + :type job_name: String + :return: package code, None if not found + :rtype: int or None + """ + try: + packages_wrapper = JobPackagePersistence(os.path.join(self.basic_conf.LOCAL_ROOT_DIR, self.expid, "pkl"),"job_packages_" + self.expid).load(wrapper=True) + packages_wrapper_plus = JobPackagePersistence(os.path.join(self.basic_conf.LOCAL_ROOT_DIR, self.expid, "pkl"),"job_packages_" + self.expid).load(wrapper=False) + if (packages_wrapper or packages_wrapper_plus): + packages = packages_wrapper if len(packages_wrapper) > len(packages_wrapper_plus) else packages_wrapper_plus + for exp, package_name, _job_name in packages: + if job_name == _job_name: + code = int(package_name.split("_")[2]) + return code + except: + pass + return None class Dependency(object): """ -- GitLab From ea03c009e7024f14178fad2492475776d907584a Mon Sep 17 00:00:00 2001 From: Wilmer Uruchi Ticona Date: Thu, 7 Oct 2021 20:45:58 +0200 Subject: [PATCH 3/8] Adding the parser to the package. --- .../experiment_history_db_manager.py | 5 +- autosubmit/history/experiment_history.py | 86 +++++++- autosubmit/history/logging.py | 10 +- autosubmit/history/slurm_parser.py | 185 ++++++++++++++++++ autosubmit/job/job_utils.py | 2 +- autosubmit/platforms/paramiko_platform.py | 5 +- 6 files changed, 272 insertions(+), 21 deletions(-) create mode 100644 autosubmit/history/slurm_parser.py diff --git a/autosubmit/history/database_managers/experiment_history_db_manager.py b/autosubmit/history/database_managers/experiment_history_db_manager.py index 62816ad8a..00e6467fd 100644 --- a/autosubmit/history/database_managers/experiment_history_db_manager.py +++ b/autosubmit/history/database_managers/experiment_history_db_manager.py @@ -168,13 +168,12 @@ class ExperimentHistoryDbManager(DatabaseManager): def update_job_data_by_id(self, job_data): """ Update job_data table with data class JobData. - Update finish, modified, job_id, status, energy, extra_data, nnodes, ncpus, rowstatus, out, err by id. + Update last, submit, start, finish, modified, job_id, status, energy, extra_data, nnodes, ncpus, rowstatus, out, err by id. """ statement = ''' UPDATE job_data SET last=?, submit=?, start=?, finish=?, modified=?, job_id=?, status=?, energy=?, extra_data=?, nnodes=?, ncpus=?, rowstatus=?, out=?, err=? WHERE id=? ''' arguments = (job_data.last, job_data.submit, job_data.start, job_data.finish, HUtils.get_current_datetime(), job_data.job_id, job_data.status, job_data.energy, job_data.extra_data, job_data.nnodes, job_data.ncpus, job_data.rowstatus, job_data.out, job_data.err, job_data._id) self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) - def update_experiment_run(self, experiment_run): """ Update experiment_run table with data class ExperimentRun. @@ -186,7 +185,7 @@ class ExperimentHistoryDbManager(DatabaseManager): def insert_job_data(self, job_data): # type : (JobData) -> int - """ Insert data class JobData into database """ + """ Insert data class JobData into job_data table. """ statement = ''' INSERT INTO job_data(counter, job_name, created, modified, submit, start, finish, status, rowtype, ncpus, wallclock, qos, energy, date, section, member, chunk, last, platform, job_id, extra_data, nnodes, run_id, MaxRSS, AveRSS, out, err, rowstatus) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
''' arguments = (job_data.counter, job_data.job_name, HUtils.get_current_datetime(), HUtils.get_current_datetime(), job_data.submit, job_data.start, job_data.finish, job_data.status, job_data.rowtype, job_data.ncpus, job_data.wallclock, job_data.qos, job_data.energy, job_data.date, job_data.section, job_data.member, job_data.chunk, job_data.last, job_data.platform, job_data.job_id, job_data.extra_data, job_data.nnodes, job_data.run_id, job_data.MaxRSS, job_data.AveRSS, job_data.out, job_data.err, job_data.rowstatus) return self.insert_statement_with_arguments(self.historicaldb_file_path, statement, arguments) diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py index 384c1e0c6..274e6c0c7 100644 --- a/autosubmit/history/experiment_history.py +++ b/autosubmit/history/experiment_history.py @@ -17,6 +17,7 @@ # along with Autosubmit. If not, see . import os import traceback +import slurm_parser as SlurmParser from database_managers.experiment_history_db_manager import ExperimentHistoryDbManager, DEFAULT_JOBDATA_DIR from database_managers.database_models import RowType from data_classes.job_data import JobData @@ -32,34 +33,101 @@ class ExperimentHistory(): self.manager.update_historical_database() else: self.manager.create_historical_database() + except Exception as exp: self._log.log(str(exp), traceback.format_exc()) self.manager = None - def write_submit_time(self, job_name, submit=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", member="", section="", chunk=0, platform="NA", job_id=0, packed=False, wrapper_queue=None, wrapper_code=None): + def write_submit_time(self, job_name, submit=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", member="", section="", chunk=0, platform="NA", job_id=0, wrapper_queue=None, wrapper_code=None): try: next_counter = self._get_next_counter_by_job_name(job_name) - job_data_dc = JobData(0, next_counter, job_name, None, None, submit, 0, 0, status, self.determine_rowtype(wrapper_code), ncpus, wallclock, self.get_defined_queue_name(wrapper_queue, wrapper_code, qos)) + job_data_dc = JobData(_id=0, + counter=next_counter, + job_name=job_name, + submit=submit, + status=status, + rowtype=self._get_defined_rowtype(wrapper_code), + ncpus=ncpus, + wallclock=wallclock, + qos=self._get_defined_queue_name(wrapper_queue, wrapper_code, qos), + date=date, + member=member, + section=section, + chunk=chunk, + platform=platform, + job_id=job_id) + return self._register_submitted_job_data_dc(job_data_dc) except Exception as exp: self._log.log(str(exp), traceback.format_exc()) + return None + def write_start_time(self, job_name, start=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", member="", section="", chunk=0, platform="NA", job_id=0, wrapper_queue=None, wrapper_code=None): + try: + job_data_dc_last = self.get_job_data_dc_unique_latest_by_job_name(job_name) + if not job_data_dc_last: + job_data_dc_last = self.write_submit_time(job_name=job_name, status=status, ncpus=ncpus, wallclock=wallclock, qos=qos, date=date, member=member, section=section, chunk=chunk, platform=platform, job_id=job_id, wrapper_queue=wrapper_queue, wrapper_code=wrapper_code) + self._log.log("write_start_time {0} start not found.".format(job_name)) + job_data_dc_last.start = start + job_data_dc_last.qos = self._get_defined_queue_name(wrapper_queue, wrapper_code, qos) + job_data_dc_last.status = status + job_data_dc_last.rowtype = self._get_defined_rowtype(wrapper_code) +
job_data_dc_last.job_id = job_id + return self.update_job_data_dc_by_id(job_data_dc_last) + except Exception as exp: + self._log.log(str(exp), traceback.format_exc()) + return None + + def write_finish_time(self, job_name, finish=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", member="", section="", chunk=0, platform="NA", job_id=0, platform_object=None, packed=False, parent_id_list=None, no_slurm=True, out_file_path=None, out_file=None, err_file=None, wrapper_queue=None, wrapper_code=None): + try: + job_data_dc_last = self.get_job_data_dc_unique_latest_by_job_name(job_name) + if not job_data_dc_last: + job_data_dc_last = self.write_submit_time(job_name=job_name, status=status, ncpus=ncpus, wallclock=wallclock, qos=qos, date=date, member=member, section=section, chunk=chunk, platform=platform, job_id=job_id, wrapper_queue=wrapper_queue, wrapper_code=wrapper_code) + self._log.log("write_finish_time {0} submit not found.".format(job_name)) + # writing finish + + except Exception as exp: + self._log.log(str(exp), traceback.format_exc()) + return None + + def _register_submitted_job_data_dc(self, job_data_dc): + self._set_current_job_data_rows_last_to_zero(job_data_dc.job_name) + self.manager.insert_job_data(job_data_dc) + return self.get_job_data_dc_unique_latest_by_job_name(job_data_dc.job_name) + + def get_job_data_dc_unique_latest_by_job_name(self, job_name): + job_data_row_last = self.manager.get_job_data_last_by_name(job_name) + if len(job_data_row_last) > 0: + return JobData.from_model(job_data_row_last[0]) + return None + + def update_job_data_dc_by_id(self, job_data_dc): + self.manager.update_job_data_by_id(job_data_dc) + return self.get_job_data_dc_unique_latest_by_job_name(job_data_dc.job_name) + + def _set_current_job_data_rows_last_to_zero(self, job_name): + """ Sets last = 0 for every row of the given job_name that currently has last = 1.
""" + job_data_row_last = self.manager.get_job_data_last_by_name(job_name) + job_data_dc_list = [JobData.from_model(row) for row in job_data_row_last] + for job_data_dc in job_data_dc_list: + job_data_dc.last = 0 + self.manager.update_job_data_by_id(job_data_dc) + def _get_next_counter_by_job_name(self, job_name): - job_data_row = self.manager.get_job_data_last_by_name(job_name) + job_data_dc = self.get_job_data_dc_unique_latest_by_job_name(job_name) max_counter = self.manager.get_job_data_max_counter() - if len(job_data_row) > 0: - job_max_counter = max(job.counter for job in job_data_row) - return max(max_counter, job_max_counter) + if job_data_dc: + return max(max_counter, job_data_dc.counter) else: return max_counter - def determine_rowtype(self, code): + def _get_defined_rowtype(self, code): if code: return code else: return RowType.NORMAL - def get_defined_queue_name(self, wrapper_queue, wrapper_code, qos): + def _get_defined_queue_name(self, wrapper_queue, wrapper_code, qos): if wrapper_code and wrapper_code > 2 and wrapper_queue is not None: return wrapper_queue return qos \ No newline at end of file diff --git a/autosubmit/history/logging.py b/autosubmit/history/logging.py index 8a41f41da..6b329054c 100644 --- a/autosubmit/history/logging.py +++ b/autosubmit/history/logging.py @@ -22,20 +22,20 @@ class Logging: def __init__(self, expid): self.expid = expid - def log(self, exception_msg, traceback_msg): + def log(self, main_msg, traceback_msg=""): try: log_path = self.get_default_log_path(self.expid) HUtils.get_current_datetime() if not os.path.exists(log_path): HUtils.create_file_with_full_permissions(log_path) with open(log_path, "a") as exp_log: - exp_log.write(self.build_message(exception_msg, traceback_msg)) + exp_log.write(self.build_message(main_msg, traceback_msg)) except Exception as exp: print(exp) - print("Logging failed. Please report it to the developers.") + print("Logging failed. Please report it to the developers.") - def build_message(self, exception_msg, traceback_msg): - return "{0} :: {1} :: {2}".format(HUtils.get_current_datetime(), exception_msg, traceback_msg) + def build_message(self, main_msg, traceback_msg): + return "{0} :: {1} :: {2}".format(HUtils.get_current_datetime(), main_msg, traceback_msg) def get_default_log_path(self, expid): return os.path.join("/esarchive","autosubmit", "as_metadata", "logs","{}_log.txt".format(expid)) \ No newline at end of file diff --git a/autosubmit/history/slurm_parser.py b/autosubmit/history/slurm_parser.py new file mode 100644 index 000000000..9e1cd2abf --- /dev/null +++ b/autosubmit/history/slurm_parser.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . 
+ +from datetime import datetime +from time import mktime, time + + +def parse_job_finish_data(output, wrapped): + """ + Slurm Command 'sacct -n --jobs {0} -o JobId%25,State,NCPUS,NNodes,Submit,Start,End,ConsumedEnergy,MaxRSS%25,AveRSS%25' + Only non-wrapped jobs return submit, start, finish, joules, ncpus, nnodes. + + :return: submit, start, finish, joules, ncpus, nnodes, detailed_data, is_end_of_wrapper + :rtype: int, int, int, int, int, int, json object (str), bool + """ + try: + # Setting up: Storing detail for posterity + detailed_data = dict() + steps = [] + # No blank spaces after or before + output = output.strip() if output else None + lines = output.split("\n") if output else [] + is_end_of_wrapper = False + # If there is output, list exists + if len(lines) > 0: + # Collecting information from all output + for line in lines: + line = line.strip().split() + if len(line) > 0: + # Collecting detailed data + name = str(line[0]) + if wrapped: + # If it belongs to a wrapper + extra_data = {"ncpus": str(line[2] if len(line) > 2 else "NA"), + "nnodes": str(line[3] if len(line) > 3 else "NA"), + "submit": str(line[4] if len(line) > 4 else "NA"), + "start": str(line[5] if len(line) > 5 else "NA"), + "finish": str(line[6] if len(line) > 6 else "NA"), + "energy": str(line[7] if len(line) > 7 else "NA"), + "MaxRSS": str(line[8] if len(line) > 8 else "NA"), + "AveRSS": str(line[9] if len(line) > 9 else "NA")} + else: + # Normal job + extra_data = {"submit": str(line[4] if len(line) > 4 else "NA"), + "start": str(line[5] if len(line) > 5 else "NA"), + "finish": str(line[6] if len(line) > 6 else "NA"), + "energy": str(line[7] if len(line) > 7 else "NA"), + "MaxRSS": str(line[8] if len(line) > 8 else "NA"), + "AveRSS": str(line[9] if len(line) > 9 else "NA")} + # Detailed data will contain the important information from output + detailed_data[name] = extra_data + steps.append(name) + submit = start = finish = energy = nnodes = ncpus = 0 + status = "UNKNOWN" + # Take first line as source + line = lines[0].strip().split() + ncpus = int(line[2] if len(line) > 2 else 0) + nnodes = int(line[3] if len(line) > 3 else 0) + status = str(line[1]) + if wrapped == False: + # If it is not wrapper job, take first line as source + if status not in ["COMPLETED", "FAILED", "UNKNOWN"]: + # If not completed, then it is an error, so send default data plus output + return (0, 0, 0, 0, ncpus, nnodes, detailed_data, False) + else: + # If it is a wrapped job + # Check if the wrapper has finished + if status in ["COMPLETED", "FAILED", "UNKNOWN"]: + # Wrapper has finished + is_end_of_wrapper = True + # Continue with first line as source + if line: + try: + # Parse submit and start only for normal jobs (not wrapped) + submit = int(mktime(datetime.strptime( + line[4], "%Y-%m-%dT%H:%M:%S").timetuple())) if not wrapped else 0 + start = int(mktime(datetime.strptime( + line[5], "%Y-%m-%dT%H:%M:%S").timetuple())) if not wrapped else 0 + # Assuming the job has been COMPLETED + # If normal job or end of wrapper => Try to get the finish time from the first line of the output, else default to now.
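+ # e.g. sacct prints timestamps such as 2021-10-07T14:55:21; mktime(strptime(...)) above turns them into epoch seconds, so queue and run times can later be derived by plain subtraction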
+ finish = 0 + + if not wrapped: + # If normal job, take finish time from first line + finish = (int(mktime(datetime.strptime(line[6], "%Y-%m-%dT%H:%M:%S").timetuple( + ))) if len(line) > 6 and line[6] != "Unknown" else int(time())) + energy = parse_output_number(line[7]) if len( + line) > 7 and len(line[7]) > 0 else 0 + else: + # If it is a wrapper job + # If end of wrapper, take data from first line + if is_end_of_wrapper == True: + finish = (int(mktime(datetime.strptime(line[6], "%Y-%m-%dT%H:%M:%S").timetuple( + ))) if len(line) > 6 and line[6] != "Unknown" else int(time())) + energy = parse_output_number(line[7]) if len( + line) > 7 and len(line[7]) > 0 else 0 + else: + # If wrapped but not end of wrapper, try to get info from current data. + if "finish" in extra_data.keys() and extra_data["finish"] != "Unknown": + # finish data exists + finish = int(mktime(datetime.strptime( + extra_data["finish"], "%Y-%m-%dT%H:%M:%S").timetuple())) + else: + # if finish date does not exist, query previous step. + if len(steps) >= 2 and detailed_data.__contains__(steps[-2]): + new_extra_data = detailed_data[steps[-2]] + if "finish" in new_extra_data.keys() and new_extra_data["finish"] != "Unknown": + # This might result in a job finish < start; the caller function needs to handle that + finish = int(mktime(datetime.strptime( + new_extra_data["finish"], "%Y-%m-%dT%H:%M:%S").timetuple())) + else: + finish = int(time()) + else: + finish = int(time()) + if "energy" in extra_data.keys() and extra_data["energy"] != "NA": + # energy exists + energy = parse_output_number( + extra_data["energy"]) + else: + # if energy does not exist, query previous step + if len(steps) >= 2 and detailed_data.__contains__(steps[-2]): + new_extra_data = detailed_data[steps[-2]] + if "energy" in new_extra_data.keys() and new_extra_data["energy"] != "NA": + energy = parse_output_number( + new_extra_data["energy"]) + else: + energy = 0 + else: + energy = 0 + except Exception as exp: + pass + + detailed_data = detailed_data if not wrapped or is_end_of_wrapper == True else extra_data + return (submit, start, finish, energy, ncpus, nnodes, detailed_data, is_end_of_wrapper) + + return (0, 0, 0, 0, 0, 0, dict(), False) + except Exception as exp: + return (0, 0, 0, 0, 0, 0, dict(), False) + + +def parse_output_number(string_number): + """ + Parses number in format 1.0K 1.0M 1.0G + + :param string_number: String representation of number + :type string_number: str + :return: number in float format + :rtype: float + """ + number = 0.0 + if (string_number): + last_letter = string_number.strip()[-1] + multiplier = 1 + if last_letter == "G": + multiplier = 1000000000 + number = string_number[:-1] + elif last_letter == "M": + multiplier = 1000000 + number = string_number[:-1] + elif last_letter == "K": + multiplier = 1000 + number = string_number[:-1] + else: + number = string_number + try: + number = float(number) * multiplier + except Exception as exp: + number = 0.0 + pass + return number \ No newline at end of file diff --git a/autosubmit/job/job_utils.py b/autosubmit/job/job_utils.py index e0da9d2a2..8deae76ff 100644 --- a/autosubmit/job/job_utils.py +++ b/autosubmit/job/job_utils.py @@ -67,7 +67,7 @@ def transitive_reduction(graph): return code except: pass - return None + return 0 class Dependency(object): """ diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 0d2b387c7..7fa387511 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py
@@ -395,7 +395,7 @@ class ParamikoPlatform(Platform): else: return None - def check_job_energy(self, job_id, packed=False): + def check_job_energy(self, job_id): """ Checks job energy and return values. Defined in child classes. @@ -407,8 +407,7 @@ class ParamikoPlatform(Platform): """ check_energy_cmd = self.get_job_energy_cmd(job_id) self.send_command(check_energy_cmd) - return self.parse_job_finish_data( - self.get_ssh_output(), packed) + return self.get_ssh_output() def submit_Script(self, hold=False): """ -- GitLab From 41a97bc107732a9f8a54c6835388eacd576d3574 Mon Sep 17 00:00:00 2001 From: Wilmer Uruchi Ticona Date: Wed, 13 Oct 2021 20:39:51 +0200 Subject: [PATCH 4/8] Added testing for the slurm parser. --- .vscode/settings.json | 3 +- autosubmit/autosubmit.py | 40 +--- autosubmit/database/db_jobdata.py | 3 - .../database_managers/database_manager.py | 11 +- .../database_managers/database_models.py | 3 +- .../experiment_history_db_manager.py | 97 ++++++-- autosubmit/history/database_managers/tests.py | 63 +++-- autosubmit/history/experiment_history.py | 220 ++++++++++++++---- autosubmit/history/logging.py | 2 +- .../history/platform_monitor/__init__.py | 0 .../output_examples/pending.txt | 1 + .../output_examples/wrapper1.txt | 3 + .../output_examples/wrapper2.txt | 3 + .../output_examples/wrapper_big.txt | 33 +++ .../platform_monitor/platform_monitor.py | 31 +++ .../platform_monitor/platform_utils.py | 70 ++++++ .../history/platform_monitor/slurm_monitor.py | 54 +++++ .../platform_monitor/slurm_monitor_item.py | 98 ++++++++ autosubmit/history/platform_monitor/test.py | 104 +++++++++ autosubmit/history/slurm_parser.py | 94 +++----- autosubmit/history/tests.py | 15 +- autosubmit/history/utils.py | 10 +- autosubmit/job/job.py | 7 + 23 files changed, 768 insertions(+), 197 deletions(-) create mode 100644 autosubmit/history/platform_monitor/__init__.py create mode 100644 autosubmit/history/platform_monitor/output_examples/pending.txt create mode 100644 autosubmit/history/platform_monitor/output_examples/wrapper1.txt create mode 100644 autosubmit/history/platform_monitor/output_examples/wrapper2.txt create mode 100644 autosubmit/history/platform_monitor/output_examples/wrapper_big.txt create mode 100644 autosubmit/history/platform_monitor/platform_monitor.py create mode 100644 autosubmit/history/platform_monitor/platform_utils.py create mode 100644 autosubmit/history/platform_monitor/slurm_monitor.py create mode 100644 autosubmit/history/platform_monitor/slurm_monitor_item.py create mode 100644 autosubmit/history/platform_monitor/test.py diff --git a/.vscode/settings.json b/.vscode/settings.json index eaee7b14b..19a51787b 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,4 @@ { - "python.pythonPath": "/Library/Frameworks/Python.framework/Versions/2.7/bin/python" + "python.pythonPath": "/Library/Frameworks/Python.framework/Versions/2.7/bin/python", + "python.linting.enabled": false } \ No newline at end of file diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 4714fea09..c23e757e4 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -73,6 +73,7 @@ from collections import defaultdict from pyparsing import nestedExpr from database.db_jobdata import JobDataStructure from history.experiment_status import ExperimentStatus +from history.experiment_history import ExperimentHistory """ Main module for autosubmit. 
Only contains an interface class to all functionality implemented on autosubmit """ @@ -4171,7 +4172,6 @@ class Autosubmit: Log.debug('Status of jobs to change: {0}', filter_status) Log.debug('Sections to change: {0}', filter_section) wrongExpid = 0 - job_tracked_changes = {} as_conf = AutosubmitConfig( expid, BasicConfig, ConfigParserFactory()) as_conf.check_conf_files(True) @@ -4392,9 +4392,6 @@ class Autosubmit: ft = filter_chunks.split(",")[1:] if ft == 'Any': for job in job_list.get_job_list(): - # Tracking changes - job_tracked_changes[job.name] = ( - job.status, final_status) Autosubmit.change_status( final, final_status, job, save) else: @@ -4404,9 +4401,6 @@ class Autosubmit: if filter_chunks: jobs_filtered.append(job) else: - # Tracking changes - job_tracked_changes[job.name] = ( - job.status, final_status) Autosubmit.change_status( final, final_status, job, save) @@ -4563,9 +4557,6 @@ class Autosubmit: job.platform.name, job.name), 6000) continue if job.status != final_status: - # Tracking changes - job_tracked_changes[job.name] = ( - job.status, final_status) # Only real changes performed_changes[job.name] = str( Status.VALUE_TO_KEY[job.status]) + " -> " + str(final) @@ -4594,9 +4585,6 @@ class Autosubmit: if fc == 'Any': for job in jobs_filtered: - # Tracking changes - job_tracked_changes[job.name] = ( - job.status, final_status) Autosubmit.change_status( final, final_status, job, save) else: @@ -4615,16 +4603,10 @@ class Autosubmit: for chunk_json in member_json['cs']: chunk = int(chunk_json) for job in filter(lambda j: j.chunk == chunk and j.synchronize is not None, jobs_date): - # Tracking changes - job_tracked_changes[job.name] = ( - job.status, final_status) Autosubmit.change_status( final, final_status, job, save) for job in filter(lambda j: j.chunk == chunk, jobs_member): - # Tracking changes - job_tracked_changes[job.name] = ( - job.status, final_status) Autosubmit.change_status( final, final_status, job, save) @@ -4634,18 +4616,12 @@ class Autosubmit: Log.debug("Filtering jobs with status {0}", filter_status) if status_list == 'Any': for job in job_list.get_job_list(): - # Tracking changes - job_tracked_changes[job.name] = ( - job.status, final_status) Autosubmit.change_status( final, final_status, job, save) else: for status in status_list: fs = Autosubmit._get_status(status) for job in filter(lambda j: j.status == fs, job_list.get_job_list()): - # Tracking changes - job_tracked_changes[job.name] = ( - job.status, final_status) Autosubmit.change_status( final, final_status, job, save) @@ -4668,22 +4644,16 @@ class Autosubmit: else: for job in job_list.get_job_list(): if job.name in jobs: - # Tracking changes - job_tracked_changes[job.name] = ( - job.status, final_status) Autosubmit.change_status( final, final_status, job, save) job_list.update_list(as_conf, False, True) if save and wrongExpid == 0: - job_list.save() - # Historical Database: Setup new run if greater or equal than 90% of completed date-member jobs are going to be changed. 
- # Or if the total number of jobs in the job_list is different than the total number of jobs in the current experiment run register in the database - job_data_structure = JobDataStructure(expid) - job_data_structure.process_status_changes( - job_tracked_changes, job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), check_run=True, current_config=as_conf.get_full_config_as_json(), is_setstatus=True) - + job_list.save() + exp_history = ExperimentHistory(expid, BasicConfig.JOBDATA_DIR) + exp_history.initialize_database() + exp_history.process_status_changes(job_list.get_job_list(), chunk_unit=as_conf.get_chunk_size_unit(), chunk_size=as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) else: Log.printlog( "Changes NOT saved to the JobList!!!!: use -s option to save", 3000) diff --git a/autosubmit/database/db_jobdata.py b/autosubmit/database/db_jobdata.py index d9e9e8d4f..a7e5f89a4 100644 --- a/autosubmit/database/db_jobdata.py +++ b/autosubmit/database/db_jobdata.py @@ -1389,9 +1389,6 @@ class JobDataStructure(MainDataBase): return None # warning_messages.append( # "Critical | This version of Autosubmit does not support the database that provides the energy information.") - # Include only those that exist in the pkl and have the same status as in the pkl - # current_job_data = [job for job in current_job_data_last if job.job_name in allJobsDict.keys( - # ) and allJobsDict[job.job_name] == job.status] if current_job_data_last else None # Start processing if current_job_data: # Dropping parents key diff --git a/autosubmit/history/database_managers/database_manager.py b/autosubmit/history/database_managers/database_manager.py index 5f7ecb5ea..a4f1516e6 100644 --- a/autosubmit/history/database_managers/database_manager.py +++ b/autosubmit/history/database_managers/database_manager.py @@ -18,7 +18,6 @@ import sqlite3 import os -import traceback import autosubmit.history.utils as HUtils import autosubmit.history.database_managers.database_models as Models from abc import ABCMeta, abstractmethod @@ -50,7 +49,7 @@ class DatabaseManager(): # type : (str) -> None """ creates a database files with full permissions """ os.umask(0) - os.open(path, os.O_WRONLY | os.O_CREAT, 0o777) + os.open(path, os.O_WRONLY | os.O_CREAT, 0o776) def execute_statement_on_dbfile(self, path, statement): # type : (str, str) -> None @@ -83,15 +82,13 @@ class DatabaseManager(): # type : (str, List[str]) -> None """ Updates the table schema using a **small** list of statements. No Exception raised. - Can be used to execute a list of schema updates that might have been already applied. + Should be used to execute a list of schema updates that might have been already applied. """ for statement in statements: try: self.execute_statement_on_dbfile(path, statement) - except Exception as exp: - Log.info(traceback.format_exc()) - Log.debug(str(exp)) - Log.warning("Error on updating {0}. Statement: {1}. 
You can ignore this message.".format(path, statement)) + except Exception as exp: + pass def get_from_statement(self, path, statement): # type : (str, str) -> List[Tuple] diff --git a/autosubmit/history/database_managers/database_models.py b/autosubmit/history/database_managers/database_models.py index e9609ce09..2ba5d5549 100644 --- a/autosubmit/history/database_managers/database_models.py +++ b/autosubmit/history/database_managers/database_models.py @@ -41,10 +41,11 @@ class RowType: class RowStatus: INITIAL = 0 - COMPLETED = 1 + COMPLETED = 1 PROCESSED = 2 FAULTY = 3 CHANGED = 4 + PENDING_PROCESS = 5 table_name_to_model = { "experiment" : ExperimentRow, diff --git a/autosubmit/history/database_managers/experiment_history_db_manager.py b/autosubmit/history/database_managers/experiment_history_db_manager.py index 00e6467fd..c2b0d8935 100644 --- a/autosubmit/history/database_managers/experiment_history_db_manager.py +++ b/autosubmit/history/database_managers/experiment_history_db_manager.py @@ -21,6 +21,8 @@ import traceback import textwrap import autosubmit.history.utils as HUtils import database_models as Models +from autosubmit.history.data_classes.job_data import JobData +from autosubmit.history.data_classes.experiment_run import ExperimentRun from abc import ABCMeta, abstractmethod from database_manager import DatabaseManager, DEFAULT_JOBDATA_DIR from datetime import datetime @@ -37,11 +39,13 @@ class ExperimentHistoryDbManager(DatabaseManager): def __init__(self, expid, jobdata_dir_path=DEFAULT_JOBDATA_DIR): """ Requires expid and jobdata_dir_path. """ super(ExperimentHistoryDbManager, self).__init__(expid, jobdata_dir_path=jobdata_dir_path) - self.db_version = DEFAULT_DB_VERSION # type : int self._set_schema_changes() self._set_table_queries() self.historicaldb_file_path = os.path.join(self.JOBDATA_DIR, "job_data_{0}.db".format(self.expid)) # type : str - + + def is_header_ready_db_version(self): + return self._get_pragma_version() >= DB_EXPERIMENT_HEADER_SCHEMA_CHANGES + def _set_table_queries(self): """ Sets basic table queries. """ self.create_table_header_query = textwrap.dedent( @@ -131,7 +135,6 @@ class ExperimentHistoryDbManager(DatabaseManager): self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_table_query) self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_index_query) self._set_historical_pragma_version(CURRENT_DB_VERSION) - self.db_version = CURRENT_DB_VERSION def update_historical_database(self): """ Updates the historical database with the latest changes IF necessary. """ @@ -140,22 +143,62 @@ class ExperimentHistoryDbManager(DatabaseManager): self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_index_query) self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_table_header_query) self._set_historical_pragma_version(CURRENT_DB_VERSION) - self.db_version = CURRENT_DB_VERSION - def get_experiment_run_with_max_id(self): + def get_experiment_run_dc_with_max_id(self): + """ Get Current (latest) ExperimentRun data class. """ + return ExperimentRun.from_model(self._get_experiment_run_with_max_id()) + + def _get_experiment_run_with_max_id(self): """ Get Models.ExperimentRunRow for the maximum id run. 
""" statement = self.get_built_select_statement("experiment_run", "run_id > 0 ORDER BY run_id DESC LIMIT 0, 1") max_experiment_run = self.get_from_statement(self.historicaldb_file_path, statement) - if len(max_experiment_run) <= 0: - raise Exception("Error on experiment run retrieval") + if len(max_experiment_run) == 0: + raise None return Models.ExperimentRunRow(*max_experiment_run[0]) def get_job_data_all(self): - """ Gets List of Models.JobDataRow from database. """ + """ Gets all content from job_data as list of Models.JobDataRow from database. """ statement = self.get_built_select_statement("job_data") job_data_rows = self.get_from_statement(self.historicaldb_file_path, statement) return [Models.JobDataRow(*row) for row in job_data_rows] + def update_job_data_dc_by_id(self, job_data_dc): + """ Update JobData data class. Returns latest last=1 row from job_data by job_name. """ + self._update_job_data_by_id(job_data_dc) + return self.get_job_data_dc_unique_latest_by_job_name(job_data_dc.job_name) + + def update_experiment_run_dc_by_id(self, experiment_run_dc): + """ Requires ExperimentRun data class. """ + self._update_experiment_run(experiment_run_dc) + return ExperimentRun.from_model(self.get_experiment_run_with_max_id()) + + def get_job_data_dcs_last_by_run_id(self, run_id): + job_data_rows = self._get_job_data_last_by_run_id(run_id) + return [JobData.from_model(row) for row in job_data_rows] + + def get_all_last_job_data_dcs(self): + """ Gets JobData data classes in job_data for last=1. """ + job_data_rows = self._get_all_last_job_data_rows() + return [JobData.from_model(row) for row in job_data_rows] + + def register_submitted_job_data_dc(self, job_data_dc): + """ Sets previous register to last=0 and inserts the new job_data_dc data class.""" + self._set_current_job_data_rows_last_to_zero_by_job_name(job_data_dc.job_name) + self._insert_job_data(job_data_dc) + return self.get_job_data_dc_unique_latest_by_job_name(job_data_dc.job_name) + + def register_experiment_run_dc(self, experiment_run_dc): + self._insert_experiment_run(experiment_run_dc) + return ExperimentRun.from_model(self.get_experiment_run_with_max_id()) + + def _set_current_job_data_rows_last_to_zero_by_job_name(self, job_name): + """ Sets the column last = 0 for all job_rows by job_name and last = 1. """ + job_data_row_last = self._get_job_data_last_by_name(job_name) + job_data_dc_list = [JobData.from_model(row) for row in job_data_row_last] + for job_data_dc in job_data_dc_list: + job_data_dc.last = 0 + self._update_job_data_by_id(job_data_dc) + def update_many_job_data_change_status(self, changes): # type : (List[Tuple]) -> None """ @@ -165,46 +208,53 @@ class ExperimentHistoryDbManager(DatabaseManager): statement = ''' UPDATE job_data SET finish=?, modified=?, status=?, rowstatus=? WHERE id=? ''' self.execute_many_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, changes) - def update_job_data_by_id(self, job_data): + def _update_job_data_by_id(self, job_data_dc): """ Update job_data table with data class JobData. Update last, submit, start, finish, modified, job_id, status, energy, extra_data, nnodes, ncpus, rowstatus, out, err by id. """ statement = ''' UPDATE job_data SET last=?, submit=?, start=?, finish=?, modified=?, job_id=?, status=?, energy=?, extra_data=?, nnodes=?, ncpus=?, rowstatus=?, out=?, err=? WHERE id=? 
''' - arguments = (job_data.last, job_data.submit, job_data.start, job_data.finish, HUtils.get_current_datetime(), job_data.job_id, job_data.status, job_data.energy, job_data.extra_data, job_data.nnodes, job_data.ncpus, job_data.rowstatus, job_data.out, job_data.err, job_data._id) + arguments = (job_data_dc.last, job_data_dc.submit, job_data_dc.start, job_data_dc.finish, HUtils.get_current_datetime(), job_data_dc.job_id, job_data_dc.status, job_data_dc.energy, job_data_dc.extra_data, job_data_dc.nnodes, job_data_dc.ncpus, job_data_dc.rowstatus, job_data_dc.out, job_data_dc.err, job_data_dc._id) self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) - def update_experiment_run(self, experiment_run): + def _update_experiment_run(self, experiment_run_dc): """ Update experiment_run table with data class ExperimentRun. Updates by run_id (finish, chunk_unit, chunk_size, completed, total, failed, queuing, running, submitted, suspended) """ statement = ''' UPDATE experiment_run SET finish=?, chunk_unit=?, chunk_size=?, completed=?, total=?, failed=?, queuing=?, running=?, submitted=?, suspended=?, modified=? WHERE run_id=? ''' - arguments = (experiment_run.finish, experiment_run.chunk_unit, experiment_run.chunk_size, experiment_run.completed, experiment_run.total, experiment_run.failed, experiment_run.queuing, experiment_run.running, experiment_run.submitted, experiment_run.suspended, HUtils.get_current_datetime(), experiment_run.run_id) + arguments = (experiment_run_dc.finish, experiment_run_dc.chunk_unit, experiment_run_dc.chunk_size, experiment_run_dc.completed, experiment_run_dc.total, experiment_run_dc.failed, experiment_run_dc.queuing, experiment_run_dc.running, experiment_run_dc.submitted, experiment_run_dc.suspended, HUtils.get_current_datetime(), experiment_run_dc.run_id) self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) - def insert_job_data(self, job_data): + def _insert_job_data(self, job_data): # type : (JobData) -> int """ Insert data class JobData into job_data table. """ statement = ''' INSERT INTO job_data(counter, job_name, created, modified, submit, start, finish, status, rowtype, ncpus, wallclock, qos, energy, date, section, member, chunk, last, platform, job_id, extra_data, nnodes, run_id, MaxRSS, AveRSS, out, err, rowstatus) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) ''' arguments = (job_data.counter, job_data.job_name, HUtils.get_current_datetime(), HUtils.get_current_datetime(), job_data.submit, job_data.start, job_data.finish, job_data.status, job_data.rowtype, job_data.ncpus, job_data.wallclock, job_data.qos, job_data.energy, job_data.date, job_data.section, job_data.member, job_data.chunk, job_data.last, job_data.platform, job_data.job_id, job_data.extra_data, job_data.nnodes, job_data.run_id, job_data.MaxRSS, job_data.AveRSS, job_data.out, job_data.err, job_data.rowstatus) return self.insert_statement_with_arguments(self.historicaldb_file_path, statement, arguments) - def insert_experiment_run(self, experiment_run): + def _insert_experiment_run(self, experiment_run): """ Insert data class ExperimentRun into database """ statement = ''' INSERT INTO experiment_run(created, modified, start, finish, chunk_unit, chunk_size, completed, total, failed, queuing, running, submitted, suspended, metadata) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?) 
'''
 arguments = (HUtils.get_current_datetime(), HUtils.get_current_datetime(), experiment_run.start, experiment_run.finish, experiment_run.chunk_unit, experiment_run.chunk_size, experiment_run.completed, experiment_run.total, experiment_run.failed, experiment_run.queuing, experiment_run.running, experiment_run.submitted, experiment_run.suspended, experiment_run.metadata)
 return self.insert_statement_with_arguments(self.historicaldb_file_path, statement, arguments)

- def get_job_data_last_by_run_id_and_finished(self, run_id):
+ def get_job_data_dc_unique_latest_by_job_name(self, job_name):
+ """ Returns JobData data class for the latest job_data row with last=1 by job_name. """
+ job_data_row_last = self._get_job_data_last_by_name(job_name)
+ if len(job_data_row_last) > 0:
+ return JobData.from_model(job_data_row_last[0])
+ return None
+
+ def _get_job_data_last_by_run_id_and_finished(self, run_id):
 """ Get List of Models.JobDataRow for last=1, finished > 0 and run_id """
 statement = self.get_built_select_statement("job_data", "run_id=? and last=1 and finish > 0 and rowtype >= 2 ORDER BY id")
 arguments = (run_id,)
 job_data_rows = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments)
 return [Models.JobDataRow(*row) for row in job_data_rows]

- def get_job_data_last_by_run_id(self, run_id):
+ def _get_job_data_last_by_run_id(self, run_id):
 """ Get List of Models.JobDataRow for last=1 and run_id """
 statement = self.get_built_select_statement("job_data", "run_id=? and last=1 and rowtype >= 2 ORDER BY id")
 arguments = (run_id,)
@@ -217,8 +267,14 @@ class ExperimentHistoryDbManager(DatabaseManager):
 arguments = (job_name,)
 job_data_rows = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments)
 return [Models.JobDataRow(*row) for row in job_data_rows]
+
+ def _get_all_last_job_data_rows(self):
+ """ Get List of Models.JobDataRow for last=1. """
+ statement = self.get_built_select_statement("job_data", "last=1")
+ job_data_rows = self.get_from_statement(self.historicaldb_file_path, statement)
+ return [Models.JobDataRow(*row) for row in job_data_rows]

- def get_job_data_last_by_name(self, job_name):
+ def _get_job_data_last_by_name(self, job_name):
 """ Get List of Models.JobDataRow for job_name and last=1 """
 statement = self.get_built_select_statement("job_data", "last=1 and job_name=? ORDER BY counter DESC")
 arguments = (job_name,)
@@ -226,6 +282,7 @@ class ExperimentHistoryDbManager(DatabaseManager):
 return [Models.JobDataRow(*row) for row in job_data_rows_last]

 def get_job_data_max_counter(self):
+ """ Returns the maximum value of the counter column in job_data. """
 statement = "SELECT MAX(counter) as maxcounter FROM job_data"
 counter_result = self.get_from_statement(self.historicaldb_file_path, statement)
 if len(counter_result) <= 0:
@@ -234,10 +291,10 @@ class ExperimentHistoryDbManager(DatabaseManager):
 max_counter = Models.MaxCounterRow(*counter_result[0]).maxcounter
 return max_counter if max_counter else DEFAULT_MAX_COUNTER

- def delete_job_data(self, _id):
- """ Deletes row in job_data by id. Useful for testing. """
+ def delete_job_data(self, id):
+ """ Deletes row from job_data by id. Useful for testing. """
 statement = ''' DELETE FROM job_data WHERE id=?
''' - arguments = (_id, ) + arguments = (id, ) self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) def delete_experiment_run(self, run_id): diff --git a/autosubmit/history/database_managers/tests.py b/autosubmit/history/database_managers/tests.py index afc1caeb4..4c07b8f7a 100644 --- a/autosubmit/history/database_managers/tests.py +++ b/autosubmit/history/database_managers/tests.py @@ -75,15 +75,15 @@ class TestExperimentHistoryDbManager(unittest.TestCase): pass def test_get_max_id(self): - max_item = self.experiment_database.get_experiment_run_with_max_id() - self.assertTrue(len(max_item) > 0) + max_item = self.experiment_database.get_experiment_run_dc_with_max_id() + self.assertTrue(max_item.run_id > 0) self.assertTrue(max_item.run_id >= 18) # Max is 18 def test_pragma(self): self.assertTrue(self.experiment_database._get_pragma_version() == 16) def test_get_job_data(self): - job_data = self.experiment_database.get_job_data_last_by_name("a29z_20000101_fc0_1_SIM") + job_data = self.experiment_database._get_job_data_last_by_name("a29z_20000101_fc0_1_SIM") self.assertTrue(len(job_data) > 0) self.assertTrue(job_data[0].last == 1) self.assertTrue(job_data[0].job_name == "a29z_20000101_fc0_1_SIM") @@ -91,10 +91,10 @@ class TestExperimentHistoryDbManager(unittest.TestCase): job_data = self.experiment_database.get_job_data_by_name("a29z_20000101_fc0_1_SIM") self.assertTrue(job_data[0].job_name == "a29z_20000101_fc0_1_SIM") - job_data = self.experiment_database.get_job_data_last_by_run_id(18) # Latest + job_data = self.experiment_database._get_job_data_last_by_run_id(18) # Latest self.assertTrue(len(job_data) > 0) - job_data = self.experiment_database.get_job_data_last_by_run_id_and_finished(18) + job_data = self.experiment_database._get_job_data_last_by_run_id_and_finished(18) self.assertTrue(len(job_data) > 0) job_data = self.experiment_database.get_job_data_all() @@ -102,19 +102,19 @@ class TestExperimentHistoryDbManager(unittest.TestCase): def test_insert_and_delete_experiment_run(self): new_run = ExperimentRun(19) - new_id = self.experiment_database.insert_experiment_run(new_run) + new_id = self.experiment_database._insert_experiment_run(new_run) self.assertIsNotNone(new_id) - last_experiment_run = self.experiment_database.get_experiment_run_with_max_id() + last_experiment_run = self.experiment_database.get_experiment_run_dc_with_max_id() self.assertTrue(new_id == last_experiment_run.run_id) self.experiment_database.delete_experiment_run(new_id) - last_experiment_run = self.experiment_database.get_experiment_run_with_max_id() + last_experiment_run = self.experiment_database.get_experiment_run_dc_with_max_id() self.assertTrue(new_id != last_experiment_run.run_id) def test_insert_and_delete_job_data(self): - max_run_id = self.experiment_database.get_experiment_run_with_max_id().run_id + max_run_id = self.experiment_database.get_experiment_run_dc_with_max_id().run_id new_job_data_name = "test_001_name_{0}".format(int(time.time())) new_job_data = JobData(_id=1, job_name=new_job_data_name, run_id=max_run_id) - new_job_data_id = self.experiment_database.insert_job_data(new_job_data) + new_job_data_id = self.experiment_database._insert_job_data(new_job_data) self.assertIsNotNone(new_job_data_id) self.experiment_database.delete_job_data(new_job_data_id) job_data = self.experiment_database.get_job_data_by_name(new_job_data_name) @@ -122,30 +122,29 @@ class TestExperimentHistoryDbManager(unittest.TestCase): def test_update_experiment_run(self): - 
last_experiment_run = self.experiment_database.get_experiment_run_with_max_id() # 18 - experiment_run_data_class = ExperimentRun.from_model(last_experiment_run) - backup_run = ExperimentRun.from_model(last_experiment_run) + experiment_run_data_class = self.experiment_database.get_experiment_run_dc_with_max_id() # 18 + backup_run = self.experiment_database.get_experiment_run_dc_with_max_id() experiment_run_data_class.chunk_unit = "unouno" experiment_run_data_class.running = random.randint(1, 100) experiment_run_data_class.queuing = random.randint(1, 100) experiment_run_data_class.suspended = random.randint(1, 100) - self.experiment_database.update_experiment_run(experiment_run_data_class) - last_experiment_run = self.experiment_database.get_experiment_run_with_max_id() # 18 + self.experiment_database._update_experiment_run(experiment_run_data_class) + last_experiment_run = self.experiment_database.get_experiment_run_dc_with_max_id() # 18 self.assertTrue(last_experiment_run.chunk_unit == experiment_run_data_class.chunk_unit) self.assertTrue(last_experiment_run.running == experiment_run_data_class.running) self.assertTrue(last_experiment_run.queuing == experiment_run_data_class.queuing) self.assertTrue(last_experiment_run.suspended == experiment_run_data_class.suspended) - self.experiment_database.update_experiment_run(backup_run) + self.experiment_database._update_experiment_run(backup_run) def test_job_data_from_model(self): - job_data_rows = self.experiment_database.get_job_data_last_by_name("a29z_20000101_fc0_1_SIM") + job_data_rows = self.experiment_database._get_job_data_last_by_name("a29z_20000101_fc0_1_SIM") job_data_row_first = job_data_rows[0] job_data_data_class = JobData.from_model(job_data_row_first) self.assertTrue(job_data_row_first.job_name == job_data_data_class.job_name) def test_update_job_data_processed(self): current_time = time.time() - job_data_rows = self.experiment_database.get_job_data_last_by_name("a29z_20000101_fc0_1_SIM") + job_data_rows = self.experiment_database._get_job_data_last_by_name("a29z_20000101_fc0_1_SIM") job_data_row_first = job_data_rows[0] job_data_data_class = JobData.from_model(job_data_row_first) backup_job_dc = JobData.from_model(job_data_row_first) @@ -153,14 +152,14 @@ class TestExperimentHistoryDbManager(unittest.TestCase): job_data_data_class.ncpus = random.randint(1, 1000) job_data_data_class.status = "DELAYED" job_data_data_class.finish = current_time - self.experiment_database.update_job_data_by_id(job_data_data_class) - job_data_rows_current = self.experiment_database.get_job_data_last_by_name("a29z_20000101_fc0_1_SIM") + self.experiment_database._update_job_data_by_id(job_data_data_class) + job_data_rows_current = self.experiment_database._get_job_data_last_by_name("a29z_20000101_fc0_1_SIM") job_data_row_first = job_data_rows_current[0] self.assertTrue(job_data_row_first.nnodes == job_data_data_class.nnodes) self.assertTrue(job_data_row_first.ncpus == job_data_data_class.ncpus) self.assertTrue(job_data_row_first.status == job_data_data_class.status) self.assertTrue(job_data_row_first.finish == job_data_data_class.finish) - self.experiment_database.update_job_data_by_id(backup_job_dc) + self.experiment_database._update_job_data_by_id(backup_job_dc) def test_bulk_update(self): current_time = time.time() @@ -189,6 +188,28 @@ class TestExperimentHistoryDbManager(unittest.TestCase): max_existing_counter = self.experiment_database.get_job_data_max_counter() self.assertTrue(max_existing_counter > 0) + def 
test_register_submitted_job_data_dc(self):
+ job_data_dc = self.experiment_database.get_job_data_dc_unique_latest_by_job_name("a29z_20000101_fc0_1_SIM")
+ max_counter = self.experiment_database.get_job_data_max_counter()
+ self.assertTrue(max_counter > 0)
+ self.assertTrue(job_data_dc.counter > 0)
+ next_counter = max(max_counter, job_data_dc.counter + 1)
+ self.assertTrue(next_counter >= max_counter)
+ self.assertTrue(next_counter >= job_data_dc.counter + 1)
+ job_data_dc.counter = next_counter
+ job_data_dc_current = self.experiment_database.register_submitted_job_data_dc(job_data_dc)
+ self.assertTrue(job_data_dc._id < job_data_dc_current._id)
+ job_data_last_list = self.experiment_database._get_job_data_last_by_name(job_data_dc.job_name)
+ self.assertTrue(len(job_data_last_list) == 1)
+ self.experiment_database.delete_job_data(job_data_last_list[0].id)
+ job_data_dc.last = 1
+ updated_job_data_dc = self.experiment_database.update_job_data_dc_by_id(job_data_dc)
+ self.assertTrue(job_data_dc._id == updated_job_data_dc._id)
+ job_data_dc = self.experiment_database.get_job_data_dc_unique_latest_by_job_name("a29z_20000101_fc0_1_SIM")
+ self.assertTrue(job_data_dc._id == updated_job_data_dc._id)
+
+ def test_experiment_run_dc(self):
+ experiment_run_dc = self.experiment_database.get_experiment_run_dc_with_max_id()
+ self.assertIsNotNone(experiment_run_dc)
+ self.assertTrue(experiment_run_dc.run_id > 0)

 if __name__ == '__main__':
 unittest.main()
\ No newline at end of file
diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py
index 274e6c0c7..485a574c0 100644
--- a/autosubmit/history/experiment_history.py
+++ b/autosubmit/history/experiment_history.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python

 # Copyright 2015-2020 Earth Sciences Department, BSC-CNS
 # This file is part of Autosubmit.

 # Autosubmit is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.

 # Autosubmit is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 # GNU General Public License for more details.

 # You should have received a copy of the GNU General Public License
 # along with Autosubmit. If not, see .
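# Intended call sequence for this module, sketched from how this patch wires it up in
# autosubmit.py and exercises it in the tests; the expid, job name, and timestamps
# below are illustrative assumptions, not values taken from this patch:
#
#   exp_history = ExperimentHistory("a29z", jobdata_dir_path=DEFAULT_JOBDATA_DIR)
#   exp_history.initialize_database()
#   exp_history.write_submit_time("a29z_20000101_fc0_1_SIM", submit=1634133076, status="SUBMITTED")
#   exp_history.write_start_time("a29z_20000101_fc0_1_SIM", start=1634133136, status="RUNNING")
#   exp_history.write_finish_time("a29z_20000101_fc0_1_SIM", finish=1634133196, status="COMPLETED")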
import os import traceback -import slurm_parser as SlurmParser +import database_managers.database_models as Models +import utils as HUtils +from time import time, sleep from database_managers.experiment_history_db_manager import ExperimentHistoryDbManager, DEFAULT_JOBDATA_DIR -from database_managers.database_models import RowType from data_classes.job_data import JobData +from data_classes.experiment_run import ExperimentRun +from platform_monitor.slurm_monitor import SlurmMonitor from logging import Logging +SECONDS_WAIT_PLATFORM = 60 + class ExperimentHistory(): - def __init__(self, expid, jobdata_dir_path=DEFAULT_JOBDATA_DIR): + def __init__(self, expid, jobdata_dir_path=DEFAULT_JOBDATA_DIR): self.expid = expid self._log = Logging(expid) + self._job_data_dir_path = jobdata_dir_path try: self.manager = ExperimentHistoryDbManager(self.expid, jobdata_dir_path=jobdata_dir_path) - if os.path.exists(self.manager.historicaldb_file_path): - self.manager.update_historical_database() - else: - self.manager.create_historical_database() + except Exception as exp: + self._log.log(str(exp), traceback.format_exc()) + self.manager = None + def initialize_database(self): + try: + if self.my_database_exists(): + self.manager.update_historical_database() + else: + self.manager.create_historical_database() except Exception as exp: self._log.log(str(exp), traceback.format_exc()) self.manager = None - + def my_database_exists(self): + return os.path.exists(self.manager.historicaldb_file_path) + + def is_header_ready(self): + if self.manager: + return self.manager.is_header_ready_db_version() + return False + + def write_submit_time(self, job_name, submit=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", member="", section="", chunk=0, platform="NA", job_id=0, wrapper_queue=None, wrapper_code=None): try: next_counter = self._get_next_counter_by_job_name(job_name) @@ -57,14 +76,14 @@ class ExperimentHistory(): chunk=chunk, platform=platform, job_id=job_id) - return self._register_submitted_job_data_dc(job_data_dc) + return self.manager.register_submitted_job_data_dc(job_data_dc) except Exception as exp: self._log.log(str(exp), traceback.format_exc()) return None def write_start_time(self, job_name, start=0, status="UNKWOWN", ncpus=0, wallclock="00:00", qos="debug", date="", member="", section="", chunk=0, platform="NA", job_id=0, wrapper_queue=None, wrapper_code=None): try: - job_data_dc_last = self.get_job_data_dc_unique_latest_by_job_name(job_name) + job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) if not job_data_dc_last: job_data_dc_last = self.write_submit_time(job_name=job_name, status=status, ncpus=ncpus, wallclock=wallclock, qos=qos, date=date, member=member, section=section, chunk=chunk, platform=platform, job_id=job_id, wrapper_queue=wrapper_queue, wrapper_code=wrapper_code) self._log.log("write_start_time {0} start not found.".format(job_name)) @@ -73,61 +92,170 @@ class ExperimentHistory(): job_data_dc_last.status = status job_data_dc_last.rowtype = self._get_defined_rowtype(wrapper_code) job_data_dc_last.job_id = job_id - return self.update_job_data_dc_by_id(job_data_dc_last) + return self.manager.update_job_data_dc_by_id(job_data_dc_last) except Exception as exp: self._log.log(str(exp), traceback.format_exc()) return None def write_finish_time(self, job_name, finish=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", member="", section="", chunk=0, platform="NA", job_id=0, platform_object=None, packed=False, 
parent_id_list=None, no_slurm=True, out_file_path=None, out_file=None, err_file=None, wrapper_queue=None, wrapper_code=None):
 try:
- job_data_dc_last = self.get_job_data_dc_unique_latest_by_job_name(job_name)
+ job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name)
 if not job_data_dc_last:
 job_data_dc_last = self.write_submit_time(job_name=job_name, status=status, ncpus=ncpus, wallclock=wallclock, qos=qos, date=date, member=member, section=section, chunk=chunk, platform=platform, job_id=job_id, wrapper_queue=wrapper_queue, wrapper_code=wrapper_code)
 self._log.log("write_finish_time {0} submit not found.".format(job_name))
- # writing finish
-
+ job_data_dc_last.finish = finish if finish > 0 else int(time())
+ job_data_dc_last.status = status
+ job_data_dc_last.job_id = job_id
+ job_data_dc_last.rowstatus = Models.RowStatus.PENDING_PROCESS
+ job_data_dc_last.out = out_file if out_file else ""
+ job_data_dc_last.err = err_file if err_file else ""
+ return self.manager.update_job_data_dc_by_id(job_data_dc_last)
 except Exception as exp:
 self._log.log(str(exp), traceback.format_exc())
 return None

+ def write_platform_data_after_finish(self, job_data_dc, platform_obj):
+ """ Queries the platform for the energy data of the finished job and updates its job_data row. """
+ try:
+ sleep(SECONDS_WAIT_PLATFORM)
+ slurm_monitor = SlurmMonitor(platform_obj.check_job_energy(job_data_dc.job_id))
+ # Get current rows in run_id by rowtype (wrapper code)
+ # Add children names column to job_data

- def _register_submitted_job_data_dc(self, job_data_dc):
- self._set_current_job_data_rows_last_to_zero(job_data_dc.job_name)
- self.manager.insert_job_data(job_data_dc)
- return self.get_job_data_dc_unique_latest_by_job_name(job_data_dc.job_name)
-
- def get_job_data_dc_unique_latest_by_job_name(self, job_name):
- job_data_row_last = self.manager.get_job_data_last_by_name(job_name)
- if len(job_data_row_last) > 0:
- return JobData.from_model(job_data_row_last[0])
- return None
-
- def update_job_data_dc_by_id(self, job_data_dc):
- self.manager.update_job_data_by_id(job_data_dc)
- return self.get_job_data_dc_unique_latest_by_job_name(job_data_dc.job_name)
-
- def _set_current_job_data_rows_last_to_zero(self, job_name):
- """ Sets the column last = 0 for all job_rows by job_name and last = 1. """
- job_data_row_last = self.manager.get_job_data_last_by_name(job_name)
- job_data_dc_list = [JobData.from_model(row) for row in job_data_row_last]
- for job_data_dc in job_data_dc_list:
- job_data_dc.last = 0
- self.manager.update_job_data_by_id(job_data_dc)
+ except Exception as exp:
+ self._log.log(str(exp), traceback.format_exc())
+ return None

- def _get_next_counter_by_job_name(self, job_name):
- job_data_dc = self.get_job_data_dc_unique_latest_by_job_name(job_name)
- max_counter = self.manager.get_job_data_max_counter()
- if job_data_dc:
- return max(max_counter, job_data_dc.counter)
- else:
- return max_counter
+ def process_status_changes(self, job_list=None, chunk_unit="NA", chunk_size=0, current_config=""):
+ """ Detect status differences between job_list and current job_data rows, and update. Creates a new run if necessary.
""" + try: + current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() + update_these_changes = self._get_built_list_of_changes(job_list) + if len(update_these_changes) > 0: + self.manager.update_many_job_data_change_status(update_these_changes) + if self.should_we_create_a_new_run(job_list, len(update_these_changes), current_experiment_run_dc.total): + return self.create_new_experiment_run(chunk_unit, chunk_size, current_config) + return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list) + except Exception as exp: + self._log.log(str(exp), traceback.format_exc()) + + def process_job_list_changes_to_experiment_totals(self, job_list=None): + """ Updates current experiment_run row with totals calculated from job_list. """ + try: + current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() + return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list) + except Exception as exp: + self._log.log(str(exp), traceback.format_exc()) + def should_we_create_a_new_run(self, job_list, changes_count, total_count): + if len(job_list) != total_count: + return True + if changes_count > int(self._get_date_member_completed_count(job_list)*0.9): + return True + return False + + def update_counts_on_experiment_run_dc(self, experiment_run_dc, job_list=None): + """ Return updated row as Models.ExperimentRun. """ + status_counts = self.get_status_counts_from_job_list(job_list) + experiment_run_dc.completed = status_counts[HUtils.SupportedStatus.COMPLETED] + experiment_run_dc.failed = status_counts[HUtils.SupportedStatus.FAILED] + experiment_run_dc.queuing = status_counts[HUtils.SupportedStatus.QUEUING] + experiment_run_dc.submitted = status_counts[HUtils.SupportedStatus.SUBMITTED] + experiment_run_dc.running = status_counts[HUtils.SupportedStatus.RUNNING] + experiment_run_dc.suspended = status_counts[HUtils.SupportedStatus.SUSPENDED] + return self.manager.update_experiment_run_dc_by_id(experiment_run_dc) + + def finish_current_experiment_run(self): + current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() + current_experiment_run_dc.finish = int(time()) + return self.manager.update_experiment_run_dc_by_id(current_experiment_run_dc) + + def create_new_experiment_run(self, chunk_unit="NA", chunk_size=0, current_config=""): + """ Also writes the finish timestamp of the previous run. """ + self.finish_current_experiment_run() + return self._create_new_experiment_run_dc_with_counts(chunk_unit=chunk_unit, chunk_size=chunk_size, current_config=current_config) + + def detect_changes_in_job_list(self, job_list): + """ Detect changes in job_list compared to the current contents of job_data table. Returns a list of JobData data classes where the status of each item is the new status.""" + job_name_to_job = {job.name: job for job in job_list} + current_job_data_dcs = self.manager.get_all_last_job_data_dcs() + differences = [] + for job_dc in current_job_data_dcs: + if job_dc.job_name in job_name_to_job and job_dc.status != job_name_to_job[job_dc.job_name].status_str: + job_dc.status = job_name_to_job[job_dc.job_name].status_str + differences.append(job_dc) + return differences + + def update_experiment_history_from_job_list(self, job_list): + """ job_list: List of objects, each object must have attributes date, member, status_str. 
""" + raise NotImplementedError + + def _create_new_experiment_run_dc_with_counts(self, chunk_unit, chunk_size, current_config="", job_list=None): + """ Create new experiment_run row and return the new Models.ExperimentRun data class from database. """ + status_counts = self.get_status_counts_from_job_list(job_list) + experiment_run_dc = ExperimentRun(0, + chunk_unit=chunk_unit, + chunk_size=chunk_size, + metadata=current_config, + completed=status_counts[HUtils.SupportedStatus.COMPLETED], + total=status_counts["TOTAL"], + failed=status_counts[HUtils.SupportedStatus.FAILED], + queuing=status_counts[HUtils.SupportedStatus.QUEUING], + running=status_counts[HUtils.SupportedStatus.RUNNING], + submitted=status_counts[HUtils.SupportedStatus.SUBMITTED], + suspended=status_counts[HUtils.SupportedStatus.SUSPENDED]) + return self.manager.register_experiment_run_dc(experiment_run_dc) + def _get_defined_rowtype(self, code): if code: return code else: - return RowType.NORMAL + return Models.RowType.NORMAL def _get_defined_queue_name(self, wrapper_queue, wrapper_code, qos): if wrapper_code and wrapper_code > 2 and wrapper_queue is not None: return wrapper_queue - return qos \ No newline at end of file + return qos + + def _get_next_counter_by_job_name(self, job_name): + """ Return the counter attribute from the latest job data row by job_name. """ + job_data_dc = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) + max_counter = self.manager.get_job_data_max_counter() + if job_data_dc: + return max(max_counter, job_data_dc.counter + 1) + else: + return max_counter + + def _get_date_member_completed_count(self, job_list): + """ Each item in the job_list must have attributes: date, member, status_str. """ + job_list = job_list if job_list else [] + return sum(1 for job in job_list if job.date is not None and job.member is None and job.status_str == HUtils.SupportedStatus.COMPLETED) + + def _get_built_list_of_changes(self, job_list): + """ Return: List of (current timestamp, current datetime str, status, rowstatus, id in job_data). One tuple per change. """ + job_data_dcs = self.detect_changes_in_job_list(job_list) + return [(int(time()), HUtils.get_current_datetime(), job.status, Models.RowStatus.CHANGED, job._id) for job in job_data_dcs] + + def get_status_counts_from_job_list(self, job_list): + """ + Return dict with keys COMPLETED, FAILED, QUEUING, SUBMITTED, RUNNING, SUSPENDED, TOTAL. 
+ """ + result = { + HUtils.SupportedStatus.COMPLETED: 0, + HUtils.SupportedStatus.FAILED: 0, + HUtils.SupportedStatus.QUEUING: 0, + HUtils.SupportedStatus.SUBMITTED: 0, + HUtils.SupportedStatus.RUNNING: 0, + HUtils.SupportedStatus.RUNNING: 0, + "TOTAL": 0 + } + + if not job_list: + job_list = [] + + for job in job_list: + if job.status_str in result: + result[job.status_str] += 1 + result["TOTAL"] = len(job_list) + return result + \ No newline at end of file diff --git a/autosubmit/history/logging.py b/autosubmit/history/logging.py index 6b329054c..fa379c61a 100644 --- a/autosubmit/history/logging.py +++ b/autosubmit/history/logging.py @@ -18,7 +18,7 @@ import os import utils as HUtils -class Logging: +class Logging(): def __init__(self, expid): self.expid = expid diff --git a/autosubmit/history/platform_monitor/__init__.py b/autosubmit/history/platform_monitor/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autosubmit/history/platform_monitor/output_examples/pending.txt b/autosubmit/history/platform_monitor/output_examples/pending.txt new file mode 100644 index 000000000..007e88d08 --- /dev/null +++ b/autosubmit/history/platform_monitor/output_examples/pending.txt @@ -0,0 +1 @@ + 17838842 PENDING 4 1 2021-10-11T10:55:53 Unknown Unknown diff --git a/autosubmit/history/platform_monitor/output_examples/wrapper1.txt b/autosubmit/history/platform_monitor/output_examples/wrapper1.txt new file mode 100644 index 000000000..61b855cd1 --- /dev/null +++ b/autosubmit/history/platform_monitor/output_examples/wrapper1.txt @@ -0,0 +1,3 @@ + 12535498 COMPLETED 2 1 2020-11-18T13:54:24 2020-11-18T13:55:55 2020-11-18T13:56:10 2.77K + 12535498.batch COMPLETED 2 1 2020-11-18T13:55:55 2020-11-18T13:55:55 2020-11-18T13:56:10 2.69K 659K 659K + 12535498.extern COMPLETED 2 1 2020-11-18T13:55:55 2020-11-18T13:55:55 2020-11-18T13:56:10 2.77K 24K 24K \ No newline at end of file diff --git a/autosubmit/history/platform_monitor/output_examples/wrapper2.txt b/autosubmit/history/platform_monitor/output_examples/wrapper2.txt new file mode 100644 index 000000000..082eb0105 --- /dev/null +++ b/autosubmit/history/platform_monitor/output_examples/wrapper2.txt @@ -0,0 +1,3 @@ + 12535498 COMPLETED 2 1 2020-11-18T13:54:24 2020-11-18T13:55:55 2020-11-18T13:56:10 2.77K + 12535498.batch COMPLETED 2 1 2020-11-18T13:55:55 2020-11-18T13:55:55 2020-11-18T13:56:10 2.69K 659K 659K + 12535498.0 COMPLETED 2 1 2020-11-18T13:55:55 2020-11-18T13:55:55 2020-11-18T13:56:10 2.77K 24K 24K \ No newline at end of file diff --git a/autosubmit/history/platform_monitor/output_examples/wrapper_big.txt b/autosubmit/history/platform_monitor/output_examples/wrapper_big.txt new file mode 100644 index 000000000..65c6c1191 --- /dev/null +++ b/autosubmit/history/platform_monitor/output_examples/wrapper_big.txt @@ -0,0 +1,33 @@ + 17857525 COMPLETED 10 1 2021-10-13T15:51:16 2021-10-13T15:51:17 2021-10-13T15:52:47 19.05K + 17857525.batch COMPLETED 10 1 2021-10-13T15:51:17 2021-10-13T15:51:17 2021-10-13T15:52:47 13.38K 6264K 6264K + 17857525.extern COMPLETED 10 1 2021-10-13T15:51:17 2021-10-13T15:51:17 2021-10-13T15:52:47 13.66K 473K 68K + 17857525.0 COMPLETED 10 1 2021-10-13T15:51:21 2021-10-13T15:51:21 2021-10-13T15:51:22 186 352K 312.30K + 17857525.1 COMPLETED 10 1 2021-10-13T15:51:23 2021-10-13T15:51:23 2021-10-13T15:51:24 186 420K 306.70K + 17857525.2 COMPLETED 10 1 2021-10-13T15:51:24 2021-10-13T15:51:24 2021-10-13T15:51:27 188 352K 325.80K + 17857525.3 COMPLETED 10 1 2021-10-13T15:51:28 2021-10-13T15:51:28 2021-10-13T15:51:29 192 
352K 341.90K + 17857525.4 COMPLETED 10 1 2021-10-13T15:51:29 2021-10-13T15:51:29 2021-10-13T15:51:31 186 352K 335.20K + 17857525.5 COMPLETED 10 1 2021-10-13T15:51:31 2021-10-13T15:51:31 2021-10-13T15:51:32 186 352K 329.80K + 17857525.6 COMPLETED 10 1 2021-10-13T15:51:32 2021-10-13T15:51:32 2021-10-13T15:51:33 184 428K 311.10K + 17857525.7 COMPLETED 10 1 2021-10-13T15:51:34 2021-10-13T15:51:34 2021-10-13T15:51:35 185 416K 341.40K + 17857525.8 COMPLETED 10 1 2021-10-13T15:51:35 2021-10-13T15:51:35 2021-10-13T15:51:37 180 428K 317.40K + 17857525.9 COMPLETED 10 1 2021-10-13T15:51:39 2021-10-13T15:51:39 2021-10-13T15:51:42 17 424K 272.70K + 17857525.10 COMPLETED 10 1 2021-10-13T15:51:42 2021-10-13T15:51:42 2021-10-13T15:51:44 185 356K 304.20K + 17857525.11 COMPLETED 10 1 2021-10-13T15:51:44 2021-10-13T15:51:44 2021-10-13T15:51:45 189 352K 322.20K + 17857525.12 COMPLETED 10 1 2021-10-13T15:51:45 2021-10-13T15:51:45 2021-10-13T15:51:47 184 388K 310.70K + 17857525.13 COMPLETED 10 1 2021-10-13T15:51:48 2021-10-13T15:51:48 2021-10-13T15:51:49 183 352K 336.90K + 17857525.14 COMPLETED 10 1 2021-10-13T15:51:49 2021-10-13T15:51:49 2021-10-13T15:51:51 183 428K 346.60K + 17857525.15 COMPLETED 10 1 2021-10-13T15:51:51 2021-10-13T15:51:51 2021-10-13T15:51:53 187 352K 335.90K + 17857525.16 COMPLETED 10 1 2021-10-13T15:51:54 2021-10-13T15:51:54 2021-10-13T15:51:55 184 424K 270K + 17857525.17 COMPLETED 10 1 2021-10-13T15:51:55 2021-10-13T15:51:55 2021-10-13T15:51:57 186 352K 304.80K + 17857525.18 COMPLETED 10 1 2021-10-13T15:51:57 2021-10-13T15:51:57 2021-10-13T15:51:59 182 428K 357K + 17857525.19 COMPLETED 10 1 2021-10-13T15:51:59 2021-10-13T15:51:59 2021-10-13T15:52:01 185 420K 280.60K + 17857525.20 COMPLETED 10 1 2021-10-13T15:52:01 2021-10-13T15:52:01 2021-10-13T15:52:03 185 352K 339.90K + 17857525.21 COMPLETED 10 1 2021-10-13T15:52:04 2021-10-13T15:52:04 2021-10-13T15:52:05 188 356K 340.20K + 17857525.22 COMPLETED 10 1 2021-10-13T15:52:06 2021-10-13T15:52:06 2021-10-13T15:52:08 185 352K 287.50K + 17857525.23 COMPLETED 10 1 2021-10-13T15:52:08 2021-10-13T15:52:08 2021-10-13T15:52:11 187 420K 349.40K + 17857525.24 COMPLETED 10 1 2021-10-13T15:52:14 2021-10-13T15:52:14 2021-10-13T15:52:16 185 420K 353.70K + 17857525.25 COMPLETED 10 1 2021-10-13T15:52:20 2021-10-13T15:52:20 2021-10-13T15:52:22 187 352K 340.30K + 17857525.26 COMPLETED 10 1 2021-10-13T15:52:24 2021-10-13T15:52:24 2021-10-13T15:52:32 186 420K 345.80K + 17857525.27 COMPLETED 10 1 2021-10-13T15:52:37 2021-10-13T15:52:37 2021-10-13T15:52:39 184 352K 341K + 17857525.28 COMPLETED 10 1 2021-10-13T15:52:41 2021-10-13T15:52:41 2021-10-13T15:52:43 184 352K 326.20K + 17857525.29 COMPLETED 10 1 2021-10-13T15:52:44 2021-10-13T15:52:44 2021-10-13T15:52:47 183 352K 319.30K diff --git a/autosubmit/history/platform_monitor/platform_monitor.py b/autosubmit/history/platform_monitor/platform_monitor.py new file mode 100644 index 000000000..10b044703 --- /dev/null +++ b/autosubmit/history/platform_monitor/platform_monitor.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+
+# Autosubmit is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with Autosubmit. If not, see .
+
+from abc import ABCMeta, abstractmethod
+
+class PlatformMonitor():
+ __metaclass__ = ABCMeta
+ def __init__(self, platform_output):
+ self.original_input = platform_output
+ self.input = str(platform_output).strip()
+ self.input_items = []
+
+ @abstractmethod
+ def identify_input_rows(self):
+ """ Parses the stored platform output into input items. """
+
diff --git a/autosubmit/history/platform_monitor/platform_utils.py b/autosubmit/history/platform_monitor/platform_utils.py
new file mode 100644
index 000000000..43a015ab9
--- /dev/null
+++ b/autosubmit/history/platform_monitor/platform_utils.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+
+# Copyright 2015-2020 Earth Sciences Department, BSC-CNS
+# This file is part of Autosubmit.
+
+# Autosubmit is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# Autosubmit is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with Autosubmit. If not, see .
+
+import os
+from time import mktime
+from datetime import datetime
+
+SLURM_DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S"
+
+def parse_output_number(string_number):
+ """
+ Parses number in format 1.0K 1.0M 1.0G
+
+ :param string_number: String representation of number
+ :type string_number: str
+ :return: number in float format
+ :rtype: float
+ """
+ number = 0.0
+ if string_number:
+ last_letter = string_number.strip()[-1]
+ multiplier = 1.0
+ value = string_number
+ if last_letter == "G":
+ multiplier = 1000000000.0 # Billion
+ value = string_number[:-1]
+ elif last_letter == "M":
+ multiplier = 1000000.0 # Million
+ value = string_number[:-1]
+ elif last_letter == "K":
+ multiplier = 1000.0 # Thousand
+ value = string_number[:-1]
+ try:
+ # Keep the conversion inside the try so malformed values (e.g. "Unknown") return 0.0 instead of raising
+ number = float(value) * multiplier
+ except Exception:
+ number = 0.0
+ return number
+
+def try_parse_time_to_timestamp(time_string):
+ """
+ Receives a string in format "%Y-%m-%dT%H:%M:%S" and tries to parse it to a timestamp; returns 0 on failure.
+ """
+ try:
+ return int(mktime(datetime.strptime(time_string, SLURM_DATETIME_FORMAT).timetuple()))
+ except Exception:
+ return 0
+
+def read_example(example_name):
+ source_path = "autosubmit/history/platform_monitor/output_examples/"
+ file_path = os.path.join(source_path, example_name)
+ with open(file_path, "r") as fp:
+ output_ssh = fp.read()
+ return output_ssh
\ No newline at end of file
diff --git a/autosubmit/history/platform_monitor/slurm_monitor.py b/autosubmit/history/platform_monitor/slurm_monitor.py
new file mode 100644
index 000000000..0727e2de2
--- /dev/null
+++ b/autosubmit/history/platform_monitor/slurm_monitor.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+
+# Copyright 2015-2020 Earth Sciences Department, BSC-CNS
+# This file is part of Autosubmit.
+
+# Autosubmit is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# Autosubmit is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with Autosubmit. If not, see .
+
+from platform_monitor import PlatformMonitor
+from slurm_monitor_item import SlurmMonitorItem
+
+class SlurmMonitor(PlatformMonitor):
+ """ Interprets the output of Slurm's sacct command. """
+ def __init__(self, platform_output):
+ super(SlurmMonitor, self).__init__(platform_output)
+
+ def identify_input_rows(self):
+ lines = self.input.split("\n")
+ for line in lines:
+ self.input_items.append(SlurmMonitorItem.from_line(line))
+
+ def step_count(self):
+ return len([step for step in self.input_items if step.is_step])
+
+ def get_header(self):
+ return next((header for header in self.input_items if header.is_header), None)
+
+ def sum_steps_energy(self):
+ return sum([step.energy for step in self.input_items if step.is_step])
+
+ def get_batch(self):
+ return next((batch for batch in self.input_items if batch.is_batch), None)
+
+ def get_extern(self):
+ return next((extern for extern in self.input_items if extern.is_extern), None)
+
+ def steps_plus_extern_approximate_header_energy(self):
+ return abs((self.sum_steps_energy() + self.get_extern().energy) - self.get_header().energy) <= 10
+
+ def print_items(self):
+ for item in self.input_items:
+ print(item)
+
+
diff --git a/autosubmit/history/platform_monitor/slurm_monitor_item.py b/autosubmit/history/platform_monitor/slurm_monitor_item.py
new file mode 100644
index 000000000..a990315f9
--- /dev/null
+++ b/autosubmit/history/platform_monitor/slurm_monitor_item.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+
+# Copyright 2015-2020 Earth Sciences Department, BSC-CNS
+# This file is part of Autosubmit.
+
+# Autosubmit is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# Autosubmit is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with Autosubmit. If not, see .
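# A sketch of the sacct rows this class models, drawn from the output examples added
# by this patch (output_examples/wrapper2.txt); the column layout follows the sacct
# call documented in slurm_parser.py (JobId, State, NCPUS, NNodes, Submit, Start,
# End, ConsumedEnergy, MaxRSS, AveRSS):
#
#   12535498         COMPLETED ...  -> header row (no "." in JobId): is_header is True
#   12535498.batch   COMPLETED ...  -> batch detail row: is_batch and is_detail are True
#   12535498.0       COMPLETED ...  -> step row: is_step is True, step_number == 0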
+
+import platform_utils as utils
+
+class SlurmMonitorItem():
+ def __init__(self, name, status, ncpus, nnodes, submit, start, finish, energy="0", MaxRSS=0.0, AveRSS=0.0):
+ self.name = str(name)
+ self.status = str(status)
+ self.ncpus = int(ncpus)
+ self.nnodes = int(nnodes)
+ self.submit = utils.try_parse_time_to_timestamp(submit)
+ self.start = utils.try_parse_time_to_timestamp(start)
+ self.finish = utils.try_parse_time_to_timestamp(finish)
+ self.energy_str = energy
+ self.energy = utils.parse_output_number(energy)
+ self.MaxRSS = utils.parse_output_number(MaxRSS)
+ self.AveRSS = utils.parse_output_number(AveRSS)
+
+ @property
+ def is_header(self):
+ return not self.is_detail
+
+ @property
+ def is_detail(self):
+ return self.name.find(".") >= 0
+
+ @property
+ def is_extern(self):
+ return self.name.find(".ext") >= 0
+
+ @property
+ def is_batch(self):
+ return self.name.find(".bat") >= 0
+
+ @property
+ def step_number(self):
+ if self.is_step:
+ point_loc = self.name.find(".")
+ return int(self.name[point_loc+1:])
+ return -1
+
+ @property
+ def is_step(self):
+ return self.name.find(".") >= 0 and not self.is_batch and not self.is_extern
+
+ @classmethod
+ def from_line(cls, line):
+ line = line.strip().split()
+ if len(line) < 2:
+ raise Exception("Slurm parser found a line too short {0}".format(line))
+ new_item = cls(line[0],
+ line[1],
+ str(line[2]) if len(line) > 2 else 0,
+ str(line[3]) if len(line) > 3 else 0,
+ str(line[4]) if len(line) > 4 else 0,
+ str(line[5]) if len(line) > 5 else 0,
+ str(line[6]) if len(line) > 6 else 0,
+ str(line[7]) if len(line) > 7 else 0,
+ str(line[8]) if len(line) > 8 else 0,
+ str(line[9]) if len(line) > 9 else 0)
+ return new_item
+
+ def get_as_dict(self):
+ return {"ncpus": self.ncpus,
+ "nnodes": self.nnodes,
+ "submit": self.submit,
+ "start": self.start,
+ "finish": self.finish,
+ "energy": self.energy,
+ "MaxRSS": self.MaxRSS,
+ "AveRSS": self.AveRSS}
+
+ def __str__(self):
+ return "Name {0}, Status {1}, NCpus {2}, NNodes {3}, Submit {4}, Start {5}, Finish {6}, Energy {7}, MaxRSS {8}, AveRSS {9} [Energy Str {10}]".format(self.name, self.status, self.ncpus, self.nnodes, self.submit, self.start, self.finish, self.energy, self.MaxRSS, self.AveRSS, self.energy_str)
\ No newline at end of file
diff --git a/autosubmit/history/platform_monitor/test.py b/autosubmit/history/platform_monitor/test.py
new file mode 100644
index 000000000..6279ccefa
--- /dev/null
+++ b/autosubmit/history/platform_monitor/test.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+
+# Copyright 2015-2020 Earth Sciences Department, BSC-CNS
+# This file is part of Autosubmit.
+
+# Autosubmit is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# Autosubmit is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with Autosubmit. If not, see .
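# These tests feed the sample sacct outputs from output_examples/ into SlurmMonitor.
# Note that platform_utils.read_example builds a path relative to the current working
# directory, so (as an assumption about the intended workflow, not something this
# patch enforces) the suite should be launched from the repository root:
#
#   python autosubmit/history/platform_monitor/test.py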
+
+import unittest
+import platform_utils as utils
+from slurm_monitor import SlurmMonitor
+
+class TestSlurmMonitor(unittest.TestCase):
+ def test_reader_on_simple_wrapper_example_1(self):
+ ssh_output = utils.read_example("wrapper1.txt")
+ slurm_monitor = SlurmMonitor(ssh_output)
+ slurm_monitor.identify_input_rows()
+ # Header
+ self.assertFalse(slurm_monitor.input_items[0].is_batch)
+ self.assertFalse(slurm_monitor.input_items[0].is_detail)
+ self.assertFalse(slurm_monitor.input_items[0].is_extern)
+ self.assertTrue(slurm_monitor.input_items[0].is_header)
+ # Batch
+ self.assertTrue(slurm_monitor.input_items[1].is_batch)
+ self.assertTrue(slurm_monitor.input_items[1].is_detail)
+ self.assertFalse(slurm_monitor.input_items[1].is_extern)
+ self.assertFalse(slurm_monitor.input_items[1].is_header)
+ # Extern
+ self.assertFalse(slurm_monitor.input_items[2].is_batch)
+ self.assertTrue(slurm_monitor.input_items[2].is_detail)
+ self.assertTrue(slurm_monitor.input_items[2].is_extern)
+ self.assertFalse(slurm_monitor.input_items[2].is_header)
+ header = slurm_monitor.get_header()
+ batch = slurm_monitor.get_batch()
+ extern = slurm_monitor.get_extern()
+ self.assertIsNotNone(header)
+ self.assertIsNotNone(batch)
+ self.assertIsNotNone(extern)
+ # print("{0} {1} <- {2}".format(batch.name, batch.energy, batch.energy_str))
+ # print("{0} {1} <- {2}".format(extern.name, extern.energy, extern.energy_str))
+ # print("{0} {1} <- {2}".format(header.name, header.energy, header.energy_str))
+ self.assertTrue(slurm_monitor.steps_plus_extern_approximate_header_energy())
+
+ def test_reader_on_simple_wrapper_example_2(self):
+ ssh_output = utils.read_example("wrapper2.txt") # not real
+ slurm_monitor = SlurmMonitor(ssh_output)
+ slurm_monitor.identify_input_rows()
+ # Header
+ self.assertFalse(slurm_monitor.input_items[0].is_batch)
+ self.assertFalse(slurm_monitor.input_items[0].is_detail)
+ self.assertFalse(slurm_monitor.input_items[0].is_step)
+ self.assertFalse(slurm_monitor.input_items[0].is_extern)
+ self.assertTrue(slurm_monitor.input_items[0].is_header)
+ # Batch
+ self.assertTrue(slurm_monitor.input_items[1].is_batch)
+ self.assertTrue(slurm_monitor.input_items[1].is_detail)
+ self.assertFalse(slurm_monitor.input_items[1].is_step)
+ self.assertFalse(slurm_monitor.input_items[1].is_extern)
+ self.assertFalse(slurm_monitor.input_items[1].is_header)
+ # Step 0
+ self.assertFalse(slurm_monitor.input_items[2].is_batch)
+ self.assertTrue(slurm_monitor.input_items[2].is_detail)
+ self.assertTrue(slurm_monitor.input_items[2].is_step)
+ self.assertFalse(slurm_monitor.input_items[2].is_extern)
+ self.assertFalse(slurm_monitor.input_items[2].is_header)
+ self.assertTrue(slurm_monitor.input_items[2].step_number >= 0)
+
+ def test_reader_on_big_wrapper(self):
+ ssh_output = utils.read_example("wrapper_big.txt")
+ slurm_monitor = SlurmMonitor(ssh_output)
+ slurm_monitor.identify_input_rows()
+ self.assertEqual(slurm_monitor.step_count(), 30)
+ header = slurm_monitor.get_header()
+ batch = slurm_monitor.get_batch()
+ extern = slurm_monitor.get_extern()
+ self.assertIsNotNone(header)
+
self.assertIsNotNone(batch) + self.assertIsNotNone(extern) + steps_and_extern_energy = slurm_monitor.sum_steps_energy() + extern.energy + # print(slurm_monitor.sum_steps_energy()) + # print(slurm_monitor.sum_steps_energy() + extern.energy) + # print("{0} {1} <- {2}".format(batch.name, batch.energy, batch.energy_str)) + # print("{0} {1} <- {2}".format(extern.name, extern.energy, extern.energy_str)) + # print("{0} {1} <- {2}".format(header.name, header.energy, header.energy_str)) + # ENERGY: extern + steps ~ header + self.assertTrue(slurm_monitor.steps_plus_extern_approximate_header_energy()) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/autosubmit/history/slurm_parser.py b/autosubmit/history/slurm_parser.py index 9e1cd2abf..3247eefd7 100644 --- a/autosubmit/history/slurm_parser.py +++ b/autosubmit/history/slurm_parser.py @@ -16,15 +16,21 @@ # You should have received a copy of the GNU General Public License # along with Autosubmit. If not, see . +import os +from collections import namedtuple from datetime import datetime from time import mktime, time +SlurmRow = namedtuple("SlurmRow", ["JobId", "State", "NCPUS", "NNodes", "Submit", "Start", "End", "ConsumedEnergy", "MaxRSS", "AveRSS"]) -def parse_job_finish_data(self, output, wrapped): +def parse_job_finish_data(output, is_wrapped): """ Slurm Command 'sacct -n --jobs {0} -o JobId%25,State,NCPUS,NNodes,Submit,Start,End,ConsumedEnergy,MaxRSS%25,AveRSS%25' + Only not wrapped jobs return submit, start, finish, joules, ncpus, nnodes. + ConsumedEnergy: Total energy consumed by all tasks in job, in joules. Note: Only in case of exclusive job allocation this value reflects the jobs' real energy consumption. + :return: submit, start, finish, joules, ncpus, nnodes, detailed_data :rtype: int, int, int, int, int, int, json object (str) """ @@ -33,8 +39,8 @@ def parse_job_finish_data(self, output, wrapped): detailed_data = dict() steps = [] # No blank spaces after or before - output = output.strip() if output else None - lines = output.split("\n") if output else [] + output = str(output).strip() if output else None + lines = output.split("\n") if output else [] is_end_of_wrapper = False # If there is output, list exists if len(lines) > 0: @@ -44,24 +50,14 @@ def parse_job_finish_data(self, output, wrapped): if len(line) > 0: # Collecting detailed data name = str(line[0]) - if wrapped: - # If it belongs to a wrapper - extra_data = {"ncpus": str(line[2] if len(line) > 2 else "NA"), - "nnodes": str(line[3] if len(line) > 3 else "NA"), - "submit": str(line[4] if len(line) > 4 else "NA"), - "start": str(line[5] if len(line) > 5 else "NA"), - "finish": str(line[6] if len(line) > 6 else "NA"), - "energy": str(line[7] if len(line) > 7 else "NA"), - "MaxRSS": str(line[8] if len(line) > 8 else "NA"), - "AveRSS": str(line[9] if len(line) > 9 else "NA")} - else: - # Normal job - extra_data = {"submit": str(line[4] if len(line) > 4 else "NA"), - "start": str(line[5] if len(line) > 5 else "NA"), - "finish": str(line[6] if len(line) > 6 else "NA"), - "energy": str(line[7] if len(line) > 7 else "NA"), - "MaxRSS": str(line[8] if len(line) > 8 else "NA"), - "AveRSS": str(line[9] if len(line) > 9 else "NA")} + extra_data = {"ncpus": str(line[2] if len(line) > 2 else "NA"), + "nnodes": str(line[3] if len(line) > 3 else "NA"), + "submit": str(line[4] if len(line) > 4 else "NA"), + "start": str(line[5] if len(line) > 5 else "NA"), + "finish": str(line[6] if len(line) > 6 else "NA"), + "energy": str(line[7] if len(line) > 7 else 
"NA"), + "MaxRSS": str(line[8] if len(line) > 8 else "NA"), + "AveRSS": str(line[9] if len(line) > 9 else "NA")} # Detailed data will contain the important information from output detailed_data[name] = extra_data steps.append(name) @@ -72,13 +68,13 @@ def parse_job_finish_data(self, output, wrapped): ncpus = int(line[2] if len(line) > 2 else 0) nnodes = int(line[3] if len(line) > 3 else 0) status = str(line[1]) - if wrapped == False: + if is_wrapped == False: # If it is not wrapper job, take first line as source if status not in ["COMPLETED", "FAILED", "UNKNOWN"]: # It not completed, then its error and send default data plus output return (0, 0, 0, 0, ncpus, nnodes, detailed_data, False) else: - # If it is a wrapped job + # If it is a is_wrapped job # Check if the wrapper has finished if status in ["COMPLETED", "FAILED", "UNKNOWN"]: # Wrapper has finished @@ -86,16 +82,16 @@ def parse_job_finish_data(self, output, wrapped): # Continue with first line as source if line: try: - # Parse submit and start only for normal jobs (not wrapped) + # Parse submit and start only for normal jobs (not is_wrapped) submit = int(mktime(datetime.strptime( - line[4], "%Y-%m-%dT%H:%M:%S").timetuple())) if not wrapped else 0 + line[4], "%Y-%m-%dT%H:%M:%S").timetuple())) if not is_wrapped else 0 start = int(mktime(datetime.strptime( - line[5], "%Y-%m-%dT%H:%M:%S").timetuple())) if not wrapped else 0 + line[5], "%Y-%m-%dT%H:%M:%S").timetuple())) if not is_wrapped else 0 # Assuming the job has been COMPLETED # If normal job or end of wrapper => Try to get the finish time from the first line of the output, else default to now. finish = 0 - if not wrapped: + if not is_wrapped: # If normal job, take finish time from first line finish = (int(mktime(datetime.strptime(line[6], "%Y-%m-%dT%H:%M:%S").timetuple( ))) if len(line) > 6 and line[6] != "Unknown" else int(time())) @@ -145,7 +141,7 @@ def parse_job_finish_data(self, output, wrapped): except Exception as exp: pass - detailed_data = detailed_data if not wrapped or is_end_of_wrapper == True else extra_data + detailed_data = detailed_data if not is_wrapped or is_end_of_wrapper == True else extra_data return (submit, start, finish, energy, ncpus, nnodes, detailed_data, is_end_of_wrapper) return (0, 0, 0, 0, 0, 0, dict(), False) @@ -153,33 +149,15 @@ def parse_job_finish_data(self, output, wrapped): return (0, 0, 0, 0, 0, 0, dict(), False) -def parse_output_number(string_number): - """ - Parses number in format 1.0K 1.0M 1.0G - - :param string_number: String representation of number - :type string_number: str - :return: number in float format - :rtype: float - """ - number = 0.0 - if (string_number): - last_letter = string_number.strip()[-1] - multiplier = 1 - if last_letter == "G": - multiplier = 1000000000 - number = string_number[:-1] - elif last_letter == "M": - multiplier = 1000000 - number = string_number[:-1] - elif last_letter == "K": - multiplier = 1000 - number = string_number[:-1] - else: - number = string_number - try: - number = float(number) * multiplier - except Exception as exp: - number = 0.0 - pass - return number \ No newline at end of file + + +def read_example(example_name): + source_path = "autosubmit/history/output_examples/" + file_path = os.path.join(source_path, example_name) + with open(file_path, "r") as fp: + output_ssh = fp.read() + return output_ssh + +if __name__ == "__main__": + output_ssh = read_example("pending.txt") + print(parse_job_finish_data(output_ssh, True)) \ No newline at end of file diff --git a/autosubmit/history/tests.py 
b/autosubmit/history/tests.py index 9ffa3dad0..67462a811 100644 --- a/autosubmit/history/tests.py +++ b/autosubmit/history/tests.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # Copyright 2015-2020 Earth Sciences Department, BSC-CNS # This file is part of Autosubmit. @@ -25,9 +25,19 @@ class TestExperimentHistory(unittest.TestCase): # @classmethod # def setUpClass(cls): # cls.exp = ExperimentHistory("tt00") # example database - def test_select_job_data_by_run_id(self): + def setUp(self): pass + + def test_db_exists(self): + exp_history = ExperimentHistory("tt00") + self.assertTrue(exp_history.my_database_exists()) + exp_history = ExperimentHistory("tt99") + self.assertFalse(exp_history.my_database_exists()) + def test_is_header_ready(self): + exp_history = ExperimentHistory("tt00") + self.assertTrue(exp_history.is_header_ready()) + def test_get_all_job_data(self): pass @@ -53,5 +63,6 @@ class TestLogging(unittest.TestCase): self.log.log(self.exp_message, self.trace_message) + if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/autosubmit/history/utils.py b/autosubmit/history/utils.py index 7ef3e36d4..831695fbd 100644 --- a/autosubmit/history/utils.py +++ b/autosubmit/history/utils.py @@ -23,8 +23,6 @@ from datetime import datetime DATETIME_FORMAT = '%Y-%m-%d-%H:%M:%S' - - def get_fields_as_comma_str(model): """ Get the fields of a namedtuple as a comma separated string. """ return ",".join(model._fields) @@ -62,5 +60,13 @@ def create_file_with_full_permissions(path): os.umask(0) os.open(path, os.O_WRONLY | os.O_CREAT, 0o777) +class SupportedStatus: + COMPLETED = "COMPLETED" + FAILED = "FAILED" + QUEUING = "QUEUING" + SUBMITTED = "SUBMITTED" + RUNNING = "RUNNING" + SUSPENDED = "SUSPENDED" + # if __name__ == "__main__": # print(get_fields_as_comma_str()) \ No newline at end of file diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 94e1a5838..62448be73 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -173,6 +173,13 @@ class Job(object): :rtype: set """ return self._parents + + @property + def status_str(self): + """ + String representation of the current status + """ + return Status.VALUE_TO_KEY.get(self.status, "UNKNOWN") @parents.setter def parents(self, parents): -- GitLab From a852a0de483dda5f3a6753cebc87d742e66eb35c Mon Sep 17 00:00:00 2001 From: Wilmer Uruchi Ticona Date: Thu, 14 Oct 2021 17:23:55 +0200 Subject: [PATCH 5/8] Finished refactor. Testing pending. Endpoints pending.
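The Job.status_str property added above degrades unknown codes to "UNKNOWN" instead of raising. A minimal sketch of that lookup, assuming Status.VALUE_TO_KEY is a plain dict from integer status codes to the names listed in SupportedStatus (the code values below are invented for illustration; the real mapping lives in Autosubmit's Status class):

    # Invented stand-in for Status.VALUE_TO_KEY.
    VALUE_TO_KEY = {0: "WAITING", 2: "QUEUING", 3: "SUBMITTED", 4: "RUNNING", 5: "COMPLETED", -1: "FAILED"}

    def status_str(status_code):
        # Unmapped codes fall back to "UNKNOWN", mirroring the property above.
        return VALUE_TO_KEY.get(status_code, "UNKNOWN")

    assert status_str(5) == "COMPLETED"
    assert status_str(99) == "UNKNOWN"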
--- autosubmit/history/data_classes/job_data.py | 42 +++- .../database_managers/database_models.py | 5 +- .../experiment_history_db_manager.py | 214 ++++++++++++------ autosubmit/history/database_managers/tests.py | 31 ++- autosubmit/history/experiment_history.py | 111 +++++++-- autosubmit/history/experiment_status.py | 2 +- .../{logging.py => internal_logging.py} | 0 .../platform_monitor/platform_monitor.py | 2 +- .../history/platform_monitor/slurm_monitor.py | 35 +-- autosubmit/history/platform_monitor/test.py | 9 +- autosubmit/job/job.py | 7 + 11 files changed, 331 insertions(+), 127 deletions(-) rename autosubmit/history/{logging.py => internal_logging.py} (100%) diff --git a/autosubmit/history/data_classes/job_data.py b/autosubmit/history/data_classes/job_data.py index 5f2cc782f..23610fc54 100644 --- a/autosubmit/history/data_classes/job_data.py +++ b/autosubmit/history/data_classes/job_data.py @@ -28,7 +28,10 @@ class JobData(object): Robust representation of a row in the job_data table of the experiment history database. """ - def __init__(self, _id, counter=1, job_name="None", created=None, modified=None, submit=0, start=0, finish=0, status="UNKNOWN", rowtype=0, ncpus=0, wallclock="00:00", qos="debug", energy=0, date="", section="", member="", chunk=0, last=1, platform="NA", job_id=0, extra_data="", nnodes=0, run_id=None, MaxRSS=0.0, AveRSS=0.0, out="", err="", rowstatus=Models.RowStatus.INITIAL): + def __init__(self, _id, counter=1, job_name="None", created=None, modified=None, submit=0, start=0, finish=0, + status="UNKNOWN", rowtype=0, ncpus=0, wallclock="00:00", qos="debug", energy=0, date="", section="", + member="", chunk=0, last=1, platform="NA", job_id=0, extra_data="", nnodes=0, run_id=None, MaxRSS=0.0, + AveRSS=0.0, out="", err="", rowstatus=Models.RowStatus.INITIAL, children="", platform_output=""): """ """ self._id = _id @@ -44,7 +47,7 @@ class JobData(object): self.ncpus = ncpus self.wallclock = wallclock self.qos = qos if qos else "debug" - self._energy = energy if energy else 0 + self._energy = round(energy, 2) if energy else 0 self.date = date if date else "" self.section = section if section else "" self.member = member if member else "" @@ -66,7 +69,9 @@ class JobData(object): self.AveRSS = AveRSS self.out = out self.err = err - self.rowstatus = rowstatus + self.rowstatus = rowstatus + self.children = children # DB 17 + self.platform_output = platform_output # DB 17 @classmethod def from_model(cls, row): @@ -99,9 +104,14 @@ class JobData(object): row.AveRSS, row.out, row.err, - row.rowstatus) + row.rowstatus, + row.children, + row.platform_output) return job_data + @property + def computational_weight(self): + return round(float(self.running_time * self.ncpus),2) @property def submit(self): @@ -137,6 +147,16 @@ class JobData(object): Returns the energy spent value (JOULES) as an integer. """ return self._energy + + @property + def wrapper_code(self): + """ + Another name for rowtype: returns it only when it encodes a wrapper (rowtype > 2). + """ + if self.rowtype > 2: + return self.rowtype + else: + return None @submit.setter def submit(self, submit): @@ -164,20 +184,23 @@ class JobData(object): # print("Updating energy to {0} from {1}.".format( # energy, self._energy)) self.require_update = True - self._energy = energy if energy else 0 + self._energy = round(energy, 2) if energy else 0 + @property def delta_queue_time(self): """ Returns queuing time as a timedelta object.
""" return str(timedelta(seconds=self.running_time())) + @property def submit_datetime(self): """ Return the submit time as a datetime object, None if submit time equal 0. @@ -186,6 +209,7 @@ class JobData(object): return datetime.fromtimestamp(self.submit) return None + @property def start_datetime(self): """ Return the start time as a datetime object, None if start time equal 0. @@ -194,6 +218,7 @@ class JobData(object): return datetime.fromtimestamp(self.start) return None + @property def finish_datetime(self): """ Return the finish time as a datetime object, None if start time equal 0. @@ -202,6 +227,7 @@ class JobData(object): return datetime.fromtimestamp(self.finish) return None + @property def submit_datetime_str(self): """ Returns the submit datetime as a string with format %Y-%m-%d-%H:%M:%S @@ -211,7 +237,7 @@ class JobData(object): return o_datetime.strftime(HUtils.DATETIME_FORMAT) else: return None - + @property def start_datetime_str(self): """ Returns the start datetime as a string with format %Y-%m-%d-%H:%M:%S @@ -221,7 +247,7 @@ class JobData(object): return o_datetime.strftime(HUtils.DATETIME_FORMAT) else: return None - + @property def finish_datetime_str(self): """ Returns the finish datetime as a string with format %Y-%m-%d-%H:%M:%S @@ -232,6 +258,7 @@ class JobData(object): else: return None + @property def running_time(self): """ Calculates and returns the running time of the job, in seconds. @@ -243,6 +270,7 @@ class JobData(object): return HUtils.calculate_run_time_in_seconds(self.start, self.finish) return 0 + @property def queuing_time(self): """ Calculates and returns the queuing time of the job, in seconds. diff --git a/autosubmit/history/database_managers/database_models.py b/autosubmit/history/database_managers/database_models.py index 2ba5d5549..3d205f965 100644 --- a/autosubmit/history/database_managers/database_models.py +++ b/autosubmit/history/database_managers/database_models.py @@ -18,7 +18,10 @@ import collections -JobDataRow = collections.namedtuple('JobDataRow', ['id', 'counter', 'job_name', 'created', 'modified', 'submit', 'start', 'finish', 'status', 'rowtype', 'ncpus', 'wallclock', 'qos', 'energy', 'date', 'section', 'member', 'chunk', 'last', 'platform', 'job_id', 'extra_data', 'nnodes', 'run_id', 'MaxRSS', 'AveRSS', 'out', 'err', 'rowstatus']) +JobDataRow = collections.namedtuple('JobDataRow', ['id', 'counter', 'job_name', 'created', 'modified', 'submit', 'start', 'finish', + 'status', 'rowtype', 'ncpus', 'wallclock', 'qos', 'energy', 'date', 'section', 'member', + 'chunk', 'last', 'platform', 'job_id', 'extra_data', 'nnodes', 'run_id', 'MaxRSS', 'AveRSS', + 'out', 'err', 'rowstatus', 'children', 'platform_output']) ExperimentRunRow = collections.namedtuple('ExperimentRunRow', [ 'run_id', 'created', 'modified', 'start', 'finish', 'chunk_unit', 'chunk_size', 'completed', 'total', 'failed', 'queuing', 'running', 'submitted', 'suspended', 'metadata']) diff --git a/autosubmit/history/database_managers/experiment_history_db_manager.py b/autosubmit/history/database_managers/experiment_history_db_manager.py index c2b0d8935..001fe7b71 100644 --- a/autosubmit/history/database_managers/experiment_history_db_manager.py +++ b/autosubmit/history/database_managers/experiment_history_db_manager.py @@ -27,7 +27,7 @@ from abc import ABCMeta, abstractmethod from database_manager import DatabaseManager, DEFAULT_JOBDATA_DIR from datetime import datetime -CURRENT_DB_VERSION = 16 +CURRENT_DB_VERSION = 17 DB_EXPERIMENT_HEADER_SCHEMA_CHANGES = 14 
DB_VERSION_SCHEMA_CHANGES = 12 DEFAULT_DB_VERSION = 10 @@ -43,8 +43,25 @@ class ExperimentHistoryDbManager(DatabaseManager): self._set_table_queries() self.historicaldb_file_path = os.path.join(self.JOBDATA_DIR, "job_data_{0}.db".format(self.expid)) # type : str + def initialize(self): + if self.my_database_exists(): + if not self.is_current_version(): + self.update_historical_database() + else: + self.create_historical_database() + + def my_database_exists(self): + return os.path.exists(self.historicaldb_file_path) + def is_header_ready_db_version(self): - return self._get_pragma_version() >= DB_EXPERIMENT_HEADER_SCHEMA_CHANGES + if self.my_database_exists(): + return self._get_pragma_version() >= DB_EXPERIMENT_HEADER_SCHEMA_CHANGES + return False + + def is_current_version(self): + if self.my_database_exists(): + return self._get_pragma_version() == CURRENT_DB_VERSION + return False def _set_table_queries(self): """ Sets basic table queries. """ @@ -100,6 +117,8 @@ class ExperimentHistoryDbManager(DatabaseManager): out TEXT NOT NULL, err TEXT NOT NULL, rowstatus INTEGER NOT NULL DEFAULT 0, + children TEXT, + platform_output TEXT, UNIQUE(counter,job_name) ); ''') @@ -128,6 +147,11 @@ class ExperimentHistoryDbManager(DatabaseManager): self.version_schema_changes.extend([ "ALTER TABLE experiment_run ADD COLUMN modified TEXT" ]) + # Version 17 + self.version_schema_changes.extend([ + "ALTER TABLE job_data ADD COLUMN children TEXT", + "ALTER TABLE job_data ADD COLUMN platform_output TEXT" + ]) def create_historical_database(self): """ Creates the historical database with the latest changes. """ @@ -138,22 +162,30 @@ class ExperimentHistoryDbManager(DatabaseManager): def update_historical_database(self): """ Updates the historical database with the latest changes IF necessary. """ - if self._get_pragma_version() == CURRENT_DB_VERSION: - self.execute_many_statements_on_dbfile(self.historicaldb_file_path, self.version_schema_changes) - self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_index_query) - self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_table_header_query) - self._set_historical_pragma_version(CURRENT_DB_VERSION) + self.execute_many_statements_on_dbfile(self.historicaldb_file_path, self.version_schema_changes) + self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_index_query) + self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_table_header_query) + self._set_historical_pragma_version(CURRENT_DB_VERSION) def get_experiment_run_dc_with_max_id(self): """ Get Current (latest) ExperimentRun data class. """ return ExperimentRun.from_model(self._get_experiment_run_with_max_id()) + + def register_experiment_run_dc(self, experiment_run_dc): + self._insert_experiment_run(experiment_run_dc) + return ExperimentRun.from_model(self._get_experiment_run_with_max_id()) + + def update_experiment_run_dc_by_id(self, experiment_run_dc): + """ Requires ExperimentRun data class. """ + self._update_experiment_run(experiment_run_dc) + return ExperimentRun.from_model(self._get_experiment_run_with_max_id()) def _get_experiment_run_with_max_id(self): """ Get Models.ExperimentRunRow for the maximum id run. 
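A runnable sketch of the version-gated migration that initialize()/update_historical_database() above perform: read the SQLite user_version pragma, apply the version 17 ALTER TABLE statements when the database is behind, then stamp the new version. The in-memory database and the reduced starting schema are assumptions for illustration; the real manager works on job_data_<expid>.db and applies the full version_schema_changes list:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE job_data (id INTEGER PRIMARY KEY, job_name TEXT)")
    conn.execute("PRAGMA user_version = 16")  # pretend this file predates version 17

    CURRENT_DB_VERSION = 17
    VERSION_17_CHANGES = ["ALTER TABLE job_data ADD COLUMN children TEXT",
                          "ALTER TABLE job_data ADD COLUMN platform_output TEXT"]

    if conn.execute("PRAGMA user_version").fetchone()[0] < CURRENT_DB_VERSION:
        for statement in VERSION_17_CHANGES:
            conn.execute(statement)
        conn.execute("PRAGMA user_version = 17")

    assert conn.execute("PRAGMA user_version").fetchone()[0] == 17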
""" statement = self.get_built_select_statement("experiment_run", "run_id > 0 ORDER BY run_id DESC LIMIT 0, 1") max_experiment_run = self.get_from_statement(self.historicaldb_file_path, statement) if len(max_experiment_run) == 0: - raise None + raise Exception("No Experiment Runs registered.") return Models.ExperimentRunRow(*max_experiment_run[0]) def get_job_data_all(self): @@ -162,42 +194,107 @@ class ExperimentHistoryDbManager(DatabaseManager): job_data_rows = self.get_from_statement(self.historicaldb_file_path, statement) return [Models.JobDataRow(*row) for row in job_data_rows] + def register_submitted_job_data_dc(self, job_data_dc): + """ Sets previous register to last=0 and inserts the new job_data_dc data class.""" + self._set_current_job_data_rows_last_to_zero_by_job_name(job_data_dc.job_name) + self._insert_job_data(job_data_dc) + return self.get_job_data_dc_unique_latest_by_job_name(job_data_dc.job_name) + + def _set_current_job_data_rows_last_to_zero_by_job_name(self, job_name): + """ Sets the column last = 0 for all job_rows by job_name and last = 1. """ + job_data_row_last = self._get_job_data_last_by_name(job_name) + job_data_dc_list = [JobData.from_model(row) for row in job_data_row_last] + for job_data_dc in job_data_dc_list: + job_data_dc.last = 0 + self._update_job_data_by_id(job_data_dc) + def update_job_data_dc_by_id(self, job_data_dc): """ Update JobData data class. Returns latest last=1 row from job_data by job_name. """ self._update_job_data_by_id(job_data_dc) return self.get_job_data_dc_unique_latest_by_job_name(job_data_dc.job_name) + + def update_list_job_data_dc_by_each_id(self, job_data_dcs): + """ Return length of updated list. """ + for job_data_dc in job_data_dcs: + self._update_job_data_by_id(job_data_dc) + return len(job_data_dcs) + + def get_job_data_dc_unique_latest_by_job_name(self, job_name): + """ Returns JobData data class for the latest job_data_row with last=1 by job_name. """ + job_data_row_last = self._get_job_data_last_by_name(job_name) + if len(job_data_row_last) > 0: + return JobData.from_model(job_data_row_last[0]) + return None - def update_experiment_run_dc_by_id(self, experiment_run_dc): - """ Requires ExperimentRun data class. """ - self._update_experiment_run(experiment_run_dc) - return ExperimentRun.from_model(self.get_experiment_run_with_max_id()) + def _get_job_data_last_by_name(self, job_name): + """ Get List of Models.JobDataRow for job_name and last=1 """ + statement = self.get_built_select_statement("job_data", "last=1 and job_name=? ORDER BY counter DESC") + arguments = (job_name,) + job_data_rows_last = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) + return [Models.JobDataRow(*row) for row in job_data_rows_last] def get_job_data_dcs_last_by_run_id(self, run_id): job_data_rows = self._get_job_data_last_by_run_id(run_id) return [JobData.from_model(row) for row in job_data_rows] + def _get_job_data_last_by_run_id(self, run_id): + """ Get List of Models.JobDataRow for last=1 and run_id """ + statement = self.get_built_select_statement("job_data", "run_id=? 
and last=1 and rowtype >= 2 ORDER BY id") + arguments = (run_id,) + job_data_rows = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) + return [Models.JobDataRow(*row) for row in job_data_rows] + + def get_job_data_dcs_last_by_wrapper_code(self, wrapper_code): + if wrapper_code: + return [JobData.from_model(row) for row in self._get_job_data_last_by_wrapper_code(wrapper_code)] + else: + return [] + + def _get_job_data_last_by_wrapper_code(self, wrapper_code): + """ Get List of Models.JobDataRow for last=1 and rowtype=wrapper_code """ + statement = self.get_built_select_statement("job_data", "rowtype = ? and last=1 ORDER BY id") + arguments = (wrapper_code,) + job_data_rows = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) + return [Models.JobDataRow(*row) for row in job_data_rows] + def get_all_last_job_data_dcs(self): """ Gets JobData data classes in job_data for last=1. """ job_data_rows = self._get_all_last_job_data_rows() return [JobData.from_model(row) for row in job_data_rows] - def register_submitted_job_data_dc(self, job_data_dc): - """ Sets previous register to last=0 and inserts the new job_data_dc data class.""" - self._set_current_job_data_rows_last_to_zero_by_job_name(job_data_dc.job_name) - self._insert_job_data(job_data_dc) - return self.get_job_data_dc_unique_latest_by_job_name(job_data_dc.job_name) - - def register_experiment_run_dc(self, experiment_run_dc): - self._insert_experiment_run(experiment_run_dc) - return ExperimentRun.from_model(self.get_experiment_run_with_max_id()) - - def _set_current_job_data_rows_last_to_zero_by_job_name(self, job_name): - """ Sets the column last = 0 for all job_rows by job_name and last = 1. """ - job_data_row_last = self._get_job_data_last_by_name(job_name) - job_data_dc_list = [JobData.from_model(row) for row in job_data_row_last] - for job_data_dc in job_data_dc_list: - job_data_dc.last = 0 - self._update_job_data_by_id(job_data_dc) + def _get_all_last_job_data_rows(self): + """ Get List of Models.JobDataRow for last=1. """ + statement = self.get_built_select_statement("job_data", "last=1") + job_data_rows = self.get_from_statement(self.historicaldb_file_path, statement) + return [Models.JobDataRow(*row) for row in job_data_rows] + + def _insert_job_data(self, job_data): + # type : (JobData) -> int + """ Insert data class JobData into job_data table. """ + statement = ''' INSERT INTO job_data(counter, job_name, created, modified, + submit, start, finish, status, rowtype, ncpus, + wallclock, qos, energy, date, section, member, chunk, last, + platform, job_id, extra_data, nnodes, run_id, MaxRSS, AveRSS, + out, err, rowstatus, children, platform_output) + VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) 
''' + arguments = (job_data.counter, job_data.job_name, HUtils.get_current_datetime(), HUtils.get_current_datetime(), + job_data.submit, job_data.start, job_data.finish, job_data.status, job_data.rowtype, job_data.ncpus, + job_data.wallclock, job_data.qos, job_data.energy, job_data.date, job_data.section, job_data.member, job_data.chunk, job_data.last, + job_data.platform, job_data.job_id, job_data.extra_data, job_data.nnodes, job_data.run_id, job_data.MaxRSS, job_data.AveRSS, + job_data.out, job_data.err, job_data.rowstatus, job_data.children, job_data.platform_output) + return self.insert_statement_with_arguments(self.historicaldb_file_path, statement, arguments) + + def _insert_experiment_run(self, experiment_run): + """ Insert data class ExperimentRun into database """ + statement = ''' INSERT INTO experiment_run(created, modified, start, finish, + chunk_unit, chunk_size, completed, total, + failed, queuing, running, + submitted, suspended, metadata) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?) ''' + arguments = (HUtils.get_current_datetime(), HUtils.get_current_datetime(), experiment_run.start, experiment_run.finish, + experiment_run.chunk_unit, experiment_run.chunk_size, experiment_run.completed, experiment_run.total, + experiment_run.failed, experiment_run.queuing, experiment_run.running, + experiment_run.submitted, experiment_run.suspended, experiment_run.metadata) + return self.insert_statement_with_arguments(self.historicaldb_file_path, statement, arguments) def update_many_job_data_change_status(self, changes): # type : (List[Tuple]) -> None @@ -213,8 +310,14 @@ class ExperimentHistoryDbManager(DatabaseManager): Update job_data table with data class JobData. Update last, submit, start, finish, modified, job_id, status, energy, extra_data, nnodes, ncpus, rowstatus, out, err by id. """ - statement = ''' UPDATE job_data SET last=?, submit=?, start=?, finish=?, modified=?, job_id=?, status=?, energy=?, extra_data=?, nnodes=?, ncpus=?, rowstatus=?, out=?, err=? WHERE id=? ''' - arguments = (job_data_dc.last, job_data_dc.submit, job_data_dc.start, job_data_dc.finish, HUtils.get_current_datetime(), job_data_dc.job_id, job_data_dc.status, job_data_dc.energy, job_data_dc.extra_data, job_data_dc.nnodes, job_data_dc.ncpus, job_data_dc.rowstatus, job_data_dc.out, job_data_dc.err, job_data_dc._id) + statement = ''' UPDATE job_data SET last=?, submit=?, start=?, finish=?, modified=?, + job_id=?, status=?, energy=?, extra_data=?, + nnodes=?, ncpus=?, rowstatus=?, out=?, err=?, + children=?, platform_output=? WHERE id=? ''' + arguments = (job_data_dc.last, job_data_dc.submit, job_data_dc.start, job_data_dc.finish, HUtils.get_current_datetime(), + job_data_dc.job_id, job_data_dc.status, job_data_dc.energy, job_data_dc.extra_data, + job_data_dc.nnodes, job_data_dc.ncpus, job_data_dc.rowstatus, job_data_dc.out, job_data_dc.err, + job_data_dc.children, job_data_dc.platform_output, job_data_dc._id) self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) def _update_experiment_run(self, experiment_run_dc): @@ -222,31 +325,14 @@ class ExperimentHistoryDbManager(DatabaseManager): Update experiment_run table with data class ExperimentRun. Updates by run_id (finish, chunk_unit, chunk_size, completed, total, failed, queuing, running, submitted, suspended) """ - statement = ''' UPDATE experiment_run SET finish=?, chunk_unit=?, chunk_size=?, completed=?, total=?, failed=?, queuing=?, running=?, submitted=?, suspended=?, modified=? WHERE run_id=? 
''' - arguments = (experiment_run_dc.finish, experiment_run_dc.chunk_unit, experiment_run_dc.chunk_size, experiment_run_dc.completed, experiment_run_dc.total, experiment_run_dc.failed, experiment_run_dc.queuing, experiment_run_dc.running, experiment_run_dc.submitted, experiment_run_dc.suspended, HUtils.get_current_datetime(), experiment_run_dc.run_id) + statement = ''' UPDATE experiment_run SET finish=?, chunk_unit=?, chunk_size=?, completed=?, total=?, + failed=?, queuing=?, running=?, submitted=?, + suspended=?, modified=? WHERE run_id=? ''' + arguments = (experiment_run_dc.finish, experiment_run_dc.chunk_unit, experiment_run_dc.chunk_size, experiment_run_dc.completed, experiment_run_dc.total, + experiment_run_dc.failed, experiment_run_dc.queuing, experiment_run_dc.running, experiment_run_dc.submitted, + experiment_run_dc.suspended, HUtils.get_current_datetime(), experiment_run_dc.run_id) self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) - def _insert_job_data(self, job_data): - # type : (JobData) -> int - """ Insert data class JobData into job_data table. """ - statement = ''' INSERT INTO job_data(counter, job_name, created, modified, submit, start, finish, status, rowtype, ncpus, wallclock, qos, energy, date, section, member, chunk, last, platform, job_id, extra_data, nnodes, run_id, MaxRSS, AveRSS, out, err, rowstatus) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) ''' - arguments = (job_data.counter, job_data.job_name, HUtils.get_current_datetime(), HUtils.get_current_datetime(), job_data.submit, job_data.start, job_data.finish, job_data.status, job_data.rowtype, job_data.ncpus, job_data.wallclock, job_data.qos, job_data.energy, job_data.date, job_data.section, job_data.member, job_data.chunk, job_data.last, job_data.platform, job_data.job_id, job_data.extra_data, job_data.nnodes, job_data.run_id, job_data.MaxRSS, job_data.AveRSS, job_data.out, job_data.err, job_data.rowstatus) - return self.insert_statement_with_arguments(self.historicaldb_file_path, statement, arguments) - - def _insert_experiment_run(self, experiment_run): - """ Insert data class ExperimentRun into database """ - statement = ''' INSERT INTO experiment_run(created, modified, start, finish, chunk_unit, chunk_size, completed, total, failed, queuing, running, submitted, suspended, metadata) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?) ''' - arguments = (HUtils.get_current_datetime(), HUtils.get_current_datetime(), experiment_run.start, experiment_run.finish, experiment_run.chunk_unit, experiment_run.chunk_size, experiment_run.completed, - experiment_run.total, experiment_run.failed, experiment_run.queuing, experiment_run.running, experiment_run.submitted, experiment_run.suspended, experiment_run.metadata) - return self.insert_statement_with_arguments(self.historicaldb_file_path, statement, arguments) - - def get_job_data_dc_unique_latest_by_job_name(self, job_name): - """ Returns JobData data class for the latest job_data_row with last=1 by job_name. """ - job_data_row_last = self._get_job_data_last_by_name(job_name) - if len(job_data_row_last) > 0: - return JobData.from_model(job_data_row_last[0]) - return None - def _get_job_data_last_by_run_id_and_finished(self, run_id): """ Get List of Models.JobDataRow for last=1, finished > 0 and run_id """ statement = self.get_built_select_statement("job_data", "run_id=? 
and last=1 and finish > 0 and rowtype >= 2 ORDER BY id") @@ -254,13 +340,6 @@ class ExperimentHistoryDbManager(DatabaseManager): job_data_rows = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) return [Models.JobDataRow(*row) for row in job_data_rows] - def _get_job_data_last_by_run_id(self, run_id): - """ Get List of Models.JobDataRow for last=1 and run_id """ - statement = self.get_built_select_statement("job_data", "run_id=? and last=1 and rowtype >= 2 ORDER BY id") - arguments = (run_id,) - job_data_rows = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) - return [Models.JobDataRow(*row) for row in job_data_rows] - def get_job_data_by_name(self, job_name): """ Get List of Models.JobDataRow for job_name """ statement = self.get_built_select_statement("job_data", "job_name=? ORDER BY counter DESC") @@ -268,19 +347,6 @@ class ExperimentHistoryDbManager(DatabaseManager): job_data_rows = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) return [Models.JobDataRow(*row) for row in job_data_rows] - def _get_all_last_job_data_rows(self): - """ Get List of Models.JobDataRow for last=1. """ - statement = self.get_built_select_statement("job_data", "last=1") - job_data_rows = self.get_from_statement(self.historicaldb_file_path, statement) - return [Models.JobDataRow(*row) for row in job_data_rows] - - def _get_job_data_last_by_name(self, job_name): - """ Get List of Models.JobDataRow for job_name and last=1 """ - statement = self.get_built_select_statement("job_data", "last=1 and job_name=? ORDER BY counter DESC") - arguments = (job_name,) - job_data_rows_last = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) - return [Models.JobDataRow(*row) for row in job_data_rows_last] - def get_job_data_max_counter(self): """ The max counter is the maximum value of the counter column in job_data.
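A small sketch of the counter logic behind get_job_data_max_counter(): SELECT MAX(counter) yields NULL on an empty table, which is normalized to 0 so a job's next retrial counter is always max + 1. The in-memory table is an assumption for illustration:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE job_data (counter INTEGER, job_name TEXT)")

    def get_job_data_max_counter(conn):
        row = conn.execute("SELECT MAX(counter) as maxcounter FROM job_data").fetchone()
        return row[0] if row[0] is not None else 0

    assert get_job_data_max_counter(conn) == 0  # empty table -> 0
    conn.execute("INSERT INTO job_data VALUES (3, 'a29z_20000101_fc0_1_SIM')")
    assert get_job_data_max_counter(conn) == 3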
""" statement = "SELECT MAX(counter) as maxcounter FROM job_data" diff --git a/autosubmit/history/database_managers/tests.py b/autosubmit/history/database_managers/tests.py index 4c07b8f7a..a78d68ebe 100644 --- a/autosubmit/history/database_managers/tests.py +++ b/autosubmit/history/database_managers/tests.py @@ -70,6 +70,7 @@ class TestExperimentHistoryDbManager(unittest.TestCase): """ Covers Experiment History Database Manager and Data Models """ def setUp(self): self.experiment_database = ExperimentHistoryDbManager(EXPID, JOBDATA_DIR) + self.experiment_database.initialize() def tearDown(self): pass @@ -80,7 +81,7 @@ class TestExperimentHistoryDbManager(unittest.TestCase): self.assertTrue(max_item.run_id >= 18) # Max is 18 def test_pragma(self): - self.assertTrue(self.experiment_database._get_pragma_version() == 16) + self.assertTrue(self.experiment_database._get_pragma_version() == 17) # Update version on changes def test_get_job_data(self): job_data = self.experiment_database._get_job_data_last_by_name("a29z_20000101_fc0_1_SIM") @@ -207,9 +208,35 @@ class TestExperimentHistoryDbManager(unittest.TestCase): self.assertTrue(job_data_dc._id == updated_job_data_dc._id) job_data_dc = self.experiment_database.get_job_data_dc_unique_latest_by_job_name("a29z_20000101_fc0_1_SIM") self.assertTrue(job_data_dc._id == updated_job_data_dc._id) + + def test_update_children_and_platform_output(self): + job_data_dc = self.experiment_database.get_job_data_dc_unique_latest_by_job_name("a29z_20000101_fc0_1_SIM") + children_str = "a00, a01, a02" + platform_output_str = " SLURM OUTPUT " + job_data_dc.children = children_str + job_data_dc.platform_output = platform_output_str + self.experiment_database.update_job_data_dc_by_id(job_data_dc) + job_data_dc_updated = self.experiment_database.get_job_data_dc_unique_latest_by_job_name("a29z_20000101_fc0_1_SIM") + self.assertTrue(job_data_dc_updated.children == children_str) + self.assertTrue(job_data_dc_updated.platform_output == platform_output_str) + # Back to normal + job_data_dc.children = "" + job_data_dc.platform_output = "NO OUTPUT" + self.experiment_database.update_job_data_dc_by_id(job_data_dc) + job_data_dc_updated = self.experiment_database.get_job_data_dc_unique_latest_by_job_name("a29z_20000101_fc0_1_SIM") + self.assertTrue(job_data_dc_updated.children == "") + self.assertTrue(job_data_dc_updated.platform_output == "NO OUTPUT") + + def test_experiment_run_dc(self): - experiment_run = self.experiment_database + experiment_run = self.experiment_database.get_experiment_run_dc_with_max_id() + self.assertIsNotNone(experiment_run) + + def test_if_database_exists(self): + exp_manager = ExperimentHistoryDbManager("0000") + self.assertTrue(exp_manager.my_database_exists() == False) + if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py index 485a574c0..6f4eb27d0 100644 --- a/autosubmit/history/experiment_history.py +++ b/autosubmit/history/experiment_history.py @@ -17,6 +17,7 @@ # along with Autosubmit. If not, see . 
import os import traceback +from autosubmit.history.data_classes import job_data import database_managers.database_models as Models import utils as HUtils from time import time, sleep @@ -24,7 +25,7 @@ from database_managers.experiment_history_db_manager import ExperimentHistoryDbM from data_classes.job_data import JobData from data_classes.experiment_run import ExperimentRun from platform_monitor.slurm_monitor import SlurmMonitor -from logging import Logging +from internal_logging import Logging SECONDS_WAIT_PLATFORM = 60 @@ -41,24 +42,19 @@ class ExperimentHistory(): def initialize_database(self): try: - if self.my_database_exists(): - self.manager.update_historical_database() - else: - self.manager.create_historical_database() + self.manager.initialize() except Exception as exp: self._log.log(str(exp), traceback.format_exc()) self.manager = None - def my_database_exists(self): - return os.path.exists(self.manager.historicaldb_file_path) - def is_header_ready(self): if self.manager: return self.manager.is_header_ready_db_version() return False - def write_submit_time(self, job_name, submit=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", member="", section="", chunk=0, platform="NA", job_id=0, wrapper_queue=None, wrapper_code=None): + def write_submit_time(self, job_name, submit=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", + member="", section="", chunk=0, platform="NA", job_id=0, wrapper_queue=None, wrapper_code=None, children=""): try: next_counter = self._get_next_counter_by_job_name(job_name) job_data_dc = JobData(_id=0, @@ -75,33 +71,63 @@ class ExperimentHistory(): section=section, chunk=chunk, platform=platform, - job_id=job_id) + job_id=job_id, + children=children) return self.manager.register_submitted_job_data_dc(job_data_dc) except Exception as exp: self._log.log(str(exp), traceback.format_exc()) return None - def write_start_time(self, job_name, start=0, status="UNKWOWN", ncpus=0, wallclock="00:00", qos="debug", date="", member="", section="", chunk=0, platform="NA", job_id=0, wrapper_queue=None, wrapper_code=None): + def write_start_time(self, job_name, start=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", + member="", section="", chunk=0, platform="NA", job_id=0, wrapper_queue=None, wrapper_code=None, children=""): try: job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) if not job_data_dc_last: - job_data_dc_last = self.write_submit_time(job_name=job_name, status=status, ncpus=ncpus, wallclock=wallclock, qos=qos, date=date, member=member, section=section, chunk=chunk, platform=platform, job_id=job_id, wrapper_queue=wrapper_queue, wrapper_code=wrapper_code) + job_data_dc_last = self.write_submit_time(job_name=job_name, + status=status, + ncpus=ncpus, + wallclock=wallclock, + qos=qos, + date=date, + member=member, + section=section, + chunk=chunk, + platform=platform, + job_id=job_id, + wrapper_queue=wrapper_queue, + wrapper_code=wrapper_code) self._log.log("write_start_time {0} start not found.".format(job_name)) job_data_dc_last.start = start job_data_dc_last.qos = self._get_defined_queue_name(wrapper_queue, wrapper_code, qos) job_data_dc_last.status = status job_data_dc_last.rowtype = self._get_defined_rowtype(wrapper_code) job_data_dc_last.job_id = job_id + job_data_dc_last.children = children return self.manager.update_job_data_dc_by_id(job_data_dc_last) except Exception as exp: self._log.log(str(exp), traceback.format_exc()) return None - def write_finish_time(self,
job_name, finish=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", member="", section="", chunk=0, platform="NA", job_id=0, platform_object=None, packed=False, parent_id_list=None, no_slurm=True, out_file_path=None, out_file=None, err_file=None, wrapper_queue=None, wrapper_code=None): + def write_finish_time(self, job_name, finish=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", + member="", section="", chunk=0, platform="NA", job_id=0, out_file=None, err_file=None, + wrapper_queue=None, wrapper_code=None, children=""): try: job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) if not job_data_dc_last: - job_data_dc_last = self.write_submit_time(job_name=job_name, status=status, ncpus=ncpus, wallclock=wallclock, qos=qos, date=date, member=member, section=section, chunk=chunk, platform=platform, job_id=job_id, wrapper_queue=wrapper_queue, wrapper_code=wrapper_code) + job_data_dc_last = self.write_submit_time(job_name=job_name, + status=status, + ncpus=ncpus, + wallclock=wallclock, + qos=qos, + date=date, + member=member, + section=section, + chunk=chunk, + platform=platform, + job_id=job_id, + wrapper_queue=wrapper_queue, + wrapper_code=wrapper_code, + children=children) self._log.log("write_finish_time {0} submit not found.".format(job_name)) job_data_dc_last.finish = finish if finish > 0 else int(time()) job_data_dc_last.status = status @@ -115,16 +141,61 @@ class ExperimentHistory(): return None def write_platform_data_after_finish(self, job_data_dc, platform_obj): - """ """ + """ + Call it in a thread. + """ try: - sleep(SECONDS_WAIT_PLATFORM) - slurm_monitor = SlurmMonitor(platform_obj.check_job_energy(job_data_dc.job_id)) - # Get current rows in run_id by rowtype (wrapper code) - # Add children names columnd to job_data - + sleep(SECONDS_WAIT_PLATFORM) + ssh_output = platform_obj.check_job_energy(job_data_dc.job_id) + slurm_monitor = SlurmMonitor(ssh_output) + job_data_dcs_in_wrapper = self.manager.get_job_data_dcs_last_by_wrapper_code(job_data_dc.wrapper_code) + if len(job_data_dcs_in_wrapper) > 0: + job_data_dcs_in_wrapper = self._distribute_energy_in_wrapper(job_data_dcs_in_wrapper, slurm_monitor) + self.manager.update_list_job_data_dc_by_each_id(job_data_dcs_in_wrapper) + else: + job_data_dc = self._assign_platform_information_to_job_data_dc(job_data_dc, slurm_monitor) + job_data_dc = self._assign_platform_information_to_job_data_dc(job_data_dc, slurm_monitor) + return self.manager.update_job_data_dc_by_id(job_data_dc) except Exception as exp: self._log.log(str(exp), traceback.format_exc()) return None + + def _distribute_energy_in_wrapper(self, job_data_dcs, slurm_monitor): + """ SlurmMonitor with data. 
""" + computational_weights = self._get_calculated_weights_of_jobs_in_wrapper(job_data_dcs) + if len(job_data_dcs) == slurm_monitor.step_count: + for job_dc, input_item in zip(job_data_dcs, slurm_monitor.input_items): + job_dc.energy = input_item.energy + computational_weights.get(job_dc.job_name, 0) * slurm_monitor.extern.energy + job_dc.platform_output = "" + else: + for job_dc in job_data_dcs: + job_dc.energy = computational_weights.get(job_dc.job_name, 0) * slurm_monitor.total_energy + job_dc.platform_output = "" + return job_data_dcs + + def _set_job_as_processed_in_platform(self, job_data_dc, slurm_monitor): + """ """ + job_data_dc.platform_output = slurm_monitor.original_input + job_data_dc.rowstatus = Models.RowStatus.PROCESSED + return job_data_dc + + def _assign_platform_information_to_job_data_dc(self, job_data_dc, slurm_monitor): + """ Basic Assignment """ + job_data_dc.submit = slurm_monitor.header.submit + job_data_dc.start = slurm_monitor.header.start + job_data_dc.finish = slurm_monitor.header.finish + job_data_dc.ncpus = slurm_monitor.header.ncpus + job_data_dc.nnodes = slurm_monitor.header.nnodes + job_data_dc.energy = slurm_monitor.header.energy + job_data_dc.MaxRSS = slurm_monitor.header.MaxRSS + job_data_dc.AveRSS = slurm_monitor.header.AveRSS + job_data_dc.platform_output = slurm_monitor.original_input + return job_data_dc + + def _get_calculated_weights_of_jobs_in_wrapper(self, job_data_dcs): + """ Based on computational weight: running time in seconds * number of cpus. """ + total_weight = sum(job.computational_weight for job in job_data_dcs) + return {job.job_name: round(job.computational_weight/total_weight, 2) for job in job_data_dcs} def process_status_changes(self, job_list=None, chunk_unit="NA", chunk_size=0, current_config=""): """ Detect status differences between job_list and current job_data rows, and update. Creates a new run if necessary. 
""" diff --git a/autosubmit/history/experiment_status.py b/autosubmit/history/experiment_status.py index 000e73294..8de57f870 100644 --- a/autosubmit/history/experiment_status.py +++ b/autosubmit/history/experiment_status.py @@ -18,7 +18,7 @@ import traceback from database_managers.experiment_status_db_manager import ExperimentStatusDbManager, DEFAULT_LOCAL_ROOT_DIR -from logging import Logging +from internal_logging import Logging class ExperimentStatus(): """ Represents the Experiment Status Mechanism that keeps track of currently active experiments """ diff --git a/autosubmit/history/logging.py b/autosubmit/history/internal_logging.py similarity index 100% rename from autosubmit/history/logging.py rename to autosubmit/history/internal_logging.py diff --git a/autosubmit/history/platform_monitor/platform_monitor.py b/autosubmit/history/platform_monitor/platform_monitor.py index 10b044703..c9570e26f 100644 --- a/autosubmit/history/platform_monitor/platform_monitor.py +++ b/autosubmit/history/platform_monitor/platform_monitor.py @@ -23,7 +23,7 @@ class PlatformMonitor(): def __init__(self, platform_output): self.original_input = platform_output self.input = str(platform_output).strip() - self.input_items = [] + @abstractmethod def identify_input_rows(self): diff --git a/autosubmit/history/platform_monitor/slurm_monitor.py b/autosubmit/history/platform_monitor/slurm_monitor.py index 0727e2de2..e0118620a 100644 --- a/autosubmit/history/platform_monitor/slurm_monitor.py +++ b/autosubmit/history/platform_monitor/slurm_monitor.py @@ -23,29 +23,38 @@ class SlurmMonitor(PlatformMonitor): """ Manages Slurm commands interpretation. """ def __init__(self, platform_output): super(SlurmMonitor, self).__init__(platform_output) + self._identify_input_rows() - def identify_input_rows(self): - lines = self.input.split("\n") - for line in lines: - self.input_items.append(SlurmMonitorItem.from_line(line)) - + @property + def steps_energy(self): + return sum([step.energy for step in self.input_items if step.is_step]) + + @property + def total_energy(self): + return max(self.header_energy, self.steps_energy + self.extern_energy) + + @property def step_count(self): return len([step for step in self.input_items if step.is_step]) - - def get_header(self): - return next((header for header in self.input_items if header.is_header), None) - def sum_steps_energy(self): - return sum([step.energy for step in self.input_items if step.is_step]) + def _identify_input_rows(self): + lines = self.input.split("\n") + self.input_items = [SlurmMonitorItem.from_line(line) for line in lines] + + @property + def header(self): + return next((header for header in self.input_items if header.is_header), None) - def get_batch(self): + @property + def batch(self): return next((batch for batch in self.input_items if batch.is_batch), None) - def get_extern(self): + @property + def extern(self): return next((extern for extern in self.input_items if extern.is_extern), None) def steps_plus_extern_approximate_header_energy(self): - return abs((self.sum_steps_energy() + self.get_extern().energy) - self.get_header().energy) <= 10 + return abs(self.steps_energy + self.extern.energy - self.header.energy) <= 10 def print_items(self): for item in self.input_items: diff --git a/autosubmit/history/platform_monitor/test.py b/autosubmit/history/platform_monitor/test.py index 6279ccefa..a7100243b 100644 --- a/autosubmit/history/platform_monitor/test.py +++ b/autosubmit/history/platform_monitor/test.py @@ -89,14 +89,7 @@ class 
TestSlurmMonitor(unittest.TestCase): extern = slurm_monitor.get_extern() self.assertIsNotNone(header) self.assertIsNotNone(batch) - self.assertIsNotNone(extern) - steps_and_extern_energy = slurm_monitor.sum_steps_energy() + extern.energy - # print(slurm_monitor.sum_steps_energy()) - # print(slurm_monitor.sum_steps_energy() + extern.energy) - # print("{0} {1} <- {2}".format(batch.name, batch.energy, batch.energy_str)) - # print("{0} {1} <- {2}".format(extern.name, extern.energy, extern.energy_str)) - # print("{0} {1} <- {2}".format(header.name, header.energy, header.energy_str)) - # ENERGY: extern + steps ~ header + self.assertIsNotNone(extern) self.assertTrue(slurm_monitor.steps_plus_extern_approximate_header_energy()) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 62448be73..8de6e47e4 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -180,6 +180,13 @@ class Job(object): String representation of the current status """ return Status.VALUE_TO_KEY.get(self.status, "UNKNOWN") + + @property + def children_names_str(self): + """ + Comma separated list of children's names + """ + return ",".join([str(child.name) for child in self._children]) @parents.setter def parents(self, parents): -- GitLab From 1d716db6393d1905e6646e5e12e05e3942fc0ca6 Mon Sep 17 00:00:00 2001 From: Wilmer Uruchi Ticona Date: Fri, 15 Oct 2021 20:03:45 +0200 Subject: [PATCH 6/8] Testing main implementation. Adding strategies for energy distribution --- autosubmit/history/data_classes/job_data.py | 2 +- autosubmit/history/database_managers/tests.py | 23 +- autosubmit/history/experiment_history.py | 81 +++--- .../platform_monitor/platform_monitor.py | 2 +- .../history/platform_monitor/slurm_monitor.py | 6 +- autosubmit/history/strategies.py | 133 +++++++++ autosubmit/history/tests.py | 263 +++++++++++++++++- 7 files changed, 453 insertions(+), 57 deletions(-) create mode 100644 autosubmit/history/strategies.py diff --git a/autosubmit/history/data_classes/job_data.py b/autosubmit/history/data_classes/job_data.py index 23610fc54..f216fd2c6 100644 --- a/autosubmit/history/data_classes/job_data.py +++ b/autosubmit/history/data_classes/job_data.py @@ -111,7 +111,7 @@ class JobData(object): @property def computational_weight(self): - return round(float(self.running_time * self.ncpus),2) + return round(float(self.running_time * self.ncpus),4) @property def submit(self): diff --git a/autosubmit/history/database_managers/tests.py b/autosubmit/history/database_managers/tests.py index a78d68ebe..2a4aec4f7 100644 --- a/autosubmit/history/database_managers/tests.py +++ b/autosubmit/history/database_managers/tests.py @@ -19,12 +19,16 @@ import unittest import time import random +import os +from shutil import copy2 from experiment_history_db_manager import ExperimentHistoryDbManager from experiment_status_db_manager import ExperimentStatusDbManager from autosubmit.history.data_classes.experiment_run import ExperimentRun from autosubmit.history.data_classes.job_data import JobData from autosubmit.config.basicConfig import BasicConfig import autosubmit.history.utils as HUtils +EXPID_TT00_SOURCE = "test_database.db~" +EXPID_TT01_SOURCE = "test_database_no_run.db~" EXPID = "tt00" EXPID_NONE = "tt01" BasicConfig.read() @@ -35,9 +39,6 @@ class TestExperimentStatusDatabaseManager(unittest.TestCase): """ Covers Experiment Status Database Manager """ def setUp(self): self.exp_status_db = ExperimentStatusDbManager(EXPID, LOCAL_ROOT_DIR) - - def tearDown(self): - pass def test_get_current_experiment_status_row(self): 
exp_status_row = self.exp_status_db.get_experiment_status_row_by_expid(EXPID) @@ -69,11 +70,18 @@ class TestExperimentStatusDatabaseManager(unittest.TestCase): class TestExperimentHistoryDbManager(unittest.TestCase): """ Covers Experiment History Database Manager and Data Models """ def setUp(self): - self.experiment_database = ExperimentHistoryDbManager(EXPID, JOBDATA_DIR) - self.experiment_database.initialize() + self.experiment_database = ExperimentHistoryDbManager(EXPID, JOBDATA_DIR) + source_path_tt00 = os.path.join(JOBDATA_DIR, EXPID_TT00_SOURCE) + self.target_path_tt00 = os.path.join(JOBDATA_DIR, "job_data_{0}.db".format(EXPID)) + copy2(source_path_tt00, self.target_path_tt00) + source_path_tt01 = os.path.join(JOBDATA_DIR, EXPID_TT01_SOURCE) + self.target_path_tt01 = os.path.join(JOBDATA_DIR, "job_data_{0}.db".format(EXPID_NONE)) + copy2(source_path_tt01, self.target_path_tt01) + self.experiment_database.initialize() def tearDown(self): - pass + os.remove(self.target_path_tt00) + os.remove(self.target_path_tt01) def test_get_max_id(self): max_item = self.experiment_database.get_experiment_run_dc_with_max_id() @@ -183,7 +191,8 @@ class TestExperimentHistoryDbManager(unittest.TestCase): self.experiment_database.update_many_job_data_change_status(backup_changes) def test_job_data_maxcounter(self): - new_job_data = ExperimentHistoryDbManager(EXPID_NONE, JOBDATA_DIR) + new_job_data = ExperimentHistoryDbManager(EXPID_NONE, JOBDATA_DIR) + new_job_data.initialize() max_empty_table_counter = new_job_data.get_job_data_max_counter() self.assertTrue(max_empty_table_counter == 0) max_existing_counter = self.experiment_database.get_job_data_max_counter() diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py index 6f4eb27d0..a2421a0e1 100644 --- a/autosubmit/history/experiment_history.py +++ b/autosubmit/history/experiment_history.py @@ -154,24 +154,29 @@ class ExperimentHistory(): self.manager.update_list_job_data_dc_by_each_id(job_data_dcs_in_wrapper) else: job_data_dc = self._assign_platform_information_to_job_data_dc(job_data_dc, slurm_monitor) - job_data_dc = self._assign_platform_information_to_job_data_dc(job_data_dc, slurm_monitor) + job_data_dc = self._set_job_as_processed_in_platform(job_data_dc, slurm_monitor) return self.manager.update_job_data_dc_by_id(job_data_dc) except Exception as exp: self._log.log(str(exp), traceback.format_exc()) return None def _distribute_energy_in_wrapper(self, job_data_dcs, slurm_monitor): - """ SlurmMonitor with data. """ + """ Requires SlurmMonitor with data. 
""" computational_weights = self._get_calculated_weights_of_jobs_in_wrapper(job_data_dcs) - if len(job_data_dcs) == slurm_monitor.step_count: - for job_dc, input_item in zip(job_data_dcs, slurm_monitor.input_items): - job_dc.energy = input_item.energy + computational_weights.get(job_dc.job_name, 0) * slurm_monitor.extern.energy + if len(job_data_dcs) == slurm_monitor.step_count: + for job_dc, step in zip(job_data_dcs, slurm_monitor.steps): + job_dc.energy = step.energy + computational_weights.get(job_dc.job_name, 0) * slurm_monitor.extern.energy + job_dc.AveRSS = step.AveRSS + job_dc.MaxRSS = step.MaxRSS job_dc.platform_output = "" else: for job_dc in job_data_dcs: job_dc.energy = computational_weights.get(job_dc.job_name, 0) * slurm_monitor.total_energy job_dc.platform_output = "" return job_data_dcs + + + def _set_job_as_processed_in_platform(self, job_data_dc, slurm_monitor): """ """ @@ -180,23 +185,18 @@ class ExperimentHistory(): return job_data_dc def _assign_platform_information_to_job_data_dc(self, job_data_dc, slurm_monitor): - """ Basic Assignment """ + """ Basic Assignment. No Wrapper. """ job_data_dc.submit = slurm_monitor.header.submit job_data_dc.start = slurm_monitor.header.start job_data_dc.finish = slurm_monitor.header.finish job_data_dc.ncpus = slurm_monitor.header.ncpus job_data_dc.nnodes = slurm_monitor.header.nnodes job_data_dc.energy = slurm_monitor.header.energy - job_data_dc.MaxRSS = slurm_monitor.header.MaxRSS - job_data_dc.AveRSS = slurm_monitor.header.AveRSS + job_data_dc.MaxRSS = max(slurm_monitor.header.MaxRSS, slurm_monitor.batch.MaxRSS, slurm_monitor.extern.MaxRSS) # TODO: Improve this rule + job_data_dc.AveRSS = max(slurm_monitor.header.AveRSS, slurm_monitor.batch.AveRSS, slurm_monitor.extern.AveRSS) job_data_dc.platform_output = slurm_monitor.original_input return job_data_dc - def _get_calculated_weights_of_jobs_in_wrapper(self, job_data_dcs): - """ Based on computational weight: running time in seconds * number of cpus. """ - total_weight = sum(job.computational_weight for job in job_data_dcs) - return {job.job_name: round(job.computational_weight/total_weight, 2) for job in job_data_dcs} - def process_status_changes(self, job_list=None, chunk_unit="NA", chunk_size=0, current_config=""): """ Detect status differences between job_list and current job_data rows, and update. Creates a new run if necessary. 
""" try: @@ -205,7 +205,7 @@ class ExperimentHistory(): if len(update_these_changes) > 0: self.manager.update_many_job_data_change_status(update_these_changes) if self.should_we_create_a_new_run(job_list, len(update_these_changes), current_experiment_run_dc.total): - return self.create_new_experiment_run(chunk_unit, chunk_size, current_config) + return self.create_new_experiment_run(chunk_unit, chunk_size, current_config, job_list) return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list) except Exception as exp: self._log.log(str(exp), traceback.format_exc()) @@ -234,6 +234,7 @@ class ExperimentHistory(): experiment_run_dc.submitted = status_counts[HUtils.SupportedStatus.SUBMITTED] experiment_run_dc.running = status_counts[HUtils.SupportedStatus.RUNNING] experiment_run_dc.suspended = status_counts[HUtils.SupportedStatus.SUSPENDED] + experiment_run_dc.total = status_counts["TOTAL"] return self.manager.update_experiment_run_dc_by_id(experiment_run_dc) def finish_current_experiment_run(self): @@ -241,25 +242,10 @@ class ExperimentHistory(): current_experiment_run_dc.finish = int(time()) return self.manager.update_experiment_run_dc_by_id(current_experiment_run_dc) - def create_new_experiment_run(self, chunk_unit="NA", chunk_size=0, current_config=""): + def create_new_experiment_run(self, chunk_unit="NA", chunk_size=0, current_config="", job_list=None): """ Also writes the finish timestamp of the previous run. """ self.finish_current_experiment_run() - return self._create_new_experiment_run_dc_with_counts(chunk_unit=chunk_unit, chunk_size=chunk_size, current_config=current_config) - - def detect_changes_in_job_list(self, job_list): - """ Detect changes in job_list compared to the current contents of job_data table. Returns a list of JobData data classes where the status of each item is the new status.""" - job_name_to_job = {job.name: job for job in job_list} - current_job_data_dcs = self.manager.get_all_last_job_data_dcs() - differences = [] - for job_dc in current_job_data_dcs: - if job_dc.job_name in job_name_to_job and job_dc.status != job_name_to_job[job_dc.job_name].status_str: - job_dc.status = job_name_to_job[job_dc.job_name].status_str - differences.append(job_dc) - return differences - - def update_experiment_history_from_job_list(self, job_list): - """ job_list: List of objects, each object must have attributes date, member, status_str. """ - raise NotImplementedError + return self._create_new_experiment_run_dc_with_counts(chunk_unit=chunk_unit, chunk_size=chunk_size, current_config=current_config, job_list=job_list) def _create_new_experiment_run_dc_with_counts(self, chunk_unit, chunk_size, current_config="", job_list=None): """ Create new experiment_run row and return the new Models.ExperimentRun data class from database. """ @@ -275,7 +261,27 @@ class ExperimentHistory(): running=status_counts[HUtils.SupportedStatus.RUNNING], submitted=status_counts[HUtils.SupportedStatus.SUBMITTED], suspended=status_counts[HUtils.SupportedStatus.SUSPENDED]) - return self.manager.register_experiment_run_dc(experiment_run_dc) + return self.manager.register_experiment_run_dc(experiment_run_dc) + + def _get_built_list_of_changes(self, job_list): + """ Return: List of (current timestamp, current datetime str, status, rowstatus, id in job_data). One tuple per change. 
""" + job_data_dcs = self.detect_changes_in_job_list(job_list) + return [(int(time()), HUtils.get_current_datetime(), job.status, Models.RowStatus.CHANGED, job._id) for job in job_data_dcs] + + def detect_changes_in_job_list(self, job_list): + """ Detect changes in job_list compared to the current contents of job_data table. Returns a list of JobData data classes where the status of each item is the new status.""" + job_name_to_job = {job.name: job for job in job_list} + current_job_data_dcs = self.manager.get_all_last_job_data_dcs() + differences = [] + for job_dc in current_job_data_dcs: + if job_dc.job_name in job_name_to_job and job_dc.status != job_name_to_job[job_dc.job_name].status_str: + job_dc.status = job_name_to_job[job_dc.job_name].status_str + differences.append(job_dc) + return differences + + def update_experiment_history_from_job_list(self, job_list): + """ job_list: List of objects, each object must have attributes name, date, member, status_str, children. """ + raise NotImplementedError def _get_defined_rowtype(self, code): if code: @@ -299,13 +305,8 @@ class ExperimentHistory(): def _get_date_member_completed_count(self, job_list): """ Each item in the job_list must have attributes: date, member, status_str. """ - job_list = job_list if job_list else [] - return sum(1 for job in job_list if job.date is not None and job.member is None and job.status_str == HUtils.SupportedStatus.COMPLETED) - - def _get_built_list_of_changes(self, job_list): - """ Return: List of (current timestamp, current datetime str, status, rowstatus, id in job_data). One tuple per change. """ - job_data_dcs = self.detect_changes_in_job_list(job_list) - return [(int(time()), HUtils.get_current_datetime(), job.status, Models.RowStatus.CHANGED, job._id) for job in job_data_dcs] + job_list = job_list if job_list else [] + return sum(1 for job in job_list if job.date is not None and job.member is not None and job.status_str == HUtils.SupportedStatus.COMPLETED) def get_status_counts_from_job_list(self, job_list): """ @@ -317,7 +318,7 @@ class ExperimentHistory(): HUtils.SupportedStatus.QUEUING: 0, HUtils.SupportedStatus.SUBMITTED: 0, HUtils.SupportedStatus.RUNNING: 0, - HUtils.SupportedStatus.RUNNING: 0, + HUtils.SupportedStatus.SUSPENDED: 0, "TOTAL": 0 } diff --git a/autosubmit/history/platform_monitor/platform_monitor.py b/autosubmit/history/platform_monitor/platform_monitor.py index c9570e26f..8439190cc 100644 --- a/autosubmit/history/platform_monitor/platform_monitor.py +++ b/autosubmit/history/platform_monitor/platform_monitor.py @@ -26,6 +26,6 @@ class PlatformMonitor(): @abstractmethod - def identify_input_rows(self): + def _identify_input_rows(self): """ """ diff --git a/autosubmit/history/platform_monitor/slurm_monitor.py b/autosubmit/history/platform_monitor/slurm_monitor.py index e0118620a..2c6d38200 100644 --- a/autosubmit/history/platform_monitor/slurm_monitor.py +++ b/autosubmit/history/platform_monitor/slurm_monitor.py @@ -31,7 +31,7 @@ class SlurmMonitor(PlatformMonitor): @property def total_energy(self): - return max(self.header_energy, self.steps_energy + self.extern_energy) + return max(self.header.energy, self.steps_energy + self.extern.energy) @property def step_count(self): @@ -41,6 +41,10 @@ class SlurmMonitor(PlatformMonitor): lines = self.input.split("\n") self.input_items = [SlurmMonitorItem.from_line(line) for line in lines] + @property + def steps(self): + return [item for item in self.input_items if item.is_step] + @property def header(self): return next((header for header in 
self.input_items if header.is_header), None) diff --git a/autosubmit/history/strategies.py b/autosubmit/history/strategies.py new file mode 100644 index 000000000..1470d8a95 --- /dev/null +++ b/autosubmit/history/strategies.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . + +from abc import ABCMeta, abstractmethod +import database_managers.database_models as Models + +class PlatformInformationHandler(): + def __init__(self, strategy): + self._strategy = strategy + + @property + def strategy(self): + return self._strategy + + @strategy.setter + def strategy(self, strategy): + self._strategy = strategy + + def execute_distribution(self, job_data_dc, job_data_dcs_in_wrapper, slurm_monitor): + result = self._strategy.apply_distribution() + + +class Strategy(): + """ Strategy Interface """ + __metaclass__ = ABCMeta + + @abstractmethod + def apply_distribution(self, job_data_dc, job_data_dcs_in_wrapper, slurm_monitor): + pass + + def set_job_data_dc_as_processed(self, job_data_dc, original_ssh_output): + job_data_dc.platform_output = original_ssh_output + job_data_dc.row_status = Models.RowStatus.PROCESSED + return job_data_dc + + def get_calculated_weights_of_jobs_in_wrapper(self, job_data_dcs_in_wrapper): + """ Based on computational weight: running time in seconds * number of cpus. 
""" + total_weight = sum(job.computational_weight for job in job_data_dcs_in_wrapper) + return {job.job_name: round(job.computational_weight/total_weight, 4) for job in job_data_dcs_in_wrapper} + + +class SimpleAssociationStrategy(Strategy): + def apply_distribution(self, job_data_dc, job_data_dcs_in_wrapper, slurm_monitor): + if len(job_data_dcs_in_wrapper) > 0: + return [] + job_data_dc.submit = slurm_monitor.header.submit + job_data_dc.start = slurm_monitor.header.start + job_data_dc.finish = slurm_monitor.header.finish + job_data_dc.ncpus = slurm_monitor.header.ncpus + job_data_dc.nnodes = slurm_monitor.header.nnodes + job_data_dc.energy = slurm_monitor.header.energy + job_data_dc.MaxRSS = max(slurm_monitor.header.MaxRSS, slurm_monitor.batch.MaxRSS, slurm_monitor.extern.MaxRSS) # TODO: Improve this rule + job_data_dc.AveRSS = max(slurm_monitor.header.AveRSS, slurm_monitor.batch.AveRSS, slurm_monitor.extern.AveRSS) + job_data_dc = self.set_job_data_dc_as_processed(job_data_dc, slurm_monitor.original_input) + return [job_data_dc] + +class StraightAssociationStrategy(Strategy): + def apply_distribution(self, job_data_dc, job_data_dcs_in_wrapper, slurm_monitor): + """ """ + if len(job_data_dcs_in_wrapper) != slurm_monitor.step_count: + return [] + result = [] + computational_weights = self.get_calculated_weights_of_jobs_in_wrapper(job_data_dcs_in_wrapper) + for job_dc, step in zip(job_data_dcs_in_wrapper, slurm_monitor.steps): + job_dc.energy = step.energy + computational_weights.get(job_dc.job_name, 0) * slurm_monitor.extern.energy + job_dc.AveRSS = step.AveRSS + job_dc.MaxRSS = step.MaxRSS + job_dc.platform_output = "" + result.append(job_dc) + job_data_dc = self.set_job_data_dc_as_processed(job_data_dc, slurm_monitor.original_input) + result.append(job_data_dc) + return result + +class GeneralizedDistributionStrategy(Strategy): + def apply_distribution(self, job_data_dc, job_data_dcs_in_wrapper, slurm_monitor): + result = [] + computational_weights = self.get_calculated_weights_of_jobs_in_wrapper(job_data_dcs_in_wrapper) + for job_dc in job_data_dcs_in_wrapper: + job_dc.energy = computational_weights.get(job_dc.job_name, 0) * slurm_monitor.total_energy + job_dc.platform_output = "" + result.append(job_dc) + job_data_dc = self.set_job_data_dc_as_processed(job_data_dc, slurm_monitor.original_input) + result.append(job_data_dc) + return result + +class TwoDimWrapperDistributionStrategy(Strategy): + def apply_distribution(self, job_data_dc, job_data_dcs_in_wrapper, slurm_monitor): + result = [] + # Challenge: Get jobs per level and then distribute energy + return result + + def get_jobs_per_level(self, job_data_dcs_in_wrapper): + job_name_to_children_names = {job.job_name:job.children.split(",") for job in job_data_dcs_in_wrapper} + children_names = [] + for job_name in job_name_to_children_names: + children_names.extend(job_name_to_children_names[job_name]) + + + + + + + + + + + +def simple_association_strategy(job_data_dc, slurm_monitor): + job_data_dc.submit = slurm_monitor.header.submit + job_data_dc.start = slurm_monitor.header.start + job_data_dc.finish = slurm_monitor.header.finish + job_data_dc.ncpus = slurm_monitor.header.ncpus + job_data_dc.nnodes = slurm_monitor.header.nnodes + job_data_dc.energy = slurm_monitor.header.energy + job_data_dc.MaxRSS = max(slurm_monitor.header.MaxRSS, slurm_monitor.batch.MaxRSS, slurm_monitor.extern.MaxRSS) # TODO: Improve this rule + job_data_dc.AveRSS = max(slurm_monitor.header.AveRSS, slurm_monitor.batch.AveRSS, slurm_monitor.extern.AveRSS) + 
job_data_dc.platform_output = slurm_monitor.original_input + return job_data_dc \ No newline at end of file diff --git a/autosubmit/history/tests.py b/autosubmit/history/tests.py index 67462a811..5b7b26cea 100644 --- a/autosubmit/history/tests.py +++ b/autosubmit/history/tests.py @@ -18,28 +18,277 @@ import unittest import traceback +import os +import time +from shutil import copy2 +from collections import namedtuple from experiment_history import ExperimentHistory -from logging import Logging +from internal_logging import Logging +from autosubmit.config.basicConfig import BasicConfig +from platform_monitor.slurm_monitor import SlurmMonitor +EXPID_TT00_SOURCE = "test_database.db~" +EXPID_TT01_SOURCE = "test_database_no_run.db~" +EXPID = "tt00" +EXPID_NONE = "tt01" +BasicConfig.read() +JOBDATA_DIR = BasicConfig.JOBDATA_DIR +LOCAL_ROOT_DIR = BasicConfig.LOCAL_ROOT_DIR +job = namedtuple("Job", ["name", "date", "member", "status_str", "children"]) class TestExperimentHistory(unittest.TestCase): # @classmethod # def setUpClass(cls): # cls.exp = ExperimentHistory("tt00") # example database def setUp(self): - pass + source_path_tt00 = os.path.join(JOBDATA_DIR, EXPID_TT00_SOURCE) + self.target_path_tt00 = os.path.join(JOBDATA_DIR, "job_data_{0}.db".format(EXPID)) + copy2(source_path_tt00, self.target_path_tt00) + source_path_tt01 = os.path.join(JOBDATA_DIR, EXPID_TT01_SOURCE) + self.target_path_tt01 = os.path.join(JOBDATA_DIR, "job_data_{0}.db".format(EXPID_NONE)) + copy2(source_path_tt01, self.target_path_tt01) + self.job_list = [ + job("a29z_20000101_fc2_1_POST", "2000-01-01 00:00:00", "POST", "COMPLETED", ""), + job("a29z_20000101_fc1_1_CLEAN", "2000-01-01 00:00:00", "CLEAN", "COMPLETED", ""), + job("a29z_20000101_fc3_1_POST", "2000-01-01 00:00:00", "POST", "RUNNING", ""), + job("a29z_20000101_fc2_1_CLEAN", "2000-01-01 00:00:00", "CLEAN", "COMPLETED", ""), + job("a29z_20000101_fc0_3_SIM", "2000-01-01 00:00:00", "SIM", "COMPLETED", ""), + job("a29z_20000101_fc1_2_POST", "2000-01-01 00:00:00", "POST", "QUEUING", ""), + ] # 2 differences, all COMPLETED + self.job_list_large = [ + job("a29z_20000101_fc2_1_POST", "2000-01-01 00:00:00", "POST", "COMPLETED", ""), + job("a29z_20000101_fc1_1_CLEAN", "2000-01-01 00:00:00", "CLEAN", "COMPLETED", ""), + job("a29z_20000101_fc3_1_POST", "2000-01-01 00:00:00", "POST", "RUNNING", ""), + job("a29z_20000101_fc2_1_CLEAN", "2000-01-01 00:00:00", "CLEAN", "COMPLETED", ""), + job("a29z_20000101_fc0_3_SIM", "2000-01-01 00:00:00", "SIM", "COMPLETED", ""), + job("a29z_20000101_fc1_2_POST", "2000-01-01 00:00:00", "POST", "QUEUING", ""), + job("a29z_20000101_fc1_5_POST", "2000-01-01 00:00:00", "POST", "SUSPENDED", ""), + job("a29z_20000101_fc1_4_POST", "2000-01-01 00:00:00", "POST", "FAILED", ""), + job("a29z_20000101_fc2_5_CLEAN", "2000-01-01 00:00:00", "CLEAN", "SUBMITTED", ""), + job("a29z_20000101_fc0_1_POST", "2000-01-01 00:00:00", "POST", "RUNNING", ""), + ] + + def tearDown(self): + os.remove(self.target_path_tt00) + os.remove(self.target_path_tt01) def test_db_exists(self): exp_history = ExperimentHistory("tt00") - self.assertTrue(exp_history.my_database_exists() == True) + exp_history.initialize_database() + self.assertTrue(exp_history.manager.my_database_exists() == True) exp_history = ExperimentHistory("tt99") - self.assertTrue(exp_history.my_database_exists() == False) + self.assertTrue(exp_history.manager.my_database_exists() == False) def test_is_header_ready(self): exp_history = ExperimentHistory("tt00") self.assertTrue(exp_history.is_header_ready() == 
True) - def test_get_all_job_data(self): - pass + def test_detect_differences_job_list(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + differences = exp_history.detect_changes_in_job_list(self.job_list) + expected_differences = ["a29z_20000101_fc3_1_POST", "a29z_20000101_fc1_2_POST"] + for job_dc in differences: + self.assertTrue(job_dc.job_name in expected_differences) + self.assertTrue(len(differences) == 2) + + def test_built_list_of_changes(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + built_differences = exp_history._get_built_list_of_changes(self.job_list) + expected_ids_differences = [90, 101] + for item in built_differences: + self.assertTrue(item[4] in expected_ids_differences) + + def test_get_date_member_count(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + dm_count = exp_history._get_date_member_completed_count(self.job_list) + self.assertTrue(dm_count > 0) + + def test_should_we_create_new_run(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + CHANGES_COUNT = 1 + TOTAL_COUNT = 6 + should_we = exp_history.should_we_create_a_new_run(self.job_list, CHANGES_COUNT, TOTAL_COUNT) + self.assertTrue(should_we == False) + TOTAL_COUNT_DIFF = 5 + should_we = exp_history.should_we_create_a_new_run(self.job_list, CHANGES_COUNT, TOTAL_COUNT_DIFF) + self.assertTrue(should_we == True) + CHANGES_COUNT = 5 + should_we = exp_history.should_we_create_a_new_run(self.job_list, CHANGES_COUNT, TOTAL_COUNT) + self.assertTrue(should_we == True) + + def test_status_counts(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + result = exp_history.get_status_counts_from_job_list(self.job_list_large) + self.assertTrue(result["COMPLETED"] == 4) + self.assertTrue(result["QUEUING"] == 1) + self.assertTrue(result["RUNNING"] == 2) + self.assertTrue(result["FAILED"] == 1) + self.assertTrue(result["SUSPENDED"] == 1) + self.assertTrue(result["TOTAL"] == len(self.job_list_large)) + + def test_create_new_experiment_run_with_counts(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + exp_run = exp_history.create_new_experiment_run(job_list=self.job_list) + self.assertTrue(exp_run.chunk_size == 0) + self.assertTrue(exp_run.chunk_unit == "NA") + self.assertTrue(exp_run.total == len(self.job_list)) + self.assertTrue(exp_run.completed == 4) + + def test_finish_current_run(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + exp_run = exp_history.finish_current_experiment_run() + self.assertTrue(len(exp_run.modified) > 0) + self.assertTrue(exp_run.finish > 0) + + def test_process_job_list_changes(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + exp_run = exp_history.process_job_list_changes_to_experiment_totals(self.job_list) + self.assertTrue(exp_run.total == len(self.job_list)) + self.assertTrue(exp_run.completed == 4) + self.assertTrue(exp_run.running == 1) + self.assertTrue(exp_run.queuing == 1) + + def test_calculated_weights(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + job_data_dcs = exp_history.manager.get_all_last_job_data_dcs() + calculated_weights = exp_history._get_calculated_weights_of_jobs_in_wrapper(job_data_dcs) + sum_comp_weight = 0 + for job_name in calculated_weights: + sum_comp_weight += calculated_weights[job_name] + self.assertTrue(abs(sum_comp_weight - 1) <= 0.01) + + def 
test_assign_platform_information_to_job_data_dc(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + ssh_output = ''' 12535498 COMPLETED 2 1 2020-11-18T13:54:24 2020-11-18T13:55:55 2020-11-18T13:56:10 2.77K + 12535498.batch COMPLETED 2 1 2020-11-18T13:55:55 2020-11-18T13:55:55 2020-11-18T13:56:10 2.69K 659K 659K + 12535498.extern COMPLETED 2 1 2020-11-18T13:55:55 2020-11-18T13:55:55 2020-11-18T13:56:10 2.77K 24K 24K ''' + slurm_monitor = SlurmMonitor(ssh_output) + job_data_dc = exp_history.manager.get_job_data_dc_unique_latest_by_job_name("a29z_20000101_fc1_1_CLEAN") + job_data_dc_result = exp_history._assign_platform_information_to_job_data_dc(job_data_dc, slurm_monitor) + self.assertTrue(job_data_dc_result.job_name == job_data_dc.job_name) + self.assertTrue(job_data_dc_result.energy == slurm_monitor.header.energy) + self.assertTrue(job_data_dc_result.status == "COMPLETED") + self.assertTrue(slurm_monitor.header.energy == 2770) + self.assertTrue(job_data_dc_result.MaxRSS == 659000) + + def test_distribute_energy_in_wrapper_1_to_1(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + ssh_output = ''' 17857525 COMPLETED 10 1 2021-10-13T15:51:16 2021-10-13T15:51:17 2021-10-13T15:52:47 2.41K + 17857525.batch COMPLETED 10 1 2021-10-13T15:51:17 2021-10-13T15:51:17 2021-10-13T15:52:47 1.88K 6264K 6264K + 17857525.extern COMPLETED 10 1 2021-10-13T15:51:17 2021-10-13T15:51:17 2021-10-13T15:52:47 1.66K 473K 68K + 17857525.0 COMPLETED 10 1 2021-10-13T15:51:21 2021-10-13T15:51:21 2021-10-13T15:51:22 186 352K 312.30K + 17857525.1 COMPLETED 10 1 2021-10-13T15:51:23 2021-10-13T15:51:23 2021-10-13T15:51:24 186 420K 306.70K + 17857525.2 COMPLETED 10 1 2021-10-13T15:51:24 2021-10-13T15:51:24 2021-10-13T15:51:27 188 352K 325.80K + 17857525.3 COMPLETED 10 1 2021-10-13T15:51:28 2021-10-13T15:51:28 2021-10-13T15:51:29 192 352K 341.90K + ''' + slurm_monitor = SlurmMonitor(ssh_output) + job_data_dcs = exp_history.manager.get_all_last_job_data_dcs()[:4] # Get me 4 jobs + weights = exp_history._get_calculated_weights_of_jobs_in_wrapper(job_data_dcs) + job_data_dcs_with_data = exp_history._distribute_energy_in_wrapper(job_data_dcs, slurm_monitor) + self.assertTrue(job_data_dcs_with_data[0].energy == round(slurm_monitor.steps[0].energy + weights[job_data_dcs_with_data[0].job_name]*slurm_monitor.extern.energy, 2)) + self.assertTrue(job_data_dcs_with_data[0].MaxRSS == slurm_monitor.steps[0].MaxRSS) + self.assertTrue(job_data_dcs_with_data[2].energy == round(slurm_monitor.steps[2].energy + weights[job_data_dcs_with_data[2].job_name]*slurm_monitor.extern.energy, 2)) + self.assertTrue(job_data_dcs_with_data[2].AveRSS == slurm_monitor.steps[2].AveRSS) + + def test_distribute_energy_in_wrapper_general_case(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + ssh_output = ''' 17857525 COMPLETED 10 1 2021-10-13T15:51:16 2021-10-13T15:51:17 2021-10-13T15:52:47 2.41K + 17857525.batch COMPLETED 10 1 2021-10-13T15:51:17 2021-10-13T15:51:17 2021-10-13T15:52:47 1.88K 6264K 6264K + 17857525.extern COMPLETED 10 1 2021-10-13T15:51:17 2021-10-13T15:51:17 2021-10-13T15:52:47 1.66K 473K 68K + 17857525.0 COMPLETED 10 1 2021-10-13T15:51:21 2021-10-13T15:51:21 2021-10-13T15:51:22 186 352K 312.30K + 17857525.1 COMPLETED 10 1 2021-10-13T15:51:23 2021-10-13T15:51:23 2021-10-13T15:51:24 186 420K 306.70K + 17857525.2 COMPLETED 10 1 2021-10-13T15:51:24 2021-10-13T15:51:24 2021-10-13T15:51:27 188 352K 325.80K + 17857525.3 COMPLETED 10 1 
2021-10-13T15:51:28 2021-10-13T15:51:28 2021-10-13T15:51:29 192 352K 341.90K + ''' + slurm_monitor = SlurmMonitor(ssh_output) + job_data_dcs = exp_history.manager.get_all_last_job_data_dcs()[:5] # Get me 5 jobs + weights = exp_history._get_calculated_weights_of_jobs_in_wrapper(job_data_dcs) + job_data_dcs_with_data = exp_history._distribute_energy_in_wrapper(job_data_dcs, slurm_monitor) + self.assertTrue(job_data_dcs_with_data[0].energy == round(slurm_monitor.total_energy * weights[job_data_dcs_with_data[0].job_name], 2)) + self.assertTrue(job_data_dcs_with_data[1].energy == round(slurm_monitor.total_energy * weights[job_data_dcs_with_data[1].job_name], 2)) + self.assertTrue(job_data_dcs_with_data[2].energy == round(slurm_monitor.total_energy * weights[job_data_dcs_with_data[2].job_name], 2)) + self.assertTrue(job_data_dcs_with_data[3].energy == round(slurm_monitor.total_energy * weights[job_data_dcs_with_data[3].job_name], 2)) + sum_energy = sum(job.energy for job in job_data_dcs_with_data) + print(sum_energy) + print(slurm_monitor.total_energy) + self.assertTrue(abs(sum_energy - slurm_monitor.total_energy) <= 10) + + def test_process_status_changes(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + CHUNK_UNIT = "month" + CHUNK_SIZE = 20 + CURRENT_CONFIG = "CURRENT CONFIG" + current_experiment_run_dc = exp_history.manager.get_experiment_run_dc_with_max_id() + exp_run = exp_history.process_status_changes(job_list=self.job_list, chunk_unit=CHUNK_UNIT, chunk_size=CHUNK_SIZE, current_config=CURRENT_CONFIG) # Generates new run + self.assertTrue(current_experiment_run_dc.run_id != exp_run.run_id) + self.assertTrue(exp_run.chunk_unit == CHUNK_UNIT) + self.assertTrue(exp_run.metadata == CURRENT_CONFIG) + self.assertTrue(exp_run.total == len(self.job_list)) + current_experiment_run_dc = exp_history.manager.get_experiment_run_dc_with_max_id() + exp_run = exp_history.process_status_changes(job_list=self.job_list, chunk_unit=CHUNK_UNIT, chunk_size=CHUNK_SIZE, current_config=CURRENT_CONFIG) # Same run + self.assertTrue(current_experiment_run_dc.run_id == exp_run.run_id) + new_job_list = [ + job("a29z_20000101_fc2_1_POST", "2000-01-01 00:00:00", "POST", "FAILED", ""), + job("a29z_20000101_fc1_1_CLEAN", "2000-01-01 00:00:00", "CLEAN", "FAILED", ""), + job("a29z_20000101_fc3_1_POST", "2000-01-01 00:00:00", "POST", "RUNNING", ""), + job("a29z_20000101_fc2_1_CLEAN", "2000-01-01 00:00:00", "CLEAN", "FAILED", ""), + job("a29z_20000101_fc0_3_SIM", "2000-01-01 00:00:00", "SIM", "FAILED", ""), + job("a29z_20000101_fc1_2_POST", "2000-01-01 00:00:00", "POST", "QUEUING", ""), + ] + current_experiment_run_dc = exp_history.manager.get_experiment_run_dc_with_max_id() + exp_run = exp_history.process_status_changes(job_list=new_job_list, chunk_unit=CHUNK_UNIT, chunk_size=CHUNK_SIZE, current_config=CURRENT_CONFIG) # Generates new run + self.assertTrue(current_experiment_run_dc.run_id != exp_run.run_id) + self.assertTrue(exp_run.total == len(new_job_list)) + self.assertTrue(exp_run.failed == 4) + + def test_write_submit_time(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + JOB_NAME = "a29z_20000101_fc2_1_SIM" + NCPUS = 128 + PLATFORM_NAME = "marenostrum4" + JOB_ID = 101 + inserted_job_data_dc = exp_history.write_submit_time(JOB_NAME, time.time(), "SUBMITTED", NCPUS, "00:30", "debug", "20000101", "fc2", "SIM", 1, PLATFORM_NAME, JOB_ID, "bsc_es", 1, "") + self.assertTrue(inserted_job_data_dc.job_name == JOB_NAME) + 
self.assertTrue(inserted_job_data_dc.ncpus == NCPUS) + self.assertTrue(inserted_job_data_dc.children == "") + self.assertTrue(inserted_job_data_dc.energy == 0) + self.assertTrue(inserted_job_data_dc.platform == PLATFORM_NAME) + self.assertTrue(inserted_job_data_dc.job_id == JOB_ID) + self.assertTrue(inserted_job_data_dc.qos == "debug") + + + def test_write_start_time(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + JOB_NAME = "a29z_20000101_fc2_1_SIM" + NCPUS = 128 + PLATFORM_NAME = "marenostrum4" + JOB_ID = 101 + inserted_job_data_dc_submit = exp_history.write_submit_time(JOB_NAME, time.time(), "SUBMITTED", NCPUS, "00:30", "debug", "20000101", "fc2", "SIM", 1, PLATFORM_NAME, JOB_ID, "bsc_es", 1, "") + inserted_job_data_dc = exp_history.write_start_time(JOB_NAME, time.time(), "RUNNING", NCPUS, "00:30", "debug", "20000101", "fc2", "SIM", 1, PLATFORM_NAME, JOB_ID, "bsc_es", 1, "") + self.assertTrue(inserted_job_data_dc.job_name == JOB_NAME) + self.assertTrue(inserted_job_data_dc.ncpus == NCPUS) + self.assertTrue(inserted_job_data_dc.children == "") + self.assertTrue(inserted_job_data_dc.energy == 0) + self.assertTrue(inserted_job_data_dc.platform == PLATFORM_NAME) + self.assertTrue(inserted_job_data_dc.job_id == JOB_ID) + self.assertTrue(inserted_job_data_dc.status == "RUNNING") + self.assertTrue(inserted_job_data_dc.qos == "debug") + + class TestLogging(unittest.TestCase): @@ -55,7 +304,7 @@ class TestLogging(unittest.TestCase): def test_build_message(self): message = self.log.build_message(self.exp_message, self.trace_message) - print(message) + # print(message) self.assertIsNotNone(message) self.assertTrue(len(message) > 0) -- GitLab From c458d69d1f5b8fb37428412078888938aa5e71e7 Mon Sep 17 00:00:00 2001 From: Wilmer Uruchi Ticona Date: Tue, 19 Oct 2021 15:39:25 +0200 Subject: [PATCH 7/8] Mostly test changes. 
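
This patch also swaps the ad-hoc energy distribution in write_platform_data_after_finish
for the strategy classes in strategies.py. Below is a minimal sketch of the intended
fallback chain, assuming the names introduced in this series (PlatformInformationHandler,
StraightWrapperAssociationStrategy, TwoDimWrapperDistributionStrategy,
GeneralizedWrapperDistributionStrategy); a strategy returns an empty list when its
preconditions do not hold (e.g. the Slurm step count does not match the wrapped jobs),
so the caller falls through to the next, more general one:

    # Sketch only: mirrors the fallback chain in experiment_history.py.
    info_handler = PlatformInformationHandler(StraightWrapperAssociationStrategy())
    to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor)
    if len(to_update) == 0:
        # Step count mismatch: try the two-dimensional (per-level) wrapper distribution.
        info_handler.strategy = TwoDimWrapperDistributionStrategy()
        to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor)
    if len(to_update) == 0:
        # Last resort: distribute the total energy by computational weight.
        info_handler.strategy = GeneralizedWrapperDistributionStrategy()
        to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor)

Swapping the strategy on the handler keeps the caller free of distribution details: each
strategy decides whether it can handle the monitor output and signals a pass with an
empty result.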
--- autosubmit/autosubmit.py | 58 +++-- autosubmit/history/data_classes/job_data.py | 6 + .../experiment_history_db_manager.py | 9 +- .../experiment_status_db_manager.py | 2 +- .../database_managers/{tests.py => test.py} | 14 +- autosubmit/history/experiment_history.py | 101 ++++----- autosubmit/history/platform_monitor/test.py | 21 +- autosubmit/history/slurm_parser.py | 163 -------------- autosubmit/history/strategies.py | 202 ++++++++++++------ autosubmit/history/{tests.py => test.py} | 38 ++-- autosubmit/history/test_strategies.py | 94 ++++++++ autosubmit/job/job.py | 87 ++++---- autosubmit/job/job_utils.py | 48 ++--- autosubmit/platforms/paramiko_platform.py | 9 +- 14 files changed, 415 insertions(+), 437 deletions(-) rename autosubmit/history/database_managers/{tests.py => test.py} (96%) delete mode 100644 autosubmit/history/slurm_parser.py rename autosubmit/history/{tests.py => test.py} (89%) create mode 100644 autosubmit/history/test_strategies.py diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index c23e757e4..c5c87644a 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -71,7 +71,6 @@ import portalocker from pkg_resources import require, resource_listdir, resource_exists, resource_string from collections import defaultdict from pyparsing import nestedExpr -from database.db_jobdata import JobDataStructure from history.experiment_status import ExperimentStatus from history.experiment_history import ExperimentHistory """ @@ -1368,23 +1367,16 @@ class Autosubmit: if not check_experiment_exists(start_after): return None # Historical Database: We use the historical database to retrieve the current progress data of the supplied expid (start_after) - # JobStructure object, check_only flag to avoid updating remote experiment - jobStructure = JobDataStructure(start_after, check_only=True) - # Check if database exists - if jobStructure.database_exists == False: - Log.critical( - "Experiment {0} does not have a valid database. Make sure that it is running under the latest version of Autosubmit.".format(start_after)) - return - # Check if database version is correct - if jobStructure.is_header_ready_db_version() == False: - Log.critical("Experiment {0} is running DB version {1} which is not supported by the completion trigger function. An updated DB version is needed.".format( - start_after, jobStructure.db_version)) + exp_history = ExperimentHistory(start_after, BasicConfig.JOBDATA_DIR) + if exp_history.is_header_ready() == False: + Log.critical("Experiment {0} is running a database version which is not supported by the completion trigger function. An updated DB version is needed.".format( + start_after)) return Log.info("Autosubmit will start monitoring experiment {0}. When the number of completed jobs plus suspended jobs becomes equal to the total number of jobs of experiment {0}, experiment {1} will start. Querying every 60 seconds. 
Status format Completed/Queuing/Running/Suspended/Failed.".format( start_after, expid)) while True: # Query current run - current_run = jobStructure.get_max_id_experiment_run() + current_run = exp_history.manager.get_experiment_run_dc_with_max_id() if current_run and current_run.finish > 0 and current_run.total > 0 and current_run.completed + current_run.suspended == current_run.total: break else: @@ -1549,15 +1541,14 @@ class Autosubmit: # Before starting main loop, setup historical database tables and main information Log.debug("Running job data structure") try: - # Historical Database: Can create a new run if there is a difference in the number of jobs or if the current state does not exist. - job_data_structure = JobDataStructure(expid) - job_data_structure.validate_current_run(job_list.get_job_list( - ), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) - + # Historical Database: Can create a new run if there is a difference in the number of jobs or if the current run does not exist. + exp_history = ExperimentHistory(expid) + exp_history.initialize_database() + exp_history.process_status_changes(job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) ExperimentStatus(expid).set_as_running() except Exception as e: raise AutosubmitCritical( - "Error while processing job_data_structure", 7067, str(e)) + "Error while processing historical database.", 7067, str(e)) if allowed_members: # Set allowed members after checks have been performed. This triggers the setter and main logic of the -rm feature. job_list.run_members = allowed_members @@ -1761,15 +1752,15 @@ class Autosubmit: if save: job_list.save() # Safe spot to store changes - job_data_structure.process_status_changes( - job_changes_tracker, job_list.get_job_list()) - job_changes_tracker = {} + exp_history = ExperimentHistory(expid, BasicConfig.JOBDATA_DIR) + if len(job_changes_tracker) > 0: + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) if Autosubmit.exit: job_list.save() time.sleep(safetysleeptime) - except AutosubmitError as e: # If an error is detected, restore all connections and job_list + except AutosubmitError as e: # If an error is detected, restore all connections and job_list Log.error("Trace: {0}", e.trace) Log.error("{1} [eCode={0}]", e.code, e.message) Log.info("Waiting 30 seconds before continue") @@ -1861,8 +1852,8 @@ class Autosubmit: raise Log.result("No more jobs to run.") # Updating job data header with current information when experiment ends - job_data_structure.validate_current_run( - job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), must_create=False, only_update=True) + exp_history = ExperimentHistory(expid, BasicConfig.JOBDATA_DIR) + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) # Wait for all remaining threads of I/O, close remaining connections timeout = 0 @@ -1887,7 +1878,7 @@ class Autosubmit: else: Log.result("Run successful") # Updating finish time for job data header - job_data_structure.update_finish_time() + exp_history.finish_current_experiment_run() except portalocker.AlreadyLocked: message = "We have detected that there is another Autosubmit instance using the experiment\n. 
Stop other Autosubmit instances that are using the experiment or delete autosubmit.lock file located on tmp folder" raise AutosubmitCritical(message, 7000) @@ -1903,9 +1894,9 @@ class Autosubmit: Log.info("Checking the connection to all platforms in use") issues = "" for platform in platform_to_test: - try: - platform.test_connection() - except BaseException as e : + try: + platform.test_connection() + except BaseException as e : issues += "\n[{1}] Connection Unsuccessful to host {0} trace".format( platform.host, platform.name) continue @@ -2004,6 +1995,7 @@ class Autosubmit: raise AutosubmitCritical("Invalid parameter substitution in {0} template".format( e.job_name), 7014, e.message) except Exception as e: + print(traceback.format_exc()) raise AutosubmitError("{0} submission failed".format( platform.name), 6015, str(e)) except WrongTemplateException as e: @@ -2100,6 +2092,7 @@ class Autosubmit: except AutosubmitCritical as e: raise except Exception as e: + print(traceback.format_exc()) raise AutosubmitError("{0} submission failed".format( platform.name), 6015, str(e)) try: @@ -2117,7 +2110,7 @@ class Autosubmit: # Saving only when it is a real multi job package packages_persistence.save( package.name, package.jobs, package._expid, inspect) - except Exception as e: + except Exception as e: raise AutosubmitError("{0} submission failed".format( platform.name), 6015, str(e)) return save @@ -3916,8 +3909,9 @@ class Autosubmit: # Setting up job historical database header. Must create a new run. # Historical Database: Setup new run - JobDataStructure(expid).validate_current_run(job_list.get_job_list( - ), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), must_create=True, current_config=as_conf.get_full_config_as_json()) + exp_history = ExperimentHistory(expid, BasicConfig.JOBDATA_DIR) + exp_history.initialize_database() + exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), as_conf.get_full_config_as_json(), job_list.get_job_list()) if not noplot: if group_by: diff --git a/autosubmit/history/data_classes/job_data.py b/autosubmit/history/data_classes/job_data.py index f216fd2c6..b5249b797 100644 --- a/autosubmit/history/data_classes/job_data.py +++ b/autosubmit/history/data_classes/job_data.py @@ -109,6 +109,12 @@ class JobData(object): row.platform_output) return job_data + @property + def children_list(self): + children_list = self.children.split(",") if self.children else [] + result = [str(job_name).strip() for job_name in children_list] + return result + @property def computational_weight(self): return round(float(self.running_time * self.ncpus),4) diff --git a/autosubmit/history/database_managers/experiment_history_db_manager.py b/autosubmit/history/database_managers/experiment_history_db_manager.py index 001fe7b71..e2b5c3bf0 100644 --- a/autosubmit/history/database_managers/experiment_history_db_manager.py +++ b/autosubmit/history/database_managers/experiment_history_db_manager.py @@ -187,6 +187,13 @@ class ExperimentHistoryDbManager(DatabaseManager): if len(max_experiment_run) == 0: raise Exception("No Experiment Runs registered.") return Models.ExperimentRunRow(*max_experiment_run[0]) + + def is_there_a_last_experiment_run(self): + statement = self.get_built_select_statement("experiment_run", "run_id > 0 ORDER BY run_id DESC LIMIT 0, 1") + max_experiment_run = self.get_from_statement(self.historicaldb_file_path, statement) + if len(max_experiment_run) > 0: + return True + return False def get_job_data_all(self): """ Gets all content from 
job_data as list of Models.JobDataRow from database. """
@@ -302,7 +309,7 @@ class ExperimentHistoryDbManager(DatabaseManager):
         Update many job_data rows in bulk. Requires a changes list of argument tuples.
-        Only updates finish, modified, status, and rowstatus by id. """
-        statement = ''' UPDATE job_data SET finish=?, modified=?, status=?, rowstatus=? WHERE id=? '''
+        Only updates modified, status, and rowstatus by id. """
+        statement = ''' UPDATE job_data SET modified=?, status=?, rowstatus=? WHERE id=? '''
         self.execute_many_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, changes)

     def _update_job_data_by_id(self, job_data_dc):
diff --git a/autosubmit/history/database_managers/experiment_status_db_manager.py b/autosubmit/history/database_managers/experiment_status_db_manager.py
index 26402cd71..69d5b4576 100644
--- a/autosubmit/history/database_managers/experiment_status_db_manager.py
+++ b/autosubmit/history/database_managers/experiment_status_db_manager.py
@@ -87,7 +87,7 @@ class ExperimentStatusDbManager(DatabaseManager):
         """ Get Models.ExperimentRow by expid. """
-        experiment_row = self.get_experiment_row_by_expid(expid)
+        experiment_row = self.get_experiment_row_by_expid(expid)
         return self.get_experiment_status_row_by_exp_id(experiment_row.id)

     def get_experiment_row_by_expid(self, expid):
diff --git a/autosubmit/history/database_managers/tests.py b/autosubmit/history/database_managers/test.py
similarity index 96%
rename from autosubmit/history/database_managers/tests.py
rename to autosubmit/history/database_managers/test.py
index 2a4aec4f7..0b1e3a05a 100644
--- a/autosubmit/history/database_managers/tests.py
+++ b/autosubmit/history/database_managers/test.py
@@ -29,8 +29,8 @@ from autosubmit.config.basicConfig import BasicConfig
 import autosubmit.history.utils as HUtils
 EXPID_TT00_SOURCE = "test_database.db~"
 EXPID_TT01_SOURCE = "test_database_no_run.db~"
-EXPID = "tt00"
-EXPID_NONE = "tt01"
+EXPID = "t024"
+EXPID_NONE = "t027"
 BasicConfig.read()
 JOBDATA_DIR = BasicConfig.JOBDATA_DIR
 LOCAL_ROOT_DIR = BasicConfig.LOCAL_ROOT_DIR
@@ -49,7 +49,7 @@ class TestExperimentStatusDatabaseManager(unittest.TestCase):
         self.assertTrue(exp_status_row.exp_id == exp_row_direct.exp_id)

-    def test_update_exp_status(self):
+    def test_update_exp_status(self):
         self.exp_status_db.update_exp_status(EXPID, "RUNNING")
         exp_status_row_current = self.exp_status_db.get_experiment_status_row_by_expid(EXPID)
         self.assertTrue(exp_status_row_current.status == "RUNNING")
@@ -176,14 +176,14 @@ class TestExperimentHistoryDbManager(unittest.TestCase):
         job_data_rows_test = [job for job in all_job_data_rows if job.run_id == 3]
         backup = [JobData.from_model(job) for job in job_data_rows_test]
         list_job_data_class = [JobData.from_model(job) for job in job_data_rows_test]
-        backup_changes = [(job.finish, HUtils.get_current_datetime(), job.status, job.rowstatus, job._id) for job in list_job_data_class]
-        changes = [(current_time, HUtils.get_current_datetime(), "DELAYED", job.rowstatus, job._id) for job in list_job_data_class]
+        backup_changes = [(HUtils.get_current_datetime(), job.status, job.rowstatus, job._id) for job in list_job_data_class]
+        changes = [(HUtils.get_current_datetime(), "DELAYED", job.rowstatus, job._id) for job in list_job_data_class]
         self.experiment_database.update_many_job_data_change_status(changes)
         all_job_data_rows = self.experiment_database.get_job_data_all()
         job_data_rows_validate = [job for job in all_job_data_rows if job.run_id == 3]
         for (job_val, change_item) in zip(job_data_rows_validate, changes):
-            finish, modified, status,
rowstatus, _id = change_item - self.assertTrue(job_val.finish == finish) + modified, status, rowstatus, _id = change_item + # self.assertTrue(job_val.finish == finish) self.assertTrue(job_val.modified == modified) self.assertTrue(job_val.status == status) self.assertTrue(job_val.rowstatus == rowstatus) diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py index a2421a0e1..f436cc53d 100644 --- a/autosubmit/history/experiment_history.py +++ b/autosubmit/history/experiment_history.py @@ -22,6 +22,7 @@ import database_managers.database_models as Models import utils as HUtils from time import time, sleep from database_managers.experiment_history_db_manager import ExperimentHistoryDbManager, DEFAULT_JOBDATA_DIR +from strategies import PlatformInformationHandler, SingleAssociationStrategy, StraightWrapperAssociationStrategy, TwoDimWrapperDistributionStrategy, GeneralizedWrapperDistributionStrategy from data_classes.job_data import JobData from data_classes.experiment_run import ExperimentRun from platform_monitor.slurm_monitor import SlurmMonitor @@ -106,7 +107,6 @@ class ExperimentHistory(): return self.manager.update_job_data_dc_by_id(job_data_dc_last) except Exception as exp: self._log.log(str(exp), traceback.format_exc()) - return None def write_finish_time(self, job_name, finish=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="", member="", section="", chunk=0, platform="NA", job_id=0, out_file=None, err_file=None, @@ -137,8 +137,7 @@ class ExperimentHistory(): job_data_dc_last.err = err_file if err_file else "" return self.manager.update_job_data_dc_by_id(job_data_dc_last) except Exception as exp: - self._log.log(str(exp), traceback.format_exc()) - return None + self._log.log(str(exp), traceback.format_exc()) def write_platform_data_after_finish(self, job_data_dc, platform_obj): """ @@ -148,54 +147,35 @@ class ExperimentHistory(): sleep(SECONDS_WAIT_PLATFORM) ssh_output = platform_obj.check_job_energy(job_data_dc.job_id) slurm_monitor = SlurmMonitor(ssh_output) - job_data_dcs_in_wrapper = self.manager.get_job_data_dcs_last_by_wrapper_code(job_data_dc.wrapper_code) + self._verify_slurm_monitor(slurm_monitor, job_data_dc) + job_data_dcs_in_wrapper = self.manager.get_job_data_dcs_last_by_wrapper_code(job_data_dc.wrapper_code) + job_data_dcs_to_update = [] if len(job_data_dcs_in_wrapper) > 0: - job_data_dcs_in_wrapper = self._distribute_energy_in_wrapper(job_data_dcs_in_wrapper, slurm_monitor) - self.manager.update_list_job_data_dc_by_each_id(job_data_dcs_in_wrapper) + info_handler = PlatformInformationHandler(StraightWrapperAssociationStrategy()) + job_data_dcs_to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor) + if len(job_data_dcs_to_update) == 0: + info_handler.strategy = TwoDimWrapperDistributionStrategy() + job_data_dcs_to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor) + if len(job_data_dcs_to_update) == 0: + info_handler.strategy = GeneralizedWrapperDistributionStrategy() + job_data_dcs_to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor) else: - job_data_dc = self._assign_platform_information_to_job_data_dc(job_data_dc, slurm_monitor) - job_data_dc = self._set_job_as_processed_in_platform(job_data_dc, slurm_monitor) - return self.manager.update_job_data_dc_by_id(job_data_dc) + info_handler = PlatformInformationHandler(SingleAssociationStrategy()) + job_data_dcs_to_update = 
info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor)
+            return self.manager.update_list_job_data_dc_by_each_id(job_data_dcs_to_update)
         except Exception as exp:
-            self._log.log(str(exp), traceback.format_exc())
-            return None
-
-    def _distribute_energy_in_wrapper(self, job_data_dcs, slurm_monitor):
-        """ Requires SlurmMonitor with data. """
-        computational_weights = self._get_calculated_weights_of_jobs_in_wrapper(job_data_dcs)
-        if len(job_data_dcs) == slurm_monitor.step_count:
-            for job_dc, step in zip(job_data_dcs, slurm_monitor.steps):
-                job_dc.energy = step.energy + computational_weights.get(job_dc.job_name, 0) * slurm_monitor.extern.energy
-                job_dc.AveRSS = step.AveRSS
-                job_dc.MaxRSS = step.MaxRSS
-                job_dc.platform_output = ""
-        else:
-            for job_dc in job_data_dcs:
-                job_dc.energy = computational_weights.get(job_dc.job_name, 0) * slurm_monitor.total_energy
-                job_dc.platform_output = ""
-        return job_data_dcs
-
-
-
-
-    def _set_job_as_processed_in_platform(self, job_data_dc, slurm_monitor):
-        """ """
-        job_data_dc.platform_output = slurm_monitor.original_input
-        job_data_dc.rowstatus = Models.RowStatus.PROCESSED
-        return job_data_dc
-
-    def _assign_platform_information_to_job_data_dc(self, job_data_dc, slurm_monitor):
-        """ Basic Assignment. No Wrapper. """
-        job_data_dc.submit = slurm_monitor.header.submit
-        job_data_dc.start = slurm_monitor.header.start
-        job_data_dc.finish = slurm_monitor.header.finish
-        job_data_dc.ncpus = slurm_monitor.header.ncpus
-        job_data_dc.nnodes = slurm_monitor.header.nnodes
-        job_data_dc.energy = slurm_monitor.header.energy
-        job_data_dc.MaxRSS = max(slurm_monitor.header.MaxRSS, slurm_monitor.batch.MaxRSS, slurm_monitor.extern.MaxRSS) # TODO: Improve this rule
-        job_data_dc.AveRSS = max(slurm_monitor.header.AveRSS, slurm_monitor.batch.AveRSS, slurm_monitor.extern.AveRSS)
-        job_data_dc.platform_output = slurm_monitor.original_input
-        return job_data_dc
+            self._log.log(str(exp), traceback.format_exc())
+
+    def _verify_slurm_monitor(self, slurm_monitor, job_data_dc):
+        try:
+            if slurm_monitor.header.status not in ["COMPLETED", "FAILED"]:
+                self._log.log("Assertion Error on job {0} with ssh_output {1}".format(job_data_dc.job_name, slurm_monitor.original_input),
+                              "Slurm status {0} is neither COMPLETED nor FAILED for ID {1}.\n".format(slurm_monitor.header.status, slurm_monitor.header.name))
+            if not slurm_monitor.steps_plus_extern_approximate_header_energy():
+                self._log.log("Assertion Error on job {0} with ssh_output {1}".format(job_data_dc.job_name, slurm_monitor.original_input),
+                              "Steps + extern != total energy for ID {0}. Number of steps {1}.\n".format(slurm_monitor.header.name, slurm_monitor.step_count))
+        except Exception as exp:
+            self._log.log(str(exp), traceback.format_exc())

     def process_status_changes(self, job_list=None, chunk_unit="NA", chunk_size=0, current_config=""):
         """ Detect status differences between job_list and current job_data rows, and update. Creates a new run if necessary.
""" @@ -204,11 +184,16 @@ class ExperimentHistory(): update_these_changes = self._get_built_list_of_changes(job_list) if len(update_these_changes) > 0: self.manager.update_many_job_data_change_status(update_these_changes) - if self.should_we_create_a_new_run(job_list, len(update_these_changes), current_experiment_run_dc.total): + if self.should_we_create_a_new_run(job_list, len(update_these_changes), current_experiment_run_dc.total): return self.create_new_experiment_run(chunk_unit, chunk_size, current_config, job_list) return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list) except Exception as exp: self._log.log(str(exp), traceback.format_exc()) + + def _get_built_list_of_changes(self, job_list): + """ Return: List of (current timestamp, current datetime str, status, rowstatus, id in job_data). One tuple per change. """ + job_data_dcs = self.detect_changes_in_job_list(job_list) + return [(HUtils.get_current_datetime(), job.status, Models.RowStatus.CHANGED, job._id) for job in job_data_dcs] def process_job_list_changes_to_experiment_totals(self, job_list=None): """ Updates current experiment_run row with totals calculated from job_list. """ @@ -238,9 +223,11 @@ class ExperimentHistory(): return self.manager.update_experiment_run_dc_by_id(experiment_run_dc) def finish_current_experiment_run(self): - current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() - current_experiment_run_dc.finish = int(time()) - return self.manager.update_experiment_run_dc_by_id(current_experiment_run_dc) + if self.manager.is_there_a_last_experiment_run(): + current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id() + current_experiment_run_dc.finish = int(time()) + return self.manager.update_experiment_run_dc_by_id(current_experiment_run_dc) + return None def create_new_experiment_run(self, chunk_unit="NA", chunk_size=0, current_config="", job_list=None): """ Also writes the finish timestamp of the previous run. """ @@ -254,6 +241,7 @@ class ExperimentHistory(): chunk_unit=chunk_unit, chunk_size=chunk_size, metadata=current_config, + start=int(time()), completed=status_counts[HUtils.SupportedStatus.COMPLETED], total=status_counts["TOTAL"], failed=status_counts[HUtils.SupportedStatus.FAILED], @@ -263,11 +251,6 @@ class ExperimentHistory(): suspended=status_counts[HUtils.SupportedStatus.SUSPENDED]) return self.manager.register_experiment_run_dc(experiment_run_dc) - def _get_built_list_of_changes(self, job_list): - """ Return: List of (current timestamp, current datetime str, status, rowstatus, id in job_data). One tuple per change. """ - job_data_dcs = self.detect_changes_in_job_list(job_list) - return [(int(time()), HUtils.get_current_datetime(), job.status, Models.RowStatus.CHANGED, job._id) for job in job_data_dcs] - def detect_changes_in_job_list(self, job_list): """ Detect changes in job_list compared to the current contents of job_data table. Returns a list of JobData data classes where the status of each item is the new status.""" job_name_to_job = {job.name: job for job in job_list} @@ -279,10 +262,6 @@ class ExperimentHistory(): differences.append(job_dc) return differences - def update_experiment_history_from_job_list(self, job_list): - """ job_list: List of objects, each object must have attributes name, date, member, status_str, children. 
""" - raise NotImplementedError - def _get_defined_rowtype(self, code): if code: return code diff --git a/autosubmit/history/platform_monitor/test.py b/autosubmit/history/platform_monitor/test.py index a7100243b..2f9100371 100644 --- a/autosubmit/history/platform_monitor/test.py +++ b/autosubmit/history/platform_monitor/test.py @@ -23,8 +23,7 @@ from slurm_monitor import SlurmMonitor class TestSlurmMonitor(unittest.TestCase): def test_reader_on_simple_wrapper_example_1(self): ssh_output = utils.read_example("wrapper1.txt") - slurm_monitor = SlurmMonitor(ssh_output) - slurm_monitor.identify_input_rows() + slurm_monitor = SlurmMonitor(ssh_output) # Header self.assertTrue(slurm_monitor.input_items[0].is_batch == False) self.assertTrue(slurm_monitor.input_items[0].is_detail == False) @@ -43,9 +42,9 @@ class TestSlurmMonitor(unittest.TestCase): self.assertTrue(slurm_monitor.input_items[2].is_extern == True) self.assertTrue(slurm_monitor.input_items[2].is_header == False) self.assertTrue(slurm_monitor.input_items[2].is_detail == True) - header = slurm_monitor.get_header() - batch = slurm_monitor.get_batch() - extern = slurm_monitor.get_extern() + header = slurm_monitor.header + batch = slurm_monitor.batch + extern = slurm_monitor.extern self.assertIsNotNone(header) self.assertIsNotNone(batch) self.assertIsNotNone(extern) @@ -57,8 +56,7 @@ class TestSlurmMonitor(unittest.TestCase): def test_reader_on_simple_wrapper_example_2(self): ssh_output = utils.read_example("wrapper2.txt") # not real - slurm_monitor = SlurmMonitor(ssh_output) - slurm_monitor.identify_input_rows() + slurm_monitor = SlurmMonitor(ssh_output) # Header self.assertTrue(slurm_monitor.input_items[0].is_batch == False) self.assertTrue(slurm_monitor.input_items[0].is_detail == False) @@ -82,11 +80,10 @@ class TestSlurmMonitor(unittest.TestCase): def test_reader_on_big_wrapper(self): ssh_output = utils.read_example("wrapper_big.txt") slurm_monitor = SlurmMonitor(ssh_output) - slurm_monitor.identify_input_rows() - self.assertTrue(slurm_monitor.step_count() == 30) - header = slurm_monitor.get_header() - batch = slurm_monitor.get_batch() - extern = slurm_monitor.get_extern() + self.assertTrue(slurm_monitor.step_count == 30) + header = slurm_monitor.header + batch = slurm_monitor.batch + extern = slurm_monitor.extern self.assertIsNotNone(header) self.assertIsNotNone(batch) self.assertIsNotNone(extern) diff --git a/autosubmit/history/slurm_parser.py b/autosubmit/history/slurm_parser.py deleted file mode 100644 index 3247eefd7..000000000 --- a/autosubmit/history/slurm_parser.py +++ /dev/null @@ -1,163 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2015-2020 Earth Sciences Department, BSC-CNS -# This file is part of Autosubmit. - -# Autosubmit is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# Autosubmit is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with Autosubmit. If not, see . 
- -import os -from collections import namedtuple -from datetime import datetime -from time import mktime, time - -SlurmRow = namedtuple("SlurmRow", ["JobId", "State", "NCPUS", "NNodes", "Submit", "Start", "End", "ConsumedEnergy", "MaxRSS", "AveRSS"]) - -def parse_job_finish_data(output, is_wrapped): - """ - Slurm Command 'sacct -n --jobs {0} -o JobId%25,State,NCPUS,NNodes,Submit,Start,End,ConsumedEnergy,MaxRSS%25,AveRSS%25' - - Only not wrapped jobs return submit, start, finish, joules, ncpus, nnodes. - - ConsumedEnergy: Total energy consumed by all tasks in job, in joules. Note: Only in case of exclusive job allocation this value reflects the jobs' real energy consumption. - - :return: submit, start, finish, joules, ncpus, nnodes, detailed_data - :rtype: int, int, int, int, int, int, json object (str) - """ - try: - # Setting up: Storing detail for posterity - detailed_data = dict() - steps = [] - # No blank spaces after or before - output = str(output).strip() if output else None - lines = output.split("\n") if output else [] - is_end_of_wrapper = False - # If there is output, list exists - if len(lines) > 0: - # Collecting information from all output - for line in lines: - line = line.strip().split() - if len(line) > 0: - # Collecting detailed data - name = str(line[0]) - extra_data = {"ncpus": str(line[2] if len(line) > 2 else "NA"), - "nnodes": str(line[3] if len(line) > 3 else "NA"), - "submit": str(line[4] if len(line) > 4 else "NA"), - "start": str(line[5] if len(line) > 5 else "NA"), - "finish": str(line[6] if len(line) > 6 else "NA"), - "energy": str(line[7] if len(line) > 7 else "NA"), - "MaxRSS": str(line[8] if len(line) > 8 else "NA"), - "AveRSS": str(line[9] if len(line) > 9 else "NA")} - # Detailed data will contain the important information from output - detailed_data[name] = extra_data - steps.append(name) - submit = start = finish = energy = nnodes = ncpus = 0 - status = "UNKNOWN" - # Take first line as source - line = lines[0].strip().split() - ncpus = int(line[2] if len(line) > 2 else 0) - nnodes = int(line[3] if len(line) > 3 else 0) - status = str(line[1]) - if is_wrapped == False: - # If it is not wrapper job, take first line as source - if status not in ["COMPLETED", "FAILED", "UNKNOWN"]: - # It not completed, then its error and send default data plus output - return (0, 0, 0, 0, ncpus, nnodes, detailed_data, False) - else: - # If it is a is_wrapped job - # Check if the wrapper has finished - if status in ["COMPLETED", "FAILED", "UNKNOWN"]: - # Wrapper has finished - is_end_of_wrapper = True - # Continue with first line as source - if line: - try: - # Parse submit and start only for normal jobs (not is_wrapped) - submit = int(mktime(datetime.strptime( - line[4], "%Y-%m-%dT%H:%M:%S").timetuple())) if not is_wrapped else 0 - start = int(mktime(datetime.strptime( - line[5], "%Y-%m-%dT%H:%M:%S").timetuple())) if not is_wrapped else 0 - # Assuming the job has been COMPLETED - # If normal job or end of wrapper => Try to get the finish time from the first line of the output, else default to now. 
-                finish = 0
-
-                if not is_wrapped:
-                    # If normal job, take finish time from first line
-                    finish = (int(mktime(datetime.strptime(line[6], "%Y-%m-%dT%H:%M:%S").timetuple(
-                    ))) if len(line) > 6 and line[6] != "Unknown" else int(time()))
-                    energy = parse_output_number(line[7]) if len(
-                        line) > 7 and len(line[7]) > 0 else 0
-                else:
-                    # If it is a wrapper job
-                    # If end of wrapper, take data from first line
-                    if is_end_of_wrapper == True:
-                        finish = (int(mktime(datetime.strptime(line[6], "%Y-%m-%dT%H:%M:%S").timetuple(
-                        ))) if len(line) > 6 and line[6] != "Unknown" else int(time()))
-                        energy = parse_output_number(line[7]) if len(
-                            line) > 7 and len(line[7]) > 0 else 0
-                    else:
-                        # If wrapped but not end of wrapper, try to get info from current data.
-                        if "finish" in extra_data.keys() and extra_data["finish"] != "Unknown":
-                            # finish data exists
-                            finish = int(mktime(datetime.strptime(
-                                extra_data["finish"], "%Y-%m-%dT%H:%M:%S").timetuple()))
-                        else:
-                            # if finish date does not exist, query previous step.
-                            if len(steps) >= 2 and detailed_data.__contains__(steps[-2]):
-                                new_extra_data = detailed_data[steps[-2]]
-                                if "finish" in new_extra_data.keys() and new_extra_data["finish"] != "Unknown":
-                                    # This might result in an job finish < start, need to handle that in the caller function
-                                    finish = int(mktime(datetime.strptime(
-                                        new_extra_data["finish"], "%Y-%m-%dT%H:%M:%S").timetuple()))
-                                else:
-                                    finish = int(time())
-                            else:
-                                finish = int(time())
-                        if "energy" in extra_data.keys() and extra_data["energy"] != "NA":
-                            # energy exists
-                            energy = parse_output_number(
-                                extra_data["energy"])
-                        else:
-                            # if energy does not exist, query previous step
-                            if len(steps) >= 2 and detailed_data.__contains__(steps[-2]):
-                                new_extra_data = detailed_data[steps[-2]]
-                                if "energy" in new_extra_data.keys() and new_extra_data["energy"] != "NA":
-                                    energy = parse_output_number(
-                                        new_extra_data["energy"])
-                                else:
-                                    energy = 0
-                            else:
-                                energy = 0
-            except Exception as exp:
-                pass
-
-            detailed_data = detailed_data if not is_wrapped or is_end_of_wrapper == True else extra_data
-            return (submit, start, finish, energy, ncpus, nnodes, detailed_data, is_end_of_wrapper)
-
-        return (0, 0, 0, 0, 0, 0, dict(), False)
-    except Exception as exp:
-        return (0, 0, 0, 0, 0, 0, dict(), False)
-
-
-
-
-def read_example(example_name):
-    source_path = "autosubmit/history/output_examples/"
-    file_path = os.path.join(source_path, example_name)
-    with open(file_path, "r") as fp:
-        output_ssh = fp.read()
-    return output_ssh
-
-if __name__ == "__main__":
-    output_ssh = read_example("pending.txt")
-    print(parse_job_finish_data(output_ssh, True))
\ No newline at end of file
diff --git a/autosubmit/history/strategies.py b/autosubmit/history/strategies.py
index 1470d8a95..976904f74 100644
--- a/autosubmit/history/strategies.py
+++ b/autosubmit/history/strategies.py
@@ -18,6 +18,8 @@
 from abc import ABCMeta, abstractmethod
 import database_managers.database_models as Models
+import traceback
+from internal_logging import Logging
 
 
 class PlatformInformationHandler():
     def __init__(self, strategy):
@@ -32,7 +34,7 @@ class PlatformInformationHandler():
         self._strategy = strategy
 
     def execute_distribution(self, job_data_dc, job_data_dcs_in_wrapper, slurm_monitor):
-        result = self._strategy.apply_distribution()
+        return self._strategy.apply_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor)
 
 
 class Strategy():
@@ -45,89 +47,147 @@ class Strategy():
 
     def set_job_data_dc_as_processed(self, job_data_dc, original_ssh_output):
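+        # Keep the raw sacct output on the job data object so a processed (or
+        # faulty) distribution can later be audited against what Slurm returned.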
         job_data_dc.platform_output = original_ssh_output
-        job_data_dc.row_status = Models.RowStatus.PROCESSED
+        job_data_dc.rowstatus = Models.RowStatus.PROCESSED
+        return job_data_dc
+
+    def set_job_data_dc_as_process_failed(self, job_data_dc, original_ssh_output):
+        job_data_dc.platform_output = original_ssh_output
+        job_data_dc.rowstatus = Models.RowStatus.FAULTY
         return job_data_dc
 
     def get_calculated_weights_of_jobs_in_wrapper(self, job_data_dcs_in_wrapper):
         """ Based on computational weight: running time in seconds * number of cpus. """
         total_weight = sum(job.computational_weight for job in job_data_dcs_in_wrapper)
-        return {job.job_name: round(job.computational_weight/total_weight, 4) for job in job_data_dcs_in_wrapper}
-
+        return {job.job_name: round(job.computational_weight/total_weight, 4) for job in job_data_dcs_in_wrapper}
+    
 
-class SimpleAssociationStrategy(Strategy):
+class SingleAssociationStrategy(Strategy):
 
     def apply_distribution(self, job_data_dc, job_data_dcs_in_wrapper, slurm_monitor):
-        if len(job_data_dcs_in_wrapper) > 0:
-            return []
-        job_data_dc.submit = slurm_monitor.header.submit
-        job_data_dc.start = slurm_monitor.header.start
-        job_data_dc.finish = slurm_monitor.header.finish
-        job_data_dc.ncpus = slurm_monitor.header.ncpus
-        job_data_dc.nnodes = slurm_monitor.header.nnodes
-        job_data_dc.energy = slurm_monitor.header.energy
-        job_data_dc.MaxRSS = max(slurm_monitor.header.MaxRSS, slurm_monitor.batch.MaxRSS, slurm_monitor.extern.MaxRSS) # TODO: Improve this rule
-        job_data_dc.AveRSS = max(slurm_monitor.header.AveRSS, slurm_monitor.batch.AveRSS, slurm_monitor.extern.AveRSS)
-        job_data_dc = self.set_job_data_dc_as_processed(job_data_dc, slurm_monitor.original_input)
-        return [job_data_dc]
-
-class StraightAssociationStrategy(Strategy):
+        try:
+            if len(job_data_dcs_in_wrapper) > 0:
+                return []
+            job_data_dc.submit = slurm_monitor.header.submit
+            job_data_dc.start = slurm_monitor.header.start
+            job_data_dc.finish = slurm_monitor.header.finish
+            job_data_dc.ncpus = slurm_monitor.header.ncpus
+            job_data_dc.nnodes = slurm_monitor.header.nnodes
+            job_data_dc.energy = slurm_monitor.header.energy
+            job_data_dc.MaxRSS = max(slurm_monitor.header.MaxRSS, slurm_monitor.batch.MaxRSS if slurm_monitor.batch else 0, slurm_monitor.extern.MaxRSS if slurm_monitor.extern else 0) # TODO: Improve this rule
+            job_data_dc.AveRSS = max(slurm_monitor.header.AveRSS, slurm_monitor.batch.AveRSS if slurm_monitor.batch else 0, slurm_monitor.extern.AveRSS if slurm_monitor.extern else 0)
+            job_data_dc = self.set_job_data_dc_as_processed(job_data_dc, slurm_monitor.original_input)
+            return [job_data_dc]
+        except Exception as exp:
+            Logging("strategies").log("SingleAssociationStrategy failed for {0}. Using ssh_output: {1}. Exception message: {2}".format(job_data_dc.job_name, slurm_monitor.original_input, str(exp)),
+                                      traceback.format_exc())
+            job_data_dc = self.set_job_data_dc_as_process_failed(job_data_dc, slurm_monitor.original_input)
+            return [job_data_dc]
+
+class StraightWrapperAssociationStrategy(Strategy):
 
     def apply_distribution(self, job_data_dc, job_data_dcs_in_wrapper, slurm_monitor):
         """ """
-        if len(job_data_dcs_in_wrapper) != slurm_monitor.step_count:
-            return []
-        result = []
-        computational_weights = self.get_calculated_weights_of_jobs_in_wrapper(job_data_dcs_in_wrapper)
-        for job_dc, step in zip(job_data_dcs_in_wrapper, slurm_monitor.steps):
-            job_dc.energy = step.energy + computational_weights.get(job_dc.job_name, 0) * slurm_monitor.extern.energy
-            job_dc.AveRSS = step.AveRSS
-            job_dc.MaxRSS = step.MaxRSS
-            job_dc.platform_output = ""
-            result.append(job_dc)
-        job_data_dc = self.set_job_data_dc_as_processed(job_data_dc, slurm_monitor.original_input)
-        result.append(job_data_dc)
-        return result
-
-class GeneralizedDistributionStrategy(Strategy):
+        try:
+            if len(job_data_dcs_in_wrapper) != slurm_monitor.step_count:
+                return []
+            result = []
+            computational_weights = self.get_calculated_weights_of_jobs_in_wrapper(job_data_dcs_in_wrapper)
+            for job_dc, step in zip(job_data_dcs_in_wrapper, slurm_monitor.steps):
+                job_dc.energy = step.energy + computational_weights.get(job_dc.job_name, 0) * slurm_monitor.extern.energy
+                job_dc.AveRSS = step.AveRSS
+                job_dc.MaxRSS = step.MaxRSS
+                job_dc.platform_output = ""
+                result.append(job_dc)
+            job_data_dc = self.set_job_data_dc_as_processed(job_data_dc, slurm_monitor.original_input)
+            result.append(job_data_dc)
+            return result
+        except Exception as exp:
+            Logging("strategies").log("StraightWrapperAssociationStrategy failed for {0}. Using ssh_output: {1}. Exception message: {2}".format(job_data_dc.job_name, slurm_monitor.original_input, str(exp)),
+                                      traceback.format_exc())
+            job_data_dc = self.set_job_data_dc_as_process_failed(job_data_dc, slurm_monitor.original_input)
+            return [job_data_dc]
+
+class GeneralizedWrapperDistributionStrategy(Strategy):
 
     def apply_distribution(self, job_data_dc, job_data_dcs_in_wrapper, slurm_monitor):
-        result = []
-        computational_weights = self.get_calculated_weights_of_jobs_in_wrapper(job_data_dcs_in_wrapper)
-        for job_dc in job_data_dcs_in_wrapper:
-            job_dc.energy = computational_weights.get(job_dc.job_name, 0) * slurm_monitor.total_energy
-            job_dc.platform_output = ""
-            result.append(job_dc)
-        job_data_dc = self.set_job_data_dc_as_processed(job_data_dc, slurm_monitor.original_input)
-        result.append(job_data_dc)
-        return result
+        try:
+            result = []
+            computational_weights = self.get_calculated_weights_of_jobs_in_wrapper(job_data_dcs_in_wrapper)
+            for job_dc in job_data_dcs_in_wrapper:
+                job_dc.energy = round(computational_weights.get(job_dc.job_name, 0) * slurm_monitor.total_energy,2)
+                job_dc.platform_output = ""
+                result.append(job_dc)
+            job_data_dc = self.set_job_data_dc_as_processed(job_data_dc, slurm_monitor.original_input)
+            result.append(job_data_dc)
+            return result
+        except Exception as exp:
+            Logging("strategies").log("GeneralizedWrapperDistributionStrategy failed for {0}. Using ssh_output: {1}. Exception message: {2}".format(job_data_dc.job_name, slurm_monitor.original_input, str(exp)),
+                                      traceback.format_exc())
+            job_data_dc = self.set_job_data_dc_as_process_failed(job_data_dc, slurm_monitor.original_input)
+            return [job_data_dc]
 
 class TwoDimWrapperDistributionStrategy(Strategy):
 
     def apply_distribution(self, job_data_dc, job_data_dcs_in_wrapper, slurm_monitor):
-        result = []
-        # Challenge: Get jobs per level and then distribute energy
-        return result
+        try:
+            result = []
+            self.jobs_per_level = self.get_jobs_per_level(job_data_dcs_in_wrapper)
+            if len(self.jobs_per_level) != slurm_monitor.step_count:
+                return []
+            comp_weight_per_level = self.get_comp_weight_per_level(self.jobs_per_level)
+            level_energy = []
+            for i, step in enumerate(slurm_monitor.steps):
+                level_energy.append(step.energy + comp_weight_per_level[i] * slurm_monitor.extern.energy)
+            for i, jobs in enumerate(self.jobs_per_level):
+                weights = self.get_comp_weight_per_group_of_job_dcs(jobs)
+                for j, job_dc in enumerate(jobs):
+                    job_dc.energy = round(level_energy[i] * weights[j], 2)
+                    result.append(job_dc)
+            job_data_dc = self.set_job_data_dc_as_processed(job_data_dc, slurm_monitor.original_input)
+            result.append(job_data_dc)
+            return result
+        except Exception as exp:
+            Logging("strategies").log("TwoDimWrapperDistributionStrategy failed for {0}. Using ssh_output: {1}. Exception message: {2}".format(job_data_dc.job_name, slurm_monitor.original_input, str(exp)),
+                                      traceback.format_exc())
+            job_data_dc = self.set_job_data_dc_as_process_failed(job_data_dc, slurm_monitor.original_input)
+            return [job_data_dc]
 
     def get_jobs_per_level(self, job_data_dcs_in_wrapper):
-        job_name_to_children_names = {job.job_name:job.children.split(",") for job in job_data_dcs_in_wrapper}
-        children_names = []
-        for job_name in job_name_to_children_names:
-            children_names.extend(job_name_to_children_names[job_name])
-
-
-
-
-
-
-
-
-
+        """ List of Lists, index of list is the level. """
+        job_name_to_object = {job.job_name: job for job in job_data_dcs_in_wrapper}
+        levels = []
+        roots_dcs = self._get_roots(job_data_dcs_in_wrapper)
+        levels.append(roots_dcs)
+        next_level = self.get_level(roots_dcs, job_name_to_object)
+        while len(next_level) > 0:
+            levels.append([job for job in next_level])
+            next_level = self.get_level(next_level, job_name_to_object)
+        return levels
+
+    def _get_roots(self, job_data_dcs_in_wrapper):
+        children_names = self._get_all_children(job_data_dcs_in_wrapper)
+        return [job for job in job_data_dcs_in_wrapper if job.job_name not in children_names]
 
-def simple_association_strategy(job_data_dc, slurm_monitor):
-    job_data_dc.submit = slurm_monitor.header.submit
-    job_data_dc.start = slurm_monitor.header.start
-    job_data_dc.finish = slurm_monitor.header.finish
-    job_data_dc.ncpus = slurm_monitor.header.ncpus
-    job_data_dc.nnodes = slurm_monitor.header.nnodes
-    job_data_dc.energy = slurm_monitor.header.energy
-    job_data_dc.MaxRSS = max(slurm_monitor.header.MaxRSS, slurm_monitor.batch.MaxRSS, slurm_monitor.extern.MaxRSS) # TODO: Improve this rule
-    job_data_dc.AveRSS = max(slurm_monitor.header.AveRSS, slurm_monitor.batch.AveRSS, slurm_monitor.extern.AveRSS)
-    job_data_dc.platform_output = slurm_monitor.original_input
-    return job_data_dc
\ No newline at end of file
+    def _get_all_children(self, job_data_dcs_in_wrapper):
+        result = []
+        for job_dc in job_data_dcs_in_wrapper:
+            result.extend(job_dc.children_list)
+        return result
+
+    def get_comp_weight_per_group_of_job_dcs(self, jobs):
+        total = sum(job.computational_weight for job in jobs)
+        return [round(job.computational_weight/total, 4) for job in jobs]
+
+    def get_comp_weight_per_level(self, jobs_per_level):
+        level_weight = []
+        total_weight = 0
+        for jobs in jobs_per_level:
+            computational_weight = sum(job.computational_weight for job in jobs)
+            total_weight += computational_weight
+            level_weight.append(computational_weight)
+        return [round(weight/total_weight, 4) for weight in level_weight]
+
+    def get_level(self, previous_level_dcs, job_name_to_object):
+        children_names = []
+        for job_dc in previous_level_dcs:
+            children_names.extend(job_dc.children_list)
+        level_dcs = [job_name_to_object[job_name] for job_name in children_names if job_name in job_name_to_object]
+        return level_dcs
+
+    
\ No newline at end of file
diff --git a/autosubmit/history/tests.py b/autosubmit/history/test.py
similarity index 89%
rename from autosubmit/history/tests.py
rename to autosubmit/history/test.py
index 5b7b26cea..b12112e71 100644
--- a/autosubmit/history/tests.py
+++ b/autosubmit/history/test.py
@@ -24,6 +24,7 @@ from shutil import copy2
 from collections import namedtuple
 from experiment_history import ExperimentHistory
 from internal_logging import Logging
+from strategies import StraightWrapperAssociationStrategy, GeneralizedWrapperDistributionStrategy, PlatformInformationHandler
 from autosubmit.config.basicConfig import BasicConfig
 from platform_monitor.slurm_monitor import SlurmMonitor
 EXPID_TT00_SOURCE = "test_database.db~"
@@ -97,7 +98,7 @@ class TestExperimentHistory(unittest.TestCase):
         built_differences = exp_history._get_built_list_of_changes(self.job_list)
         expected_ids_differences = [90, 101]
         for item in built_differences:
-            self.assertTrue(item[4] in expected_ids_differences)
+            self.assertTrue(item[3] in expected_ids_differences)
 
     def test_get_date_member_count(self):
         exp_history = ExperimentHistory("tt00")
@@ -159,26 +160,11 @@ class TestExperimentHistory(unittest.TestCase):
         exp_history = ExperimentHistory("tt00")
         exp_history.initialize_database()
         job_data_dcs = exp_history.manager.get_all_last_job_data_dcs()
-        calculated_weights = exp_history._get_calculated_weights_of_jobs_in_wrapper(job_data_dcs)
+        calculated_weights = GeneralizedWrapperDistributionStrategy().get_calculated_weights_of_jobs_in_wrapper(job_data_dcs)
         sum_comp_weight = 0
         for job_name in calculated_weights:
             sum_comp_weight += calculated_weights[job_name]
         self.assertTrue(abs(sum_comp_weight - 1) <= 0.01)
-
-    def test_assign_platform_information_to_job_data_dc(self):
-        exp_history = ExperimentHistory("tt00")
-        exp_history.initialize_database()
-        ssh_output = '''     12535498 COMPLETED 2 1 2020-11-18T13:54:24 2020-11-18T13:55:55 2020-11-18T13:56:10 2.77K
-          12535498.batch COMPLETED 2 1 2020-11-18T13:55:55 2020-11-18T13:55:55 2020-11-18T13:56:10 2.69K 659K 659K
-          12535498.extern COMPLETED 2 1 2020-11-18T13:55:55 2020-11-18T13:55:55 2020-11-18T13:56:10 2.77K 24K 24K '''
-        slurm_monitor = SlurmMonitor(ssh_output)
-        job_data_dc = exp_history.manager.get_job_data_dc_unique_latest_by_job_name("a29z_20000101_fc1_1_CLEAN")
-        job_data_dc_result = exp_history._assign_platform_information_to_job_data_dc(job_data_dc, slurm_monitor)
-        self.assertTrue(job_data_dc_result.job_name == job_data_dc.job_name)
-        self.assertTrue(job_data_dc_result.energy == slurm_monitor.header.energy)
-        self.assertTrue(job_data_dc_result.status == "COMPLETED")
-        self.assertTrue(slurm_monitor.header.energy == 2770)
-        self.assertTrue(job_data_dc_result.MaxRSS == 659000)
 
     def test_distribute_energy_in_wrapper_1_to_1(self):
         exp_history = ExperimentHistory("tt00")
@@ -193,8 +179,9 @@ class TestExperimentHistory(unittest.TestCase):
         '''
         slurm_monitor = SlurmMonitor(ssh_output)
         job_data_dcs = exp_history.manager.get_all_last_job_data_dcs()[:4] # Get me 4 jobs
-        weights = exp_history._get_calculated_weights_of_jobs_in_wrapper(job_data_dcs)
-        job_data_dcs_with_data = exp_history._distribute_energy_in_wrapper(job_data_dcs, slurm_monitor)
+        weights = StraightWrapperAssociationStrategy().get_calculated_weights_of_jobs_in_wrapper(job_data_dcs)
+        info_handler = PlatformInformationHandler(StraightWrapperAssociationStrategy())
+        job_data_dcs_with_data = info_handler.execute_distribution(job_data_dcs[0], job_data_dcs, slurm_monitor)
         self.assertTrue(job_data_dcs_with_data[0].energy == round(slurm_monitor.steps[0].energy + weights[job_data_dcs_with_data[0].job_name]*slurm_monitor.extern.energy, 2))
         self.assertTrue(job_data_dcs_with_data[0].MaxRSS == slurm_monitor.steps[0].MaxRSS)
         self.assertTrue(job_data_dcs_with_data[2].energy == round(slurm_monitor.steps[2].energy + weights[job_data_dcs_with_data[2].job_name]*slurm_monitor.extern.energy, 2))
@@ -213,15 +200,16 @@ class TestExperimentHistory(unittest.TestCase):
         '''
         slurm_monitor = SlurmMonitor(ssh_output)
         job_data_dcs = exp_history.manager.get_all_last_job_data_dcs()[:5] # Get me 5 jobs
-        weights = exp_history._get_calculated_weights_of_jobs_in_wrapper(job_data_dcs)
-        job_data_dcs_with_data = exp_history._distribute_energy_in_wrapper(job_data_dcs, slurm_monitor)
+        weights = GeneralizedWrapperDistributionStrategy().get_calculated_weights_of_jobs_in_wrapper(job_data_dcs)
+        # print(sum(weights[k] for k in weights))
+        info_handler = PlatformInformationHandler(GeneralizedWrapperDistributionStrategy())
+        job_data_dcs_with_data = info_handler.execute_distribution(job_data_dcs[0], job_data_dcs, slurm_monitor)
         self.assertTrue(job_data_dcs_with_data[0].energy == round(slurm_monitor.total_energy * weights[job_data_dcs_with_data[0].job_name], 2))
         self.assertTrue(job_data_dcs_with_data[1].energy == round(slurm_monitor.total_energy * weights[job_data_dcs_with_data[1].job_name], 2))
         self.assertTrue(job_data_dcs_with_data[2].energy == round(slurm_monitor.total_energy * weights[job_data_dcs_with_data[2].job_name], 2))
         self.assertTrue(job_data_dcs_with_data[3].energy == round(slurm_monitor.total_energy * weights[job_data_dcs_with_data[3].job_name], 2))
-        sum_energy = sum(job.energy for job in job_data_dcs_with_data)
-        print(sum_energy)
-        print(slurm_monitor.total_energy)
+        self.assertTrue(job_data_dcs_with_data[4].energy == round(slurm_monitor.total_energy * weights[job_data_dcs_with_data[4].job_name], 2))
+        sum_energy = sum(job.energy for job in job_data_dcs_with_data[:5]) # Last 1 is original job_data_dc
         self.assertTrue(abs(sum_energy - slurm_monitor.total_energy) <= 10)
 
     def test_process_status_changes(self):
@@ -231,7 +219,7 @@ class TestExperimentHistory(unittest.TestCase):
         CHUNK_SIZE = 20
         CURRENT_CONFIG = "CURRENT CONFIG"
         current_experiment_run_dc = exp_history.manager.get_experiment_run_dc_with_max_id()
-        exp_run = exp_history.process_status_changes(job_list=self.job_list, chunk_unit=CHUNK_UNIT, chunk_size=CHUNK_SIZE, current_config=CURRENT_CONFIG) # Generates new run 
+        exp_run = exp_history.process_status_changes(job_list=self.job_list, chunk_unit=CHUNK_UNIT, chunk_size=CHUNK_SIZE, current_config=CURRENT_CONFIG) # Generates new run
         self.assertTrue(current_experiment_run_dc.run_id != exp_run.run_id)
         self.assertTrue(exp_run.chunk_unit == CHUNK_UNIT)
         self.assertTrue(exp_run.metadata == CURRENT_CONFIG)
diff --git a/autosubmit/history/test_strategies.py b/autosubmit/history/test_strategies.py
new file mode 100644
index 000000000..d84d46b15
--- /dev/null
+++ b/autosubmit/history/test_strategies.py
@@ -0,0 +1,94 @@
+#!/usr/bin/python
+
+# Copyright 2015-2020 Earth Sciences Department, BSC-CNS
+# This file is part of Autosubmit.
+
+# Autosubmit is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# Autosubmit is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with Autosubmit. If not, see <http://www.gnu.org/licenses/>.
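+
+# The Test2DWrapperDistributionStrategy fixture below encodes two independent
+# dependency chains among ten wrapped jobs, so the strategy is expected to
+# detect two root jobs and five levels in total; test_energy_distribution then
+# checks the per-level split of the sacct steps in its ssh_output sample.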
+
+import unittest
+from collections import namedtuple
+from data_classes.job_data import JobData
+from strategies import StraightWrapperAssociationStrategy, GeneralizedWrapperDistributionStrategy, PlatformInformationHandler, TwoDimWrapperDistributionStrategy
+from platform_monitor.slurm_monitor import SlurmMonitor
+job_dc = namedtuple("Job", ["job_name", "date", "member", "status_str", "children", "children_list"])
+
+class Test2DWrapperDistributionStrategy(unittest.TestCase):
+    def setUp(self):
+        self.strategy = TwoDimWrapperDistributionStrategy()
+        self.job_data_dcs_in_wrapper = [
+            JobData(0, job_name="a29z_20000101_fc2_1_POSTR", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children="a29z_20000101_fc1_1_CLEAN, a29z_20000101_fc3_1_POST"),
+            JobData(0, job_name="a29z_20000101_fc1_1_CLEAN", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children="a29z_20000101_fc2_1_CLEAN"),
+            JobData(0, job_name="a29z_20000101_fc3_1_POST", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children="a29z_20000101_fc0_3_SIM"),
+            JobData(0, job_name="a29z_20000101_fc2_1_CLEAN", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children=""),
+            JobData(0, job_name="a29z_20000101_fc0_3_SIM", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children=""),
+            JobData(0, job_name="a29z_20000101_fc1_2_POSTR1", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children="a29z_20000101_fc1_5_POST2"),
+            JobData(0, job_name="a29z_20000101_fc1_5_POST2", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children="a29z_20000101_fc1_4_POST3"),
+            JobData(0, job_name="a29z_20000101_fc1_4_POST3", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children="a29z_20000101_fc2_5_CLEAN4"),
+            JobData(0, job_name="a29z_20000101_fc2_5_CLEAN4", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children="a29z_20000101_fc0_1_POST5"),
+            JobData(0, job_name="a29z_20000101_fc0_1_POST5", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children=""),
+        ]
+
+    def test_get_all_children(self):
+        children = self.strategy._get_all_children(self.job_data_dcs_in_wrapper)
+        self.assertTrue(len(children) == 8)
+
+    def test_get_roots(self):
+        roots = self.strategy._get_roots(self.job_data_dcs_in_wrapper)
+        self.assertTrue(len(roots) == 2)
+
+    def test_get_level(self):
+        roots = self.strategy._get_roots(self.job_data_dcs_in_wrapper)
+        job_name_to_object = {job.job_name: job for job in self.job_data_dcs_in_wrapper}
+        next_level = self.strategy.get_level(roots, job_name_to_object)
+        self.assertTrue(len(next_level) == 3)
+
+    def test_get_jobs_per_level(self):
+        levels = self.strategy.get_jobs_per_level(self.job_data_dcs_in_wrapper)
+        for level in levels:
+            print([job.job_name for job in level])
+        self.assertTrue(len(levels) == 5)
+        self.assertTrue("a29z_20000101_fc0_1_POST5" in [job.job_name for job in levels[4]])
+
+    def test_energy_distribution(self):
+        ssh_output = '''       17857525 COMPLETED 10 1 2021-10-13T15:51:16 2021-10-13T15:51:17 2021-10-13T15:52:47 2.62K
+          17857525.batch COMPLETED 10 1 2021-10-13T15:51:17 2021-10-13T15:51:17 2021-10-13T15:52:47 1.88K 6264K 6264K
+          17857525.extern COMPLETED 10 1 2021-10-13T15:51:17 2021-10-13T15:51:17 2021-10-13T15:52:47 1.66K 473K 68K
+          17857525.0 COMPLETED 10 1 2021-10-13T15:51:21 2021-10-13T15:51:21 2021-10-13T15:51:22 186 352K 312.30K
+          17857525.1 COMPLETED 10 1 2021-10-13T15:51:23 2021-10-13T15:51:23 2021-10-13T15:51:24 186 420K 306.70K
+          17857525.2 COMPLETED 10 1 2021-10-13T15:51:24 2021-10-13T15:51:24 2021-10-13T15:51:27 188 352K 325.80K
+          17857525.3 COMPLETED 10 1 2021-10-13T15:51:28 2021-10-13T15:51:28 2021-10-13T15:51:29 192 352K 341.90K
+          17857525.4 COMPLETED 10 1 2021-10-13T15:51:28 2021-10-13T15:51:28 2021-10-13T15:51:29 210 352K 341.90K
+        '''
+        slurm_monitor = SlurmMonitor(ssh_output)
+        info_handler = PlatformInformationHandler(TwoDimWrapperDistributionStrategy())
+        job_dcs = info_handler.execute_distribution(self.job_data_dcs_in_wrapper[0], self.job_data_dcs_in_wrapper, slurm_monitor)
+        for job in job_dcs:
+            print("{0} -> {1} and {2} : ncpus {3} running {4}".format(job.job_name, job.energy, job.rowstatus, job.ncpus, job.running_time))
+        for level in info_handler.strategy.jobs_per_level:
+            print([job.job_name for job in level])
+        total_in_jobs = sum(job.energy for job in job_dcs[:-1]) # ignore last
+        self.assertTrue(abs(total_in_jobs - slurm_monitor.total_energy) <= 10)
+        self.assertTrue(abs(job_dcs[0].energy - 259) < 1)
+        self.assertTrue(abs(job_dcs[1].energy - 259) < 1)
+        self.assertTrue(abs(job_dcs[2].energy - 228) < 1)
+        self.assertTrue(abs(job_dcs[3].energy - 228) < 1)
+        self.assertTrue(abs(job_dcs[4].energy - 228) < 1)
+        self.assertTrue(abs(job_dcs[5].energy - 228.67) < 1)
+        self.assertTrue(abs(job_dcs[6].energy - 228.67) < 1)
+        self.assertTrue(abs(job_dcs[7].energy - 228.67) < 1)
+        self.assertTrue(abs(job_dcs[8].energy - 358) < 1)
+        self.assertTrue(abs(job_dcs[9].energy - 376) < 1)
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py
index 8de6e47e4..2cd12f080 100644
--- a/autosubmit/job/job.py
+++ b/autosubmit/job/job.py
@@ -36,8 +36,9 @@ from autosubmit.config.config_common import AutosubmitConfig
 from autosubmit.job.job_common import Status, Type, increase_wallclock_by_chunk
 from autosubmit.job.job_common import StatisticsSnippetBash, StatisticsSnippetPython
 from autosubmit.job.job_common import StatisticsSnippetR, StatisticsSnippetEmpty
+from autosubmit.job.job_utils import get_job_package_code
 from autosubmit.config.basicConfig import BasicConfig
-from autosubmit.database.db_jobdata import JobDataStructure
+from autosubmit.history.experiment_history import ExperimentHistory
 from bscearth.utils.date import date2str, parse_date, previous_day, chunk_end_date, chunk_start_date, Log, subs_dates
 from time import sleep
 from threading import Thread
@@ -174,6 +175,13 @@ class Job(object):
         """
         return self._parents
 
+    @parents.setter
+    def parents(self, parents):
+        """
+        Sets the parents job list
+        """
+        self._parents = parents
+
     @property
     def status_str(self):
         """
@@ -188,13 +196,6 @@ class Job(object):
         """
         return ",".join([str(child.name) for child in self._children])
 
-    @parents.setter
-    def parents(self, parents):
-        """
-        Sets the parents job list
-        """
-        self._parents = parents
-
     @property
     def is_serial(self):
         return str(self.processors) == '1'
@@ -1252,9 +1253,11 @@ class Job(object):
         # Get
         # Writing database
         if self.wrapper_type != "vertical" or enabled:
-            JobDataStructure(self.expid).write_submit_time(self.name, data_time[1], Status.VALUE_TO_KEY[self.status] if self.status in Status.VALUE_TO_KEY.keys() else "UNKNOWN", self.processors,
-                                                           self.wallclock, self.queue, self.date, self.member, self.section, self.chunk, self.platform_name, self.id, self.packed, self._wrapper_queue)
-
+            exp_history = ExperimentHistory(self.expid)
+            exp_history.write_submit_time(self.name, submit=data_time[1], status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors,
+                                          wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk,
+                                          platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name),
+                                          children=self.children_names_str)
 
     def write_start_time(self, enabled = False):
         """
@@ -1280,8 +1283,11 @@ class Job(object):
                 # noinspection PyTypeChecker
                 f.write(date2str(datetime.datetime.fromtimestamp(start_time), 'S'))
             # Writing database
-            JobDataStructure(self.expid).write_start_time(self.name, start_time, Status.VALUE_TO_KEY[self.status] if self.status in Status.VALUE_TO_KEY.keys() else "UNKNOWN", self.processors,
-                                                          self.wallclock, self._queue, self.date, self.member, self.section, self.chunk, self.platform_name, self.id, self.packed, self._wrapper_queue)
+            exp_history = ExperimentHistory(self.expid)
+            exp_history.write_start_time(self.name, start=start_time, status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors,
+                                         wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk,
+                                         platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name),
+                                         children=self.children_names_str)
         return True
 
     def write_end_time(self, completed,enabled = False):
@@ -1316,13 +1322,17 @@ class Job(object):
         out, err = self.local_logs
         path_out = os.path.join(self._tmp_path, 'LOG_' + str(self.expid), out)
         # Launch first as simple non-threaded function
-        JobDataStructure(self.expid).write_finish_time(self.name, finish_time, final_status, self.processors, self.wallclock, self._queue, self.date,
-                                                       self.member, self.section, self.chunk, self.platform_name, self.id, self.platform, self.packed, [job.id for job in self._parents], True, None, out, err, self._wrapper_queue)
-        # Launch second as threaded function
-        thread_write_finish = Thread(target=JobDataStructure(self.expid).write_finish_time, args=(self.name, finish_time, final_status, self.processors,
-                                     self.wallclock, self._queue, self.date, self.member, self.section, self.chunk, self.platform_name, self.id, self.platform, self.packed, [job.id for job in self._parents], False, path_out, out, err, self._wrapper_queue))
-        thread_write_finish.name = "JOB_data_{}".format(self.name)
-        thread_write_finish.start()
+        exp_history = ExperimentHistory(self.expid)
+        job_data_dc = exp_history.write_finish_time(self.name, finish=finish_time, status=final_status, ncpus=self.processors,
+                                                    wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk,
+                                                    platform=self.platform_name, job_id=self.id, out_file=out, err_file=err, wrapper_queue=self._wrapper_queue,
+                                                    wrapper_code=get_job_package_code(self.expid, self.name), children=self.children_names_str)
+
+        # Launch second as threaded function only for slurm
+        if job_data_dc and type(self.platform) is not str and self.platform.type == "slurm":
+            thread_write_finish = Thread(target=ExperimentHistory(self.expid).write_platform_data_after_finish, args=(job_data_dc, self.platform))
+            thread_write_finish.name = "JOB_data_{}".format(self.name)
+            thread_write_finish.start()
 
     def write_total_stat_by_retries_fix_newline(self):
         path = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS')
@@ -1348,22 +1358,27 @@ class Job(object):
         path_out = os.path.join(self._tmp_path, 'LOG_' + str(self.expid), out)
         # Launch first as simple non-threaded function
         if not first_retrial:
-            JobDataStructure(self.expid).write_submit_time(self.name, total_stats[0], Status.VALUE_TO_KEY[
-                self.status] if self.status in Status.VALUE_TO_KEY.keys() else "UNKNOWN", self.processors,
-                                                           self.wallclock, self.queue, self.date, self.member, self.section,
-                                                           self.chunk, self.platform_name, self.id, self.packed,
-                                                           self._wrapper_queue)
-            JobDataStructure(self.expid).write_start_time(self.name, total_stats[0], Status.VALUE_TO_KEY[
-                self.status] if self.status in Status.VALUE_TO_KEY.keys() else "UNKNOWN", self.processors,
-                                                          self.wallclock, self._queue, self.date, self.member,
-                                                          self.section, self.chunk, self.platform_name, self.id,
-                                                          self.packed, self._wrapper_queue)
-            JobDataStructure(self.expid).write_finish_time(self.name, total_stats[1], total_stats[2], self.processors,
-                                                           self.wallclock, self._queue, self.date,
-                                                           self.member, self.section, self.chunk, self.platform_name,
-                                                           self.id, self.platform, self.packed,
-                                                           [job.id for job in self._parents], True, None, out, err,
-                                                           self._wrapper_queue)
+            exp_history = ExperimentHistory(self.expid)
+            exp_history.write_submit_time(self.name, submit=total_stats[0], status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors,
+                                          wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk,
+                                          platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name),
+                                          children=self.children_names_str)
+            exp_history = ExperimentHistory(self.expid)
+            exp_history.write_start_time(self.name, start=total_stats[0], status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors,
+                                         wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk,
+                                         platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name),
+                                         children=self.children_names_str)
+
+            exp_history = ExperimentHistory(self.expid)
+            job_data_dc = exp_history.write_finish_time(self.name, finish=total_stats[1], status=total_stats[2], ncpus=self.processors,
+                                                        wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk,
+                                                        platform=self.platform_name, job_id=self.id, out_file=out, err_file=err, wrapper_queue=self._wrapper_queue,
+                                                        wrapper_code=get_job_package_code(self.expid, self.name), children=self.children_names_str)
+            # Launch second as threaded function only for slurm
+            if job_data_dc and type(self.platform) is not str and self.platform.type == "slurm":
+                thread_write_finish = Thread(target=ExperimentHistory(self.expid).write_platform_data_after_finish, args=(job_data_dc, self.platform))
+                thread_write_finish.name = "JOB_data_{}".format(self.name)
+                thread_write_finish.start()
 
     def check_started_after(self, date_limit):
         """
diff --git a/autosubmit/job/job_utils.py b/autosubmit/job/job_utils.py
index 8deae76ff..a83b76d0f 100644
--- a/autosubmit/job/job_utils.py
+++ b/autosubmit/job/job_utils.py
@@ -43,31 +43,31 @@ def transitive_reduction(graph):
             reduced_graph.add_edges_from((u, v) for v in u_edges)
     return reduced_graph
 
-    def get_job_package_code(self, job_name):
-        """
-        Finds the package code and retrieves it. None if no package.
+def get_job_package_code(expid, job_name):
+    """
+    Finds the package code of the given job and returns it; 0 if the job does not belong to a package.
-        :param BasicConfig: Basic configuration
-        :type BasicConfig: Configuration Object
-        :param expid: Experiment Id
-        :type expid: String
-        :param current_job_name: Name of job
-        :type current_jobs: string
-        :return: package code, None if not found
-        :rtype: int or None
-        """
-        try:
-            packages_wrapper = JobPackagePersistence(os.path.join(self.basic_conf.LOCAL_ROOT_DIR, self.expid, "pkl"),"job_packages_" + self.expid).load(wrapper=True)
-            packages_wrapper_plus = JobPackagePersistence(os.path.join(self.basic_conf.LOCAL_ROOT_DIR, self.expid, "pkl"),"job_packages_" + self.expid).load(wrapper=False)
-            if (packages_wrapper or packages_wrapper_plus):
-                packages = packages_wrapper if len(packages_wrapper) > len(packages_wrapper_plus) else packages_wrapper_plus
-                for exp, package_name, _job_name in packages:
-                    if job_name == _job_name:
-                        code = int(package_name.split("_")[2])
-                        return code
-        except:
-            pass
-        return 0
+    :param expid: Experiment Id
+    :type expid: String
+    :param job_name: Name of job
+    :type job_name: String
+    :return: package code, 0 if not found
+    :rtype: int
+    """
+    try:
+        # NOTE: assumes BasicConfig (autosubmit.config.basicConfig) is imported in this module.
+        packages_wrapper = JobPackagePersistence(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid, "pkl"), "job_packages_" + expid).load(wrapper=True)
+        packages_wrapper_plus = JobPackagePersistence(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid, "pkl"), "job_packages_" + expid).load(wrapper=False)
+        if (packages_wrapper or packages_wrapper_plus):
+            packages = packages_wrapper if len(packages_wrapper) > len(packages_wrapper_plus) else packages_wrapper_plus
+            for exp, package_name, _job_name in packages:
+                if job_name == _job_name:
+                    code = int(package_name.split("_")[2])
+                    return code
+    except Exception:
+        pass
+    return 0
 
 
 class Dependency(object):
     """
diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py
index 7fa387511..f0dc578a1 100644
--- a/autosubmit/platforms/paramiko_platform.py
+++ b/autosubmit/platforms/paramiko_platform.py
@@ -47,6 +47,7 @@ class ParamikoPlatform(Platform):
         self.channels = {}
         self.poller = select.poll()
         self.local_x11_display = xlib_connect.get_display(os.environ['DISPLAY'])
+
     @property
     def header(self):
         """
@@ -85,13 +86,13 @@ class ParamikoPlatform(Platform):
         """
         Test if the connection is still alive, reconnect if not.
         """
-        try:
-            self.reset()
+        try:
+            self.reset()
             try:
                 self.restore_connection()
             except:
-                pass
-            transport = self._ssh.get_transport()
+                pass
+            transport = self._ssh.get_transport()
             transport.send_ignore()
         except EOFError as e:
             raise AutosubmitError("[{0}] not alive. Host: {1}".format(
-- 
GitLab


From 5a4b8de54f1ba4d0f5507f4dcc62d5f6c3f5a00d Mon Sep 17 00:00:00 2001
From: Wilmer Uruchi Ticona
Date: Tue, 19 Oct 2021 15:42:56 +0200
Subject: [PATCH 8/8] autosubmit changes

---
 autosubmit/autosubmit.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py
index c5c87644a..92e3cfec5 100644
--- a/autosubmit/autosubmit.py
+++ b/autosubmit/autosubmit.py
@@ -1754,7 +1754,8 @@ class Autosubmit:
             # Safe spot to store changes
             exp_history = ExperimentHistory(expid, BasicConfig.JOBDATA_DIR)
             if len(job_changes_tracker) > 0:
-                exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list())
+                exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list())
+                job_changes_tracker = {}
 
             if Autosubmit.exit:
                 job_list.save()
-- 
GitLab
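
For reference, the weight-based split that GeneralizedWrapperDistributionStrategy applies can be sketched standalone as follows. This is a minimal sketch, not part of the patches above: the Job tuple, its field values, and the distribute_energy helper are illustrative assumptions; only the weighting rule (running time in seconds * number of cpus) and the rounding to two decimals come from the code added by these patches.

    from collections import namedtuple

    # Hypothetical stand-in for JobData; only the fields the split needs.
    Job = namedtuple("Job", ["job_name", "running_time", "ncpus"])

    def computational_weight(job):
        # Same rule as the patch: running time in seconds * number of cpus.
        return job.running_time * job.ncpus

    def distribute_energy(jobs, total_energy):
        """Split total_energy across jobs proportionally to their computational weight."""
        total = sum(computational_weight(job) for job in jobs)
        if total == 0:
            return {job.job_name: 0 for job in jobs}
        return {job.job_name: round(total_energy * computational_weight(job) / total, 2)
                for job in jobs}

    if __name__ == "__main__":
        jobs = [Job("SIM", 900, 48), Job("POST", 300, 8), Job("CLEAN", 60, 1)]
        print(distribute_energy(jobs, 2620.0))
        # -> {'SIM': 2478.84, 'POST': 137.71, 'CLEAN': 3.44}

Because each share is rounded independently, the shares only sum back to the wrapper total up to a small rounding error, which is why the tests above compare the summed energies against slurm_monitor.total_energy with a tolerance instead of asserting exact equality.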