diff --git a/.vscode/settings.json b/.vscode/settings.json index eaee7b14b66f973c397c133cf7538e7f2da38f64..19a51787bb08887b647555cdb68ea69912b4d461 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,4 @@ { - "python.pythonPath": "/Library/Frameworks/Python.framework/Versions/2.7/bin/python" + "python.pythonPath": "/Library/Frameworks/Python.framework/Versions/2.7/bin/python", + "python.linting.enabled": false } \ No newline at end of file diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 279ab3255db3fca6636e0affdfb30a68eb90cf1a..92e3cfec55d1244cd1400a9cab8e9e7338d08e8c 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -71,8 +71,8 @@ import portalocker from pkg_resources import require, resource_listdir, resource_exists, resource_string from collections import defaultdict from pyparsing import nestedExpr -from database.db_jobdata import ExperimentStatus, JobDataStructure - +from history.experiment_status import ExperimentStatus +from history.experiment_history import ExperimentHistory """ Main module for autosubmit. Only contains an interface class to all functionality implemented on autosubmit """ @@ -1367,23 +1367,16 @@ class Autosubmit: if not check_experiment_exists(start_after): return None # Historical Database: We use the historical database to retrieve the current progress data of the supplied expid (start_after) - # JobStructure object, check_only flag to avoid updating remote experiment - jobStructure = JobDataStructure(start_after, check_only=True) - # Check if database exists - if jobStructure.database_exists == False: - Log.critical( - "Experiment {0} does not have a valid database. Make sure that it is running under the latest version of Autosubmit.".format(start_after)) - return - # Check if database version is correct - if jobStructure.is_header_ready_db_version() == False: - Log.critical("Experiment {0} is running DB version {1} which is not supported by the completion trigger function. An updated DB version is needed.".format( - start_after, jobStructure.db_version)) + exp_history = ExperimentHistory(start_after, BasicConfig.JOBDATA_DIR) + if exp_history.is_header_ready() == False: + Log.critical("Experiment {0} is running a database version which is not supported by the completion trigger function. An updated DB version is needed.".format( + start_after)) return Log.info("Autosubmit will start monitoring experiment {0}. When the number of completed jobs plus suspended jobs becomes equal to the total number of jobs of experiment {0}, experiment {1} will start. Querying every 60 seconds. Status format Completed/Queuing/Running/Suspended/Failed.".format( start_after, expid)) while True: # Query current run - current_run = jobStructure.get_max_id_experiment_run() + current_run = exp_history.manager.get_experiment_run_dc_with_max_id() if current_run and current_run.finish > 0 and current_run.total > 0 and current_run.completed + current_run.suspended == current_run.total: break else: @@ -1548,15 +1541,14 @@ class Autosubmit: # Before starting main loop, setup historical database tables and main information Log.debug("Running job data structure") try: - # Historical Database: Can create a new run if there is a difference in the number of jobs or if the current state does not exist. 
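For reference, the resulting setup sequence against the new history package, as a minimal sketch: ExperimentHistory and ExperimentStatus are the facades introduced by this diff (the package-qualified import paths are assumed; autosubmit.py itself uses the short Python 2 form), job_list and as_conf are the usual Autosubmit objects, and the setup_history helper name is illustrative, not part of the patch.

    from autosubmit.history.experiment_history import ExperimentHistory
    from autosubmit.history.experiment_status import ExperimentStatus
    from autosubmit.config.basicConfig import BasicConfig

    def setup_history(expid, job_list, as_conf):
        # Create/migrate job_data_<expid>.db, then sync it with the current job list.
        exp_history = ExperimentHistory(expid, BasicConfig.JOBDATA_DIR)
        exp_history.initialize_database()
        exp_history.process_status_changes(job_list.get_job_list(),
                                           as_conf.get_chunk_size_unit(),
                                           as_conf.get_chunk_size(),
                                           current_config=as_conf.get_full_config_as_json())
        # Mark the experiment as RUNNING in as_times.db.
        ExperimentStatus(expid).set_as_running()
        return exp_history
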
- job_data_structure = JobDataStructure(expid) - job_data_structure.validate_current_run(job_list.get_job_list( - ), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) - - ExperimentStatus(expid).update_running_status() + # Historical Database: Can create a new run if there is a difference in the number of jobs or if the current run does not exist. + exp_history = ExperimentHistory(expid) + exp_history.initialize_database() + exp_history.process_status_changes(job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) + ExperimentStatus(expid).set_as_running() except Exception as e: raise AutosubmitCritical( - "Error while processing job_data_structure", 7067, str(e)) + "Error while processing historical database.", 7067, str(e)) if allowed_members: # Set allowed members after checks have been performed. This triggers the setter and main logic of the -rm feature. job_list.run_members = allowed_members @@ -1760,15 +1752,16 @@ class Autosubmit: if save: job_list.save() # Safe spot to store changes - job_data_structure.process_status_changes( - job_changes_tracker, job_list.get_job_list()) - job_changes_tracker = {} + exp_history = ExperimentHistory(expid, BasicConfig.JOBDATA_DIR) + if len(job_changes_tracker) > 0: + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) + job_changes_tracker = {} if Autosubmit.exit: job_list.save() time.sleep(safetysleeptime) - except AutosubmitError as e: # If an error is detected, restore all connections and job_list + except AutosubmitError as e: # If an error is detected, restore all connections and job_list Log.error("Trace: {0}", e.trace) Log.error("{1} [eCode={0}]", e.code, e.message) Log.info("Waiting 30 seconds before continue") @@ -1860,8 +1853,8 @@ class Autosubmit: raise Log.result("No more jobs to run.") # Updating job data header with current information when experiment ends - job_data_structure.validate_current_run( - job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), must_create=False, only_update=True) + exp_history = ExperimentHistory(expid, BasicConfig.JOBDATA_DIR) + exp_history.process_job_list_changes_to_experiment_totals(job_list.get_job_list()) # Wait for all remaining threads of I/O, close remaining connections timeout = 0 @@ -1886,7 +1879,7 @@ class Autosubmit: else: Log.result("Run successful") # Updating finish time for job data header - job_data_structure.update_finish_time() + exp_history.finish_current_experiment_run() except portalocker.AlreadyLocked: message = "We have detected that there is another Autosubmit instance using the experiment\n. 
Stop other Autosubmit instances that are using the experiment or delete autosubmit.lock file located on tmp folder" raise AutosubmitCritical(message, 7000) @@ -1902,9 +1895,9 @@ class Autosubmit: Log.info("Checking the connection to all platforms in use") issues = "" for platform in platform_to_test: - try: - platform.test_connection() - except BaseException as e : + try: + platform.test_connection() + except BaseException as e : issues += "\n[{1}] Connection Unsuccessful to host {0} trace".format( platform.host, platform.name) continue @@ -2003,6 +1996,7 @@ class Autosubmit: raise AutosubmitCritical("Invalid parameter substitution in {0} template".format( e.job_name), 7014, e.message) except Exception as e: + print(traceback.format_exc()) raise AutosubmitError("{0} submission failed".format( platform.name), 6015, str(e)) except WrongTemplateException as e: @@ -2099,6 +2093,7 @@ class Autosubmit: except AutosubmitCritical as e: raise except Exception as e: + print(traceback.format_exc()) raise AutosubmitError("{0} submission failed".format( platform.name), 6015, str(e)) try: @@ -2116,7 +2111,7 @@ class Autosubmit: # Saving only when it is a real multi job package packages_persistence.save( package.name, package.jobs, package._expid, inspect) - except Exception as e: + except Exception as e: raise AutosubmitError("{0} submission failed".format( platform.name), 6015, str(e)) return save @@ -3915,8 +3910,9 @@ class Autosubmit: # Setting up job historical database header. Must create a new run. # Historical Database: Setup new run - JobDataStructure(expid).validate_current_run(job_list.get_job_list( - ), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), must_create=True, current_config=as_conf.get_full_config_as_json()) + exp_history = ExperimentHistory(expid, BasicConfig.JOBDATA_DIR) + exp_history.initialize_database() + exp_history.create_new_experiment_run(as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), as_conf.get_full_config_as_json(), job_list.get_job_list()) if not noplot: if group_by: @@ -4171,7 +4167,6 @@ class Autosubmit: Log.debug('Status of jobs to change: {0}', filter_status) Log.debug('Sections to change: {0}', filter_section) wrongExpid = 0 - job_tracked_changes = {} as_conf = AutosubmitConfig( expid, BasicConfig, ConfigParserFactory()) as_conf.check_conf_files(True) @@ -4392,9 +4387,6 @@ class Autosubmit: ft = filter_chunks.split(",")[1:] if ft == 'Any': for job in job_list.get_job_list(): - # Tracking changes - job_tracked_changes[job.name] = ( - job.status, final_status) Autosubmit.change_status( final, final_status, job, save) else: @@ -4404,9 +4396,6 @@ class Autosubmit: if filter_chunks: jobs_filtered.append(job) else: - # Tracking changes - job_tracked_changes[job.name] = ( - job.status, final_status) Autosubmit.change_status( final, final_status, job, save) @@ -4563,9 +4552,6 @@ class Autosubmit: job.platform.name, job.name), 6000) continue if job.status != final_status: - # Tracking changes - job_tracked_changes[job.name] = ( - job.status, final_status) # Only real changes performed_changes[job.name] = str( Status.VALUE_TO_KEY[job.status]) + " -> " + str(final) @@ -4594,9 +4580,6 @@ class Autosubmit: if fc == 'Any': for job in jobs_filtered: - # Tracking changes - job_tracked_changes[job.name] = ( - job.status, final_status) Autosubmit.change_status( final, final_status, job, save) else: @@ -4615,16 +4598,10 @@ class Autosubmit: for chunk_json in member_json['cs']: chunk = int(chunk_json) for job in filter(lambda j: j.chunk == chunk and j.synchronize is 
not None, jobs_date): - # Tracking changes - job_tracked_changes[job.name] = ( - job.status, final_status) Autosubmit.change_status( final, final_status, job, save) for job in filter(lambda j: j.chunk == chunk, jobs_member): - # Tracking changes - job_tracked_changes[job.name] = ( - job.status, final_status) Autosubmit.change_status( final, final_status, job, save) @@ -4634,18 +4611,12 @@ class Autosubmit: Log.debug("Filtering jobs with status {0}", filter_status) if status_list == 'Any': for job in job_list.get_job_list(): - # Tracking changes - job_tracked_changes[job.name] = ( - job.status, final_status) Autosubmit.change_status( final, final_status, job, save) else: for status in status_list: fs = Autosubmit._get_status(status) for job in filter(lambda j: j.status == fs, job_list.get_job_list()): - # Tracking changes - job_tracked_changes[job.name] = ( - job.status, final_status) Autosubmit.change_status( final, final_status, job, save) @@ -4668,22 +4639,16 @@ class Autosubmit: else: for job in job_list.get_job_list(): if job.name in jobs: - # Tracking changes - job_tracked_changes[job.name] = ( - job.status, final_status) Autosubmit.change_status( final, final_status, job, save) job_list.update_list(as_conf, False, True) if save and wrongExpid == 0: - job_list.save() - # Historical Database: Setup new run if greater or equal than 90% of completed date-member jobs are going to be changed. - # Or if the total number of jobs in the job_list is different than the total number of jobs in the current experiment run register in the database - job_data_structure = JobDataStructure(expid) - job_data_structure.process_status_changes( - job_tracked_changes, job_list.get_job_list(), as_conf.get_chunk_size_unit(), as_conf.get_chunk_size(), check_run=True, current_config=as_conf.get_full_config_as_json(), is_setstatus=True) - + job_list.save() + exp_history = ExperimentHistory(expid, BasicConfig.JOBDATA_DIR) + exp_history.initialize_database() + exp_history.process_status_changes(job_list.get_job_list(), chunk_unit=as_conf.get_chunk_size_unit(), chunk_size=as_conf.get_chunk_size(), current_config=as_conf.get_full_config_as_json()) else: Log.printlog( "Changes NOT saved to the JobList!!!!: use -s option to save", 3000) diff --git a/autosubmit/database/db_jobdata.py b/autosubmit/database/db_jobdata.py index 9e17eb319b8ee7a6ceb6cab1da1a3d26cf858090..a7e5f89a441f3e6fc6b57e5bcdaf1a9cc55594bd 100644 --- a/autosubmit/database/db_jobdata.py +++ b/autosubmit/database/db_jobdata.py @@ -1389,9 +1389,6 @@ class JobDataStructure(MainDataBase): return None # warning_messages.append( # "Critical | This version of Autosubmit does not support the database that provides the energy information.") - # Include only those that exist in the pkl and have the same status as in the pkl - # current_job_data = [job for job in current_job_data_last if job.job_name in allJobsDict.keys( - # ) and allJobsDict[job.job_name] == job.status] if current_job_data_last else None # Start processing if current_job_data: # Dropping parents key @@ -2042,7 +2039,6 @@ class JobDataStructure(MainDataBase): def _get_job_data(self, job_name): """ Returns rows belonging to a job_name - """ try: if self.conn: diff --git a/autosubmit/history/__init__.py b/autosubmit/history/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/autosubmit/history/data_classes/__init__.py b/autosubmit/history/data_classes/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/autosubmit/history/data_classes/experiment_run.py b/autosubmit/history/data_classes/experiment_run.py new file mode 100644 index 0000000000000000000000000000000000000000..9aaa107c38d26e49649f4f86c8c75b4076ab5c58 --- /dev/null +++ b/autosubmit/history/data_classes/experiment_run.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . + +from autosubmit.history.utils import get_current_datetime_if_none + +class ExperimentRun(): + """ + Class that represents an experiment run + """ + def __init__(self, run_id, created=None, modified=None, start=0, finish=0, chunk_unit="NA", chunk_size=0, completed=0, total=0, failed=0, queuing=0, running=0, submitted=0, suspended=0, metadata=""): + self.run_id = run_id + self.created = get_current_datetime_if_none(created) + self.modified = get_current_datetime_if_none(modified) # Added on DB 16 + self.start = start + self.finish = finish + self.chunk_unit = chunk_unit + self.chunk_size = chunk_size + self.submitted = submitted + self.queuing = queuing + self.running = running + self.completed = completed + self.failed = failed + self.total = total + self.suspended = suspended + self.metadata = metadata + + @classmethod + def from_model(cls, row): + """ Build ExperimentRun from ExperimentRunRow """ + experiment_run = cls(0) + experiment_run.run_id = row.run_id + experiment_run.created = get_current_datetime_if_none(row.created) + experiment_run.modified = get_current_datetime_if_none(row.modified) + experiment_run.start = row.start + experiment_run.finish = row.finish + experiment_run.chunk_unit = row.chunk_unit + experiment_run.chunk_size = row.chunk_size + experiment_run.completed = row.completed + experiment_run.total = row.total + experiment_run.failed = row.failed + experiment_run.queuing = row.queuing + experiment_run.running = row.running + experiment_run.submitted = row.submitted + experiment_run.suspended = row.suspended + experiment_run.metadata = row.metadata + return experiment_run + \ No newline at end of file diff --git a/autosubmit/history/data_classes/job_data.py b/autosubmit/history/data_classes/job_data.py new file mode 100644 index 0000000000000000000000000000000000000000..b5249b797873aa61bd351a884d35dbcdb04d33a5 --- /dev/null +++ b/autosubmit/history/data_classes/job_data.py @@ -0,0 +1,315 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
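The ExperimentRun data class above mirrors one row of the experiment_run table. A small illustrative round-trip, assuming the ExperimentRunRow namedtuple defined later in this diff (database_managers/database_models.py); all field values are made up, and get_current_datetime_if_none is assumed to pass non-None values through unchanged.

    from autosubmit.history.data_classes.experiment_run import ExperimentRun
    from autosubmit.history.database_managers.database_models import ExperimentRunRow

    row = ExperimentRunRow(run_id=18, created="2021-07-01-10:00:00", modified="2021-07-01-10:05:00",
                           start=1625126400, finish=0, chunk_unit="month", chunk_size=1,
                           completed=10, total=20, failed=0, queuing=2, running=3,
                           submitted=1, suspended=0, metadata="{}")
    run = ExperimentRun.from_model(row)
    print("{0} {1} {2}/{3}".format(run.run_id, run.chunk_unit, run.completed, run.total))  # 18 month 10/20
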
+ +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . + +import collections +import time +import autosubmit.history.utils as HUtils +import autosubmit.history.database_managers.database_models as Models +from datetime import datetime, timedelta +from json import dumps, loads + +class JobData(object): + """ + Robust representation of a row in the job_data table of the experiment history database. + """ + + def __init__(self, _id, counter=1, job_name="None", created=None, modified=None, submit=0, start=0, finish=0, + status="UNKNOWN", rowtype=0, ncpus=0, wallclock="00:00", qos="debug", energy=0, date="", section="", + member="", chunk=0, last=1, platform="NA", job_id=0, extra_data="", nnodes=0, run_id=None, MaxRSS=0.0, + AveRSS=0.0, out="", err="", rowstatus=Models.RowStatus.INITIAL, children="", platform_output=""): + """ + """ + self._id = _id + self.counter = counter + self.job_name = job_name + self.created = HUtils.get_current_datetime_if_none(created) + self.modified = HUtils.get_current_datetime_if_none(modified) + self._submit = int(submit) + self._start = int(start) + self._finish = int(finish) + self.status = status + self.rowtype = rowtype + self.ncpus = ncpus + self.wallclock = wallclock + self.qos = qos if qos else "debug" + self._energy = round(energy, 2) if energy else 0 + self.date = date if date else "" + self.section = section if section else "" + self.member = member if member else "" + self.chunk = chunk if chunk else 0 + self.last = last + self._platform = platform if platform and len( + platform) > 0 else "NA" + self.job_id = job_id if job_id else 0 + try: + self.extra_data_parsed = loads(extra_data) + except Exception as exp: + self.extra_data_parsed = {} # Fail fast + self.extra_data = extra_data + self.nnodes = nnodes + self.run_id = run_id + self.require_update = False + # DB VERSION 15 attributes + self.MaxRSS = MaxRSS + self.AveRSS = AveRSS + self.out = out + self.err = err + self.rowstatus = rowstatus + self.children = children # DB 17 + self.platform_output = platform_output # DB 17 + + @classmethod + def from_model(cls, row): + """ Build JobData from JobDataRow. """ + job_data = cls(row.id, + row.counter, + row.job_name, + row.created, + row.modified, + row.submit, + row.start, + row.finish, + row.status, + row.rowtype, + row.ncpus, + row.wallclock, + row.qos, + row.energy, + row.date, + row.section, + row.member, + row.chunk, + row.last, + row.platform, + row.job_id, + row.extra_data, + row.nnodes, + row.run_id, + row.MaxRSS, + row.AveRSS, + row.out, + row.err, + row.rowstatus, + row.children, + row.platform_output) + return job_data + + @property + def children_list(self): + children_list = self.children.split(",") if self.children else [] + result = [str(job_name).strip() for job_name in children_list] + return result + + @property + def computational_weight(self): + return round(float(self.running_time * self.ncpus),4) + + @property + def submit(self): + """ + Returns the submit time timestamp as an integer. + """ + return int(self._submit) + + @property + def start(self): + """ + Returns the start time timestamp as an integer. + """ + return int(self._start) + + @property + def finish(self): + """ + Returns the finish time timestamp as an integer. 
+ """ + return int(self._finish) + + @property + def platform(self): + """ + Returns the name of the platform, "NA" if no platform is set. + """ + return self._platform + + @property + def energy(self): + """ + Returns the energy spent value (JOULES) as an integer. + """ + return self._energy + + @property + def wrapper_code(self): + """ + Another name for rowtype + """ + if self.rowtype > 2: + return self.rowtype + else: + return None + + @submit.setter + def submit(self, submit): + self._submit = int(submit) + + @start.setter + def start(self, start): + self._start = int(start) + + @finish.setter + def finish(self, finish): + self._finish = int(finish) + + @platform.setter + def platform(self, platform): + self._platform = platform if platform and len(platform) > 0 else "NA" + + @energy.setter + def energy(self, energy): + """ + Set the energy value. If it is different than the current energy value, a update flag will be activated. + """ + if energy > 0: + if (energy != self._energy): + # print("Updating energy to {0} from {1}.".format( + # energy, self._energy)) + self.require_update = True + self._energy = round(energy, 2) + + @property + def delta_queue_time(self): + """ + Returns queuing time as a timedelta object. + """ + return str(timedelta(seconds=self.queuing_time())) + + @property + def delta_running_time(self): + """ + Returns running time as a timedelta object. + """ + return str(timedelta(seconds=self.running_time())) + + @property + def submit_datetime(self): + """ + Return the submit time as a datetime object, None if submit time equal 0. + """ + if self.submit > 0: + return datetime.fromtimestamp(self.submit) + return None + + @property + def start_datetime(self): + """ + Return the start time as a datetime object, None if start time equal 0. + """ + if self.start > 0: + return datetime.fromtimestamp(self.start) + return None + + @property + def finish_datetime(self): + """ + Return the finish time as a datetime object, None if start time equal 0. + """ + if self.finish > 0: + return datetime.fromtimestamp(self.finish) + return None + + @property + def submit_datetime_str(self): + """ + Returns the submit datetime as a string with format %Y-%m-%d-%H:%M:%S + """ + o_datetime = self.submit_datetime() + if o_datetime: + return o_datetime.strftime(HUtils.DATETIME_FORMAT) + else: + return None + @property + def start_datetime_str(self): + """ + Returns the start datetime as a string with format %Y-%m-%d-%H:%M:%S + """ + o_datetime = self.start_datetime() + if o_datetime: + return o_datetime.strftime(HUtils.DATETIME_FORMAT) + else: + return None + @property + def finish_datetime_str(self): + """ + Returns the finish datetime as a string with format %Y-%m-%d-%H:%M:%S + """ + o_datetime = self.finish_datetime() + if o_datetime: + return o_datetime.strftime(HUtils.DATETIME_FORMAT) + else: + return None + + @property + def running_time(self): + """ + Calculates and returns the running time of the job, in seconds. + + :return: Running time in seconds. + :rtype: int + """ + if self.status in ["RUNNING", "COMPLETED", "FAILED"]: + return HUtils.calculate_run_time_in_seconds(self.start, self.finish) + return 0 + + @property + def queuing_time(self): + """ + Calculates and returns the queuing time of the job, in seconds. + + :return: Queueing time in seconds. 
+ :rtype: int + """ + if self.status in ["SUBMITTED", "QUEUING", "RUNNING", "COMPLETED", "HELD", "PREPARED", "FAILED", "SKIPPED"]: + return HUtils.calculate_queue_time_in_seconds(self.submit, self.start) + return 0 + + def get_hdata(self): + """ + Get the job data as an ordered dict into a JSON object. + :return: Job data as an ordered dict into a JSON object. + :rtype: JSON object. + """ + hdata = collections.OrderedDict() + hdata["name"] = self.job_name + hdata["date"] = self.date + hdata["section"] = self.section + hdata["member"] = self.member + hdata["chunk"] = self.chunk + hdata["submit"] = self.submit_datetime_str() + hdata["start"] = self.start_datetime_str() + hdata["finish"] = self.finish_datetime_str() + hdata["queue_time"] = self.delta_queue_time() + hdata["run_time"] = self.delta_running_time() + hdata["wallclock"] = self.wallclock + hdata["ncpus"] = self.ncpus + hdata["nnodes"] = self.nnodes + hdata["energy"] = self.energy + hdata["platform"] = self.platform + hdata["MaxRSS"] = self.MaxRSS + hdata["AveRSS"] = self.AveRSS + return dumps(hdata) \ No newline at end of file diff --git a/autosubmit/history/database_managers/__init__.py b/autosubmit/history/database_managers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/autosubmit/history/database_managers/database_manager.py b/autosubmit/history/database_managers/database_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..a4f1516e62e0bd86a136322a47eaa3c153e462bc --- /dev/null +++ b/autosubmit/history/database_managers/database_manager.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . + +import sqlite3 +import os +import autosubmit.history.utils as HUtils +import autosubmit.history.database_managers.database_models as Models +from abc import ABCMeta, abstractmethod + +DEFAULT_JOBDATA_DIR = os.path.join('/esarchive', 'autosubmit', 'as_metadata', 'data') +DEFAULT_LOCAL_ROOT_DIR = os.path.join('/esarchive', 'autosubmit') +class DatabaseManager(): + """ Simple database manager. Needs expid. """ + __metaclass__ = ABCMeta + AS_TIMES_DB_NAME = "as_times.db" # default AS_TIMES location + ECEARTH_DB_NAME = "ecearth.db" # default EC_EARTH_DB_NAME location + def __init__(self, expid, jobdata_dir_path=DEFAULT_JOBDATA_DIR, local_root_dir_path=DEFAULT_LOCAL_ROOT_DIR): + self.expid = expid + self.JOBDATA_DIR = jobdata_dir_path + self.LOCAL_ROOT_DIR = local_root_dir_path + + def get_connection(self, path): + # type : (str) -> Sqlite3Connection + """ + Create a database connection to the SQLite database specified by path. 
+ :param path: database file name + :return: Connection object or None + """ + if not os.path.exists(path): + self._create_database_file(path) + return sqlite3.connect(path) + + def _create_database_file(self, path): + # type : (str) -> None + """ creates a database files with full permissions """ + os.umask(0) + os.open(path, os.O_WRONLY | os.O_CREAT, 0o776) + + def execute_statement_on_dbfile(self, path, statement): + # type : (str, str) -> None + """ Executes a statement on a database file specified by path. """ + conn = self.get_connection(path) + cursor = conn.cursor() + cursor.execute(statement) + conn.commit() + conn.close() + + def execute_statement_with_arguments_on_dbfile(self, path, statement, arguments): + # type : (str, str, Tuple) -> None + """ Executes an statement with arguments on a database file specified by path. """ + conn = self.get_connection(path) + cursor = conn.cursor() + cursor.execute(statement, arguments) + conn.commit() + conn.close() + + def execute_many_statement_with_arguments_on_dbfile(self, path, statement, arguments_list): + # type : (str, str, List[Tuple]) -> None + """ Executes many statements from a list of arguments specified by a path. """ + conn = self.get_connection(path) + cursor = conn.cursor() + cursor.executemany(statement, arguments_list) + conn.commit() + conn.close() + + def execute_many_statements_on_dbfile(self, path, statements): + # type : (str, List[str]) -> None + """ + Updates the table schema using a **small** list of statements. No Exception raised. + Should be used to execute a list of schema updates that might have been already applied. + """ + for statement in statements: + try: + self.execute_statement_on_dbfile(path, statement) + except Exception as exp: + pass + + def get_from_statement(self, path, statement): + # type : (str, str) -> List[Tuple] + """ Get the rows from a statement with no arguments """ + conn = self.get_connection(path) + conn.text_factory = str + cursor = conn.cursor() + cursor.execute(statement) + statement_rows = cursor.fetchall() + conn.close() + return statement_rows + + def get_from_statement_with_arguments(self, path, statement, arguments): + # type : (str, str, Tuple) -> List[Tuple] + """ Get the rows from a statement with arguments """ + conn = self.get_connection(path) + conn.text_factory = str + cursor = conn.cursor() + cursor.execute(statement, arguments) + statement_rows = cursor.fetchall() + conn.close() + return statement_rows + + def insert_statement(self, path, statement): + # type : (str, str) -> int + """ Insert statement into path """ + conn = self.get_connection(path) + conn.text_factory = str + cursor = conn.cursor() + cursor.execute(statement) + lastrow_id = cursor.lastrowid + conn.commit() + conn.close() + return lastrow_id + + def insert_statement_with_arguments(self, path, statement, arguments): + # type : (str, str, Tuple) -> int + """ Insert statement with arguments into path """ + conn = self.get_connection(path) + conn.text_factory = str + cursor = conn.cursor() + cursor.execute(statement, arguments) + lastrow_id = cursor.lastrowid + conn.commit() + conn.close() + return lastrow_id + + def get_built_select_statement(self, table_name, conditions=None): + # type : (str, namedtuple, str) -> str + """ Build and return a SELECT statement with the same fields as the model. Requires that the table is associated with a model (namedtuple). 
""" + model = Models.table_name_to_model[table_name] + if conditions: + return "SELECT {0} FROM {1} WHERE {2}".format(HUtils.get_fields_as_comma_str(model), table_name, conditions) + else: + return "SELECT {0} FROM {1}".format(HUtils.get_fields_as_comma_str(model), table_name) + + diff --git a/autosubmit/history/database_managers/database_models.py b/autosubmit/history/database_managers/database_models.py new file mode 100644 index 0000000000000000000000000000000000000000..3d205f965811a6cc7665f459529ce50c514f6bf6 --- /dev/null +++ b/autosubmit/history/database_managers/database_models.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . + +import collections + +JobDataRow = collections.namedtuple('JobDataRow', ['id', 'counter', 'job_name', 'created', 'modified', 'submit', 'start', 'finish', + 'status', 'rowtype', 'ncpus', 'wallclock', 'qos', 'energy', 'date', 'section', 'member', + 'chunk', 'last', 'platform', 'job_id', 'extra_data', 'nnodes', 'run_id', 'MaxRSS', 'AveRSS', + 'out', 'err', 'rowstatus', 'children', 'platform_output']) + +ExperimentRunRow = collections.namedtuple('ExperimentRunRow', [ + 'run_id', 'created', 'modified', 'start', 'finish', 'chunk_unit', 'chunk_size', 'completed', 'total', 'failed', 'queuing', 'running', 'submitted', 'suspended', 'metadata']) + +ExperimentStatusRow = collections.namedtuple( + 'ExperimentStatusRow', ['exp_id', 'name', 'status', 'seconds_diff', 'modified']) + +ExperimentRow = collections.namedtuple('ExperimentRow', ["id", "name", "autosubmit_version", "description"]) + +PragmaVersion = collections.namedtuple('PragmaVersion', ['version']) +MaxCounterRow = collections.namedtuple('MaxCounter', ['maxcounter']) + +class RunningStatus: + RUNNING = "RUNNING" + NOT_RUNNING = "NOT RUNNING" + +class RowType: + NORMAL = 2 + #PACKED = 2 + +class RowStatus: + INITIAL = 0 + COMPLETED = 1 + PROCESSED = 2 + FAULTY = 3 + CHANGED = 4 + PENDING_PROCESS = 5 + +table_name_to_model = { + "experiment" : ExperimentRow, + "experiment_status" : ExperimentStatusRow, + "job_data" : JobDataRow, + "experiment_run" : ExperimentRunRow, + "pragma_version" : PragmaVersion +} + diff --git a/autosubmit/history/database_managers/experiment_history_db_manager.py b/autosubmit/history/database_managers/experiment_history_db_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..e2b5c3bf097ad704dfb634849ec972fcf7e13ef2 --- /dev/null +++ b/autosubmit/history/database_managers/experiment_history_db_manager.py @@ -0,0 +1,390 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. 
+ +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . +import sqlite3 +import os +import traceback +import textwrap +import autosubmit.history.utils as HUtils +import database_models as Models +from autosubmit.history.data_classes.job_data import JobData +from autosubmit.history.data_classes.experiment_run import ExperimentRun +from abc import ABCMeta, abstractmethod +from database_manager import DatabaseManager, DEFAULT_JOBDATA_DIR +from datetime import datetime + +CURRENT_DB_VERSION = 17 +DB_EXPERIMENT_HEADER_SCHEMA_CHANGES = 14 +DB_VERSION_SCHEMA_CHANGES = 12 +DEFAULT_DB_VERSION = 10 +DEFAULT_MAX_COUNTER = 0 + +class ExperimentHistoryDbManager(DatabaseManager): + """ Manages actions directly on the database. + """ + def __init__(self, expid, jobdata_dir_path=DEFAULT_JOBDATA_DIR): + """ Requires expid and jobdata_dir_path. """ + super(ExperimentHistoryDbManager, self).__init__(expid, jobdata_dir_path=jobdata_dir_path) + self._set_schema_changes() + self._set_table_queries() + self.historicaldb_file_path = os.path.join(self.JOBDATA_DIR, "job_data_{0}.db".format(self.expid)) # type : str + + def initialize(self): + if self.my_database_exists(): + if not self.is_current_version(): + self.update_historical_database() + else: + self.create_historical_database() + + def my_database_exists(self): + return os.path.exists(self.historicaldb_file_path) + + def is_header_ready_db_version(self): + if self.my_database_exists(): + return self._get_pragma_version() >= DB_EXPERIMENT_HEADER_SCHEMA_CHANGES + return False + + def is_current_version(self): + if self.my_database_exists(): + return self._get_pragma_version() == CURRENT_DB_VERSION + return False + + def _set_table_queries(self): + """ Sets basic table queries. 
""" + self.create_table_header_query = textwrap.dedent( + '''CREATE TABLE + IF NOT EXISTS experiment_run ( + run_id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, + created TEXT NOT NULL, + modified TEXT NOT NULL, + start INTEGER NOT NULL, + finish INTEGER, + chunk_unit TEXT NOT NULL, + chunk_size INTEGER NOT NULL, + completed INTEGER NOT NULL, + total INTEGER NOT NULL, + failed INTEGER NOT NULL, + queuing INTEGER NOT NULL, + running INTEGER NOT NULL, + submitted INTEGER NOT NULL, + suspended INTEGER NOT NULL DEFAULT 0, + metadata TEXT + ); + ''') + self.create_table_query = textwrap.dedent( + '''CREATE TABLE + IF NOT EXISTS job_data ( + id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, + counter INTEGER NOT NULL, + job_name TEXT NOT NULL, + created TEXT NOT NULL, + modified TEXT NOT NULL, + submit INTEGER NOT NULL, + start INTEGER NOT NULL, + finish INTEGER NOT NULL, + status TEXT NOT NULL, + rowtype INTEGER NOT NULL, + ncpus INTEGER NOT NULL, + wallclock TEXT NOT NULL, + qos TEXT NOT NULL, + energy INTEGER NOT NULL, + date TEXT NOT NULL, + section TEXT NOT NULL, + member TEXT NOT NULL, + chunk INTEGER NOT NULL, + last INTEGER NOT NULL, + platform TEXT NOT NULL, + job_id INTEGER NOT NULL, + extra_data TEXT NOT NULL, + nnodes INTEGER NOT NULL DEFAULT 0, + run_id INTEGER, + MaxRSS REAL NOT NULL DEFAULT 0.0, + AveRSS REAL NOT NULL DEFAULT 0.0, + out TEXT NOT NULL, + err TEXT NOT NULL, + rowstatus INTEGER NOT NULL DEFAULT 0, + children TEXT, + platform_output TEXT, + UNIQUE(counter,job_name) + ); + ''') + self.create_index_query = textwrap.dedent(''' + CREATE INDEX IF NOT EXISTS ID_JOB_NAME ON job_data(job_name); + ''') + + def _set_schema_changes(self): + # type : () -> None + """ Creates the list of schema changes""" + self.version_schema_changes = [ + "ALTER TABLE job_data ADD COLUMN nnodes INTEGER NOT NULL DEFAULT 0", + "ALTER TABLE job_data ADD COLUMN run_id INTEGER" + ] + # Version 15 + self.version_schema_changes.extend([ + "ALTER TABLE job_data ADD COLUMN MaxRSS REAL NOT NULL DEFAULT 0.0", + "ALTER TABLE job_data ADD COLUMN AveRSS REAL NOT NULL DEFAULT 0.0", + "ALTER TABLE job_data ADD COLUMN out TEXT NOT NULL DEFAULT ''", + "ALTER TABLE job_data ADD COLUMN err TEXT NOT NULL DEFAULT ''", + "ALTER TABLE job_data ADD COLUMN rowstatus INTEGER NOT NULL DEFAULT 0", + "ALTER TABLE experiment_run ADD COLUMN suspended INTEGER NOT NULL DEFAULT 0", + "ALTER TABLE experiment_run ADD COLUMN metadata TEXT" + ]) + # Version 16 + self.version_schema_changes.extend([ + "ALTER TABLE experiment_run ADD COLUMN modified TEXT" + ]) + # Version 17 + self.version_schema_changes.extend([ + "ALTER TABLE job_data ADD COLUMN children TEXT", + "ALTER TABLE job_data ADD COLUMN platform_output TEXT" + ]) + + def create_historical_database(self): + """ Creates the historical database with the latest changes. """ + self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_table_header_query) + self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_table_query) + self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_index_query) + self._set_historical_pragma_version(CURRENT_DB_VERSION) + + def update_historical_database(self): + """ Updates the historical database with the latest changes IF necessary. 
""" + self.execute_many_statements_on_dbfile(self.historicaldb_file_path, self.version_schema_changes) + self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_index_query) + self.execute_statement_on_dbfile(self.historicaldb_file_path, self.create_table_header_query) + self._set_historical_pragma_version(CURRENT_DB_VERSION) + + def get_experiment_run_dc_with_max_id(self): + """ Get Current (latest) ExperimentRun data class. """ + return ExperimentRun.from_model(self._get_experiment_run_with_max_id()) + + def register_experiment_run_dc(self, experiment_run_dc): + self._insert_experiment_run(experiment_run_dc) + return ExperimentRun.from_model(self._get_experiment_run_with_max_id()) + + def update_experiment_run_dc_by_id(self, experiment_run_dc): + """ Requires ExperimentRun data class. """ + self._update_experiment_run(experiment_run_dc) + return ExperimentRun.from_model(self._get_experiment_run_with_max_id()) + + def _get_experiment_run_with_max_id(self): + """ Get Models.ExperimentRunRow for the maximum id run. """ + statement = self.get_built_select_statement("experiment_run", "run_id > 0 ORDER BY run_id DESC LIMIT 0, 1") + max_experiment_run = self.get_from_statement(self.historicaldb_file_path, statement) + if len(max_experiment_run) == 0: + raise Exception("No Experiment Runs registered.") + return Models.ExperimentRunRow(*max_experiment_run[0]) + + def is_there_a_last_experiment_run(self): + statement = self.get_built_select_statement("experiment_run", "run_id > 0 ORDER BY run_id DESC LIMIT 0, 1") + max_experiment_run = self.get_from_statement(self.historicaldb_file_path, statement) + if len(max_experiment_run) > 0: + return True + return False + + def get_job_data_all(self): + """ Gets all content from job_data as list of Models.JobDataRow from database. """ + statement = self.get_built_select_statement("job_data") + job_data_rows = self.get_from_statement(self.historicaldb_file_path, statement) + return [Models.JobDataRow(*row) for row in job_data_rows] + + def register_submitted_job_data_dc(self, job_data_dc): + """ Sets previous register to last=0 and inserts the new job_data_dc data class.""" + self._set_current_job_data_rows_last_to_zero_by_job_name(job_data_dc.job_name) + self._insert_job_data(job_data_dc) + return self.get_job_data_dc_unique_latest_by_job_name(job_data_dc.job_name) + + def _set_current_job_data_rows_last_to_zero_by_job_name(self, job_name): + """ Sets the column last = 0 for all job_rows by job_name and last = 1. """ + job_data_row_last = self._get_job_data_last_by_name(job_name) + job_data_dc_list = [JobData.from_model(row) for row in job_data_row_last] + for job_data_dc in job_data_dc_list: + job_data_dc.last = 0 + self._update_job_data_by_id(job_data_dc) + + def update_job_data_dc_by_id(self, job_data_dc): + """ Update JobData data class. Returns latest last=1 row from job_data by job_name. """ + self._update_job_data_by_id(job_data_dc) + return self.get_job_data_dc_unique_latest_by_job_name(job_data_dc.job_name) + + def update_list_job_data_dc_by_each_id(self, job_data_dcs): + """ Return length of updated list. """ + for job_data_dc in job_data_dcs: + self._update_job_data_by_id(job_data_dc) + return len(job_data_dcs) + + def get_job_data_dc_unique_latest_by_job_name(self, job_name): + """ Returns JobData data class for the latest job_data_row with last=1 by job_name. 
""" + job_data_row_last = self._get_job_data_last_by_name(job_name) + if len(job_data_row_last) > 0: + return JobData.from_model(job_data_row_last[0]) + return None + + def _get_job_data_last_by_name(self, job_name): + """ Get List of Models.JobDataRow for job_name and last=1 """ + statement = self.get_built_select_statement("job_data", "last=1 and job_name=? ORDER BY counter DESC") + arguments = (job_name,) + job_data_rows_last = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) + return [Models.JobDataRow(*row) for row in job_data_rows_last] + + def get_job_data_dcs_last_by_run_id(self, run_id): + job_data_rows = self._get_job_data_last_by_run_id(run_id) + return [JobData.from_model(row) for row in job_data_rows] + + def _get_job_data_last_by_run_id(self, run_id): + """ Get List of Models.JobDataRow for last=1 and run_id """ + statement = self.get_built_select_statement("job_data", "run_id=? and last=1 and rowtype >= 2 ORDER BY id") + arguments = (run_id,) + job_data_rows = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) + return [Models.JobDataRow(*row) for row in job_data_rows] + + def get_job_data_dcs_last_by_wrapper_code(self, wrapper_code): + if wrapper_code: + return [JobData.from_model(row) for row in self._get_job_data_last_by_wrapper_code(wrapper_code)] + else: + return [] + + def _get_job_data_last_by_wrapper_code(self, wrapper_code): + """ Get List of Models.JobDataRow for last=1 and rowtype=wrapper_code """ + statement = self.get_built_select_statement("job_data", "rowtype = ? and last=1 ORDER BY id") + arguments = (wrapper_code,) + job_data_rows = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) + return [Models.JobDataRow(*row) for row in job_data_rows] + + def get_all_last_job_data_dcs(self): + """ Gets JobData data classes in job_data for last=1. """ + job_data_rows = self._get_all_last_job_data_rows() + return [JobData.from_model(row) for row in job_data_rows] + + def _get_all_last_job_data_rows(self): + """ Get List of Models.JobDataRow for last=1. """ + statement = self.get_built_select_statement("job_data", "last=1") + job_data_rows = self.get_from_statement(self.historicaldb_file_path, statement) + return [Models.JobDataRow(*row) for row in job_data_rows] + + def _insert_job_data(self, job_data): + # type : (JobData) -> int + """ Insert data class JobData into job_data table. """ + statement = ''' INSERT INTO job_data(counter, job_name, created, modified, + submit, start, finish, status, rowtype, ncpus, + wallclock, qos, energy, date, section, member, chunk, last, + platform, job_id, extra_data, nnodes, run_id, MaxRSS, AveRSS, + out, err, rowstatus, children, platform_output) + VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) 
''' + arguments = (job_data.counter, job_data.job_name, HUtils.get_current_datetime(), HUtils.get_current_datetime(), + job_data.submit, job_data.start, job_data.finish, job_data.status, job_data.rowtype, job_data.ncpus, + job_data.wallclock, job_data.qos, job_data.energy, job_data.date, job_data.section, job_data.member, job_data.chunk, job_data.last, + job_data.platform, job_data.job_id, job_data.extra_data, job_data.nnodes, job_data.run_id, job_data.MaxRSS, job_data.AveRSS, + job_data.out, job_data.err, job_data.rowstatus, job_data.children, job_data.platform_output) + return self.insert_statement_with_arguments(self.historicaldb_file_path, statement, arguments) + + def _insert_experiment_run(self, experiment_run): + """ Insert data class ExperimentRun into database """ + statement = ''' INSERT INTO experiment_run(created, modified, start, finish, + chunk_unit, chunk_size, completed, total, + failed, queuing, running, + submitted, suspended, metadata) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?) ''' + arguments = (HUtils.get_current_datetime(), HUtils.get_current_datetime(), experiment_run.start, experiment_run.finish, + experiment_run.chunk_unit, experiment_run.chunk_size, experiment_run.completed, experiment_run.total, + experiment_run.failed, experiment_run.queuing, experiment_run.running, + experiment_run.submitted, experiment_run.suspended, experiment_run.metadata) + return self.insert_statement_with_arguments(self.historicaldb_file_path, statement, arguments) + + def update_many_job_data_change_status(self, changes): + # type : (List[Tuple]) -> None + """ + Update many job_data rows in bulk. Requires a changes list of argument tuples. + Only updates finish, modified, status, and rowstatus by id. + """ + statement = ''' UPDATE job_data SET modified=?, status=?, rowstatus=? WHERE id=? ''' + self.execute_many_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, changes) + + def _update_job_data_by_id(self, job_data_dc): + """ + Update job_data table with data class JobData. + Update last, submit, start, finish, modified, job_id, status, energy, extra_data, nnodes, ncpus, rowstatus, out, err by id. + """ + statement = ''' UPDATE job_data SET last=?, submit=?, start=?, finish=?, modified=?, + job_id=?, status=?, energy=?, extra_data=?, + nnodes=?, ncpus=?, rowstatus=?, out=?, err=?, + children=?, platform_output=? WHERE id=? ''' + arguments = (job_data_dc.last, job_data_dc.submit, job_data_dc.start, job_data_dc.finish, HUtils.get_current_datetime(), + job_data_dc.job_id, job_data_dc.status, job_data_dc.energy, job_data_dc.extra_data, + job_data_dc.nnodes, job_data_dc.ncpus, job_data_dc.rowstatus, job_data_dc.out, job_data_dc.err, + job_data_dc.children, job_data_dc.platform_output, job_data_dc._id) + self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) + + def _update_experiment_run(self, experiment_run_dc): + """ + Update experiment_run table with data class ExperimentRun. + Updates by run_id (finish, chunk_unit, chunk_size, completed, total, failed, queuing, running, submitted, suspended) + """ + statement = ''' UPDATE experiment_run SET finish=?, chunk_unit=?, chunk_size=?, completed=?, total=?, + failed=?, queuing=?, running=?, submitted=?, + suspended=?, modified=? WHERE run_id=? 
''' + arguments = (experiment_run_dc.finish, experiment_run_dc.chunk_unit, experiment_run_dc.chunk_size, experiment_run_dc.completed, experiment_run_dc.total, + experiment_run_dc.failed, experiment_run_dc.queuing, experiment_run_dc.running, experiment_run_dc.submitted, + experiment_run_dc.suspended, HUtils.get_current_datetime(), experiment_run_dc.run_id) + self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) + + def _get_job_data_last_by_run_id_and_finished(self, run_id): + """ Get List of Models.JobDataRow for last=1, finished > 0 and run_id """ + statement = self.get_built_select_statement("job_data", "run_id=? and last=1 and finish > 0 and rowtype >= 2 ORDER BY id") + arguments = (run_id,) + job_data_rows = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) + return [Models.JobDataRow(*row) for row in job_data_rows] + + def get_job_data_by_name(self, job_name): + """ Get List of Models.JobDataRow for job_name """ + statement = self.get_built_select_statement("job_data", "job_name=? ORDER BY counter DESC") + arguments = (job_name,) + job_data_rows = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) + return [Models.JobDataRow(*row) for row in job_data_rows] + + def get_job_data_max_counter(self): + """ The max counter is the maximum count value for the count column in job_data. """ + statement = "SELECT MAX(counter) as maxcounter FROM job_data" + counter_result = self.get_from_statement(self.historicaldb_file_path, statement) + if len(counter_result) <= 0: + return DEFAULT_MAX_COUNTER + else: + max_counter = Models.MaxCounterRow(*counter_result[0]).maxcounter + return max_counter if max_counter else DEFAULT_MAX_COUNTER + + def delete_job_data(self, id): + """ Deletes row from job_data by id. Useful for testing. """ + statement = ''' DELETE FROM job_data WHERE id=? ''' + arguments = (id, ) + self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) + + def delete_experiment_run(self, run_id): + """ Deletes row in experiment_run by run_id. Useful for testing. """ + statement = ''' DELETE FROM experiment_run where run_id=? ''' + arguments = (run_id,) + self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) + + def _set_historical_pragma_version(self, version=10): + """ Sets the pragma version. """ + statement = "pragma user_version={v:d};".format(v=version) + self.execute_statement_on_dbfile(self.historicaldb_file_path, statement) + + def _get_pragma_version(self): + """ Gets current pragma version as int. """ + statement = "pragma user_version;" + pragma_result = self.get_from_statement(self.historicaldb_file_path, statement) + if len(pragma_result) <= 0: + raise Exception("Error while getting the pragma version. This might be a signal of a deeper problem. Review previous errors.") + return Models.PragmaVersion(*pragma_result[0]).version diff --git a/autosubmit/history/database_managers/experiment_status_db_manager.py b/autosubmit/history/database_managers/experiment_status_db_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..69d5b4576119548657309370ff0bfdcf89411a43 --- /dev/null +++ b/autosubmit/history/database_managers/experiment_status_db_manager.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python + + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. 
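Taken together, the manager above covers the full lifecycle of job_data_<expid>.db. A usage sketch; the expid and path are illustrative, and the import is written package-qualified while the diff itself uses the short Python 2 relative form:

    from autosubmit.history.database_managers.experiment_history_db_manager import ExperimentHistoryDbManager

    manager = ExperimentHistoryDbManager("a29z", jobdata_dir_path="/esarchive/autosubmit/as_metadata/data")
    manager.initialize()   # creates the DB at CURRENT_DB_VERSION (17) or migrates an older one
    if manager.is_there_a_last_experiment_run():
        run_dc = manager.get_experiment_run_dc_with_max_id()
        run_dc.running += 1
        manager.update_experiment_run_dc_by_id(run_dc)   # also refreshes the modified timestamp
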
+ +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . + +import os +import sqlite3 +import traceback +import textwrap +import time +from database_manager import DatabaseManager, DEFAULT_LOCAL_ROOT_DIR +import autosubmit.history.utils as HUtils +import database_models as Models + +class ExperimentStatusDbManager(DatabaseManager): + """ Manages the actions on the status database """ + def __init__(self, expid, local_root_dir_path=DEFAULT_LOCAL_ROOT_DIR): + super(ExperimentStatusDbManager, self).__init__(expid, local_root_dir_path=local_root_dir_path) + self._as_times_file_path = os.path.join(self.LOCAL_ROOT_DIR, self.AS_TIMES_DB_NAME) + self._ecearth_file_path = os.path.join(self.LOCAL_ROOT_DIR, self.ECEARTH_DB_NAME) + self._pkl_file_path = os.path.join(self.LOCAL_ROOT_DIR, "pkl", "job_list_{0}.pkl".format(self.expid)) + self._validate_status_database() + # self.current_experiment_row = self._get_current_experiment_row(self.expid) + # self.current_experiment_status_row =self._get_current_experiment_status_row(self.current_experiment_row.id) + + def _validate_status_database(self): + """ Creates experiment_status table if it does not exist """ + create_table_query = textwrap.dedent( + '''CREATE TABLE + IF NOT EXISTS experiment_status ( + exp_id integer PRIMARY KEY, + name text NOT NULL, + status text NOT NULL, + seconds_diff integer NOT NULL, + modified text NOT NULL, + FOREIGN KEY (exp_id) REFERENCES experiment (id) + );''' + ) + self.execute_statement_on_dbfile(self._as_times_file_path, create_table_query) + + def print_current_table(self): + for experiment in self._get_experiment_status_content(): + print(experiment) + if self.current_experiment_status_row: + print("Current Row:\n\t" + self.current_experiment_status_row.name + "\n\t" + + str(self.current_experiment_status_row.exp_id) + "\n\t" + self.current_experiment_status_row.status) + + def is_running(self, time_condition=600): + # type : (int) -> bool + """ True if experiment is running, False otherwise. """ + if (os.path.exists(self._pkl_file_path)): + current_stat = os.stat(self._pkl_file_path) + timest = int(current_stat.st_mtime) + timesys = int(time.time()) + time_diff = int(timesys - timest) + if (time_diff < time_condition): + return True + else: + return False + return False + + def set_existing_experiment_status_as_running(self, expid): + """ Set the experiment_status row as running. """ + self.update_exp_status(expid, Models.RunningStatus.RUNNING) + + def create_experiment_status_as_running(self, experiment): + """ Create a new experiment_status row for the Models.Experiment item.""" + self.create_exp_status(experiment.id, experiment.name, Models.RunningStatus.RUNNING) + + + def get_experiment_status_row_by_expid(self, expid): + # type : (str) -> Models.ExperimentRow + """ + Get Models.ExperimentRow by expid. 
+ """ + experiment_row = self.get_experiment_row_by_expid(expid) + return self.get_experiment_status_row_by_exp_id(experiment_row.id) + + def get_experiment_row_by_expid(self, expid): + # type : (str) -> Models.ExperimentRow + """ + Get the experiment from ecearth.db by expid as Models.ExperimentRow. + """ + statement = self.get_built_select_statement("experiment", "name=?") + current_rows = self.get_from_statement_with_arguments(self._ecearth_file_path, statement, (expid,)) + if len(current_rows) <= 0: + raise ValueError("Experiment {0} not found in {1}".format(expid, self._ecearth_file_path)) + return Models.ExperimentRow(*current_rows[0]) + + def _get_experiment_status_content(self): + # type : () -> List[Models.ExperimentStatusRow] + """ + Get all registers from experiment_status as List of Models.ExperimentStatusRow.\n + """ + statement = self.get_built_select_statement("experiment_status") + current_rows = self.get_from_statement(self._as_times_file_path, statement) + return [Models.ExperimentStatusRow(*row) for row in current_rows] + + def get_experiment_status_row_by_exp_id(self, exp_id): + # type : (int) -> Models.ExperimentStatusRow + """ Get Models.ExperimentStatusRow from as_times.db by exp_id (int)""" + statement = self.get_built_select_statement("experiment_status", "exp_id=?") + arguments = (exp_id,) + current_rows = self.get_from_statement_with_arguments(self._as_times_file_path, statement, arguments) + if len(current_rows) <= 0: + return None + return Models.ExperimentStatusRow(*current_rows[0]) + + + def create_exp_status(self, exp_id, expid, status): + # type : (int, str) -> None + """ + Create experiment status + """ + statement = ''' INSERT INTO experiment_status(exp_id, name, status, seconds_diff, modified) VALUES(?,?,?,?,?) ''' + arguments = (exp_id, expid, status, 0, HUtils.get_current_datetime()) + return self.insert_statement_with_arguments(self._as_times_file_path, statement, arguments) + + def update_exp_status(self, expid, status="RUNNING"): + # type : (str, str) -> None + """ + Update status, seconds_diff, modified in experiment_status. + """ + statement = ''' UPDATE experiment_status SET status = ?, seconds_diff = ?, modified = ? WHERE name = ? ''' + arguments = (status, 0, HUtils.get_current_datetime(), expid) + self.execute_statement_with_arguments_on_dbfile(self._as_times_file_path, statement, arguments) + + def delete_exp_status(self, expid): + # type : (str) -> None + """ Deletes experiment_status row by expid. Useful for testing purposes. """ + statement = ''' DELETE FROM experiment_status where name = ? ''' + arguments = (expid,) + self.execute_statement_with_arguments_on_dbfile(self._as_times_file_path, statement, arguments) \ No newline at end of file diff --git a/autosubmit/history/database_managers/test.py b/autosubmit/history/database_managers/test.py new file mode 100644 index 0000000000000000000000000000000000000000..0b1e3a05ac8b780ed23ed39684d62aac9e6dcc3e --- /dev/null +++ b/autosubmit/history/database_managers/test.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+
+# Autosubmit is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with Autosubmit. If not, see <http://www.gnu.org/licenses/>.
+
+import unittest
+import time
+import random
+import os
+from shutil import copy2
+from experiment_history_db_manager import ExperimentHistoryDbManager
+from experiment_status_db_manager import ExperimentStatusDbManager
+from autosubmit.history.data_classes.experiment_run import ExperimentRun
+from autosubmit.history.data_classes.job_data import JobData
+from autosubmit.config.basicConfig import BasicConfig
+import autosubmit.history.utils as HUtils
+
+EXPID_TT00_SOURCE = "test_database.db~"
+EXPID_TT01_SOURCE = "test_database_no_run.db~"
+EXPID = "t024"
+EXPID_NONE = "t027"
+BasicConfig.read()
+JOBDATA_DIR = BasicConfig.JOBDATA_DIR
+LOCAL_ROOT_DIR = BasicConfig.LOCAL_ROOT_DIR
+
+class TestExperimentStatusDatabaseManager(unittest.TestCase):
+    """ Covers the Experiment Status Database Manager. """
+    def setUp(self):
+        self.exp_status_db = ExperimentStatusDbManager(EXPID, LOCAL_ROOT_DIR)
+
+    def test_get_current_experiment_status_row(self):
+        exp_status_row = self.exp_status_db.get_experiment_status_row_by_expid(EXPID)
+        self.assertIsNotNone(exp_status_row)
+        exp_status_row_none = self.exp_status_db.get_experiment_status_row_by_expid(EXPID_NONE)
+        self.assertIsNone(exp_status_row_none)
+        exp_row_direct = self.exp_status_db.get_experiment_status_row_by_exp_id(exp_status_row.exp_id)
+        self.assertTrue(exp_status_row.exp_id == exp_row_direct.exp_id)
+
+    def test_update_exp_status(self):
+        self.exp_status_db.update_exp_status(EXPID, "RUNNING")
+        exp_status_row_current = self.exp_status_db.get_experiment_status_row_by_expid(EXPID)
+        self.assertTrue(exp_status_row_current.status == "RUNNING")
+        self.exp_status_db.update_exp_status(EXPID, "NOT RUNNING")
+        exp_status_row_current = self.exp_status_db.get_experiment_status_row_by_expid(EXPID)
+        self.assertTrue(exp_status_row_current.status == "NOT RUNNING")
+
+    def test_create_exp_status(self):
+        experiment = self.exp_status_db.get_experiment_row_by_expid(EXPID_NONE)
+        self.exp_status_db.create_experiment_status_as_running(experiment)
+        experiment_status = self.exp_status_db.get_experiment_status_row_by_expid(EXPID_NONE)
+        self.assertIsNotNone(experiment_status)
+        self.exp_status_db.delete_exp_status(EXPID_NONE)
+        experiment_status = self.exp_status_db.get_experiment_status_row_by_expid(EXPID_NONE)
+        self.assertIsNone(experiment_status)
+
+class TestExperimentHistoryDbManager(unittest.TestCase):
+    """ Covers the Experiment History Database Manager and data models. """
+    def setUp(self):
+        self.experiment_database = ExperimentHistoryDbManager(EXPID, JOBDATA_DIR)
+        source_path_tt00 = os.path.join(JOBDATA_DIR, EXPID_TT00_SOURCE)
+        self.target_path_tt00 = os.path.join(JOBDATA_DIR, "job_data_{0}.db".format(EXPID))
+        copy2(source_path_tt00, self.target_path_tt00)
+        source_path_tt01 = os.path.join(JOBDATA_DIR, EXPID_TT01_SOURCE)
+        self.target_path_tt01 = os.path.join(JOBDATA_DIR, "job_data_{0}.db".format(EXPID_NONE))
+        copy2(source_path_tt01, self.target_path_tt01)
+        self.experiment_database.initialize()
+
+    def tearDown(self):
+        os.remove(self.target_path_tt00)
+        os.remove(self.target_path_tt01)
+
+    def test_get_max_id(self):
+        max_item = self.experiment_database.get_experiment_run_dc_with_max_id()
+        self.assertTrue(max_item.run_id > 0)
+        self.assertTrue(max_item.run_id >= 18)  # Max is 18 in the sample database
+
+    def test_pragma(self):
+        self.assertTrue(self.experiment_database._get_pragma_version() == 17)  # Update version on changes
+
+    def test_get_job_data(self):
+        job_data = self.experiment_database._get_job_data_last_by_name("a29z_20000101_fc0_1_SIM")
+        self.assertTrue(len(job_data) > 0)
+        self.assertTrue(job_data[0].last == 1)
+        self.assertTrue(job_data[0].job_name == "a29z_20000101_fc0_1_SIM")
+        job_data = self.experiment_database.get_job_data_by_name("a29z_20000101_fc0_1_SIM")
+        self.assertTrue(job_data[0].job_name == "a29z_20000101_fc0_1_SIM")
+        job_data = self.experiment_database._get_job_data_last_by_run_id(18)  # Latest run
+        self.assertTrue(len(job_data) > 0)
+        job_data = self.experiment_database._get_job_data_last_by_run_id_and_finished(18)
+        self.assertTrue(len(job_data) > 0)
+        job_data = self.experiment_database.get_job_data_all()
+        self.assertTrue(len(job_data) > 0)
+
+    def test_insert_and_delete_experiment_run(self):
+        new_run = ExperimentRun(19)
+        new_id = self.experiment_database._insert_experiment_run(new_run)
+        self.assertIsNotNone(new_id)
+        last_experiment_run = self.experiment_database.get_experiment_run_dc_with_max_id()
+        self.assertTrue(new_id == last_experiment_run.run_id)
+        self.experiment_database.delete_experiment_run(new_id)
+        last_experiment_run = self.experiment_database.get_experiment_run_dc_with_max_id()
+        self.assertTrue(new_id != last_experiment_run.run_id)
+
+    def test_insert_and_delete_job_data(self):
+        max_run_id = self.experiment_database.get_experiment_run_dc_with_max_id().run_id
+        new_job_data_name = "test_001_name_{0}".format(int(time.time()))
+        new_job_data = JobData(_id=1, job_name=new_job_data_name, run_id=max_run_id)
+        new_job_data_id = self.experiment_database._insert_job_data(new_job_data)
+        self.assertIsNotNone(new_job_data_id)
+        self.experiment_database.delete_job_data(new_job_data_id)
+        job_data = self.experiment_database.get_job_data_by_name(new_job_data_name)
+        self.assertTrue(len(job_data) == 0)
+
+    def test_update_experiment_run(self):
+        experiment_run_data_class = self.experiment_database.get_experiment_run_dc_with_max_id()  # run_id 18
+        backup_run = self.experiment_database.get_experiment_run_dc_with_max_id()
+        experiment_run_data_class.chunk_unit = "unouno"
+        experiment_run_data_class.running = random.randint(1, 100)
+        experiment_run_data_class.queuing = random.randint(1, 100)
+        experiment_run_data_class.suspended = random.randint(1, 100)
+        self.experiment_database._update_experiment_run(experiment_run_data_class)
+        last_experiment_run = self.experiment_database.get_experiment_run_dc_with_max_id()  # run_id 18
+        self.assertTrue(last_experiment_run.chunk_unit == experiment_run_data_class.chunk_unit)
+        self.assertTrue(last_experiment_run.running == experiment_run_data_class.running)
+        self.assertTrue(last_experiment_run.queuing == experiment_run_data_class.queuing)
+        self.assertTrue(last_experiment_run.suspended == experiment_run_data_class.suspended)
+        self.experiment_database._update_experiment_run(backup_run)
+
+    def test_job_data_from_model(self):
+        job_data_rows = self.experiment_database._get_job_data_last_by_name("a29z_20000101_fc0_1_SIM")
+        job_data_row_first = job_data_rows[0]
+        job_data_data_class = JobData.from_model(job_data_row_first)
+        self.assertTrue(job_data_row_first.job_name == job_data_data_class.job_name)
+
+    def test_update_job_data_processed(self):
+        current_time = time.time()
+        job_data_rows = self.experiment_database._get_job_data_last_by_name("a29z_20000101_fc0_1_SIM")
+        job_data_row_first = job_data_rows[0]
+        job_data_data_class = JobData.from_model(job_data_row_first)
+        backup_job_dc = JobData.from_model(job_data_row_first)
+        job_data_data_class.nnodes = random.randint(1, 1000)
+        job_data_data_class.ncpus = random.randint(1, 1000)
+        job_data_data_class.status = "DELAYED"
+        job_data_data_class.finish = current_time
+        self.experiment_database._update_job_data_by_id(job_data_data_class)
+        job_data_rows_current = self.experiment_database._get_job_data_last_by_name("a29z_20000101_fc0_1_SIM")
+        job_data_row_first = job_data_rows_current[0]
+        self.assertTrue(job_data_row_first.nnodes == job_data_data_class.nnodes)
+        self.assertTrue(job_data_row_first.ncpus == job_data_data_class.ncpus)
+        self.assertTrue(job_data_row_first.status == job_data_data_class.status)
+        self.assertTrue(job_data_row_first.finish == job_data_data_class.finish)
+        self.experiment_database._update_job_data_by_id(backup_job_dc)
+
+    def test_bulk_update(self):
+        all_job_data_rows = self.experiment_database.get_job_data_all()
+        job_data_rows_test = [job for job in all_job_data_rows if job.run_id == 3]
+        backup = [JobData.from_model(job) for job in job_data_rows_test]
+        list_job_data_class = [JobData.from_model(job) for job in job_data_rows_test]
+        backup_changes = [(HUtils.get_current_datetime(), job.status, job.rowstatus, job._id) for job in list_job_data_class]
+        changes = [(HUtils.get_current_datetime(), "DELAYED", job.rowstatus, job._id) for job in list_job_data_class]
+        self.experiment_database.update_many_job_data_change_status(changes)
+        all_job_data_rows = self.experiment_database.get_job_data_all()
+        job_data_rows_validate = [job for job in all_job_data_rows if job.run_id == 3]
+        for (job_val, change_item) in zip(job_data_rows_validate, changes):
+            modified, status, rowstatus, _id = change_item
+            self.assertTrue(job_val.modified == modified)
+            self.assertTrue(job_val.status == status)
+            self.assertTrue(job_val.rowstatus == rowstatus)
+            self.assertTrue(job_val.id == _id)
+        self.experiment_database.update_many_job_data_change_status(backup_changes)
+
+    def test_job_data_maxcounter(self):
+        new_job_data = ExperimentHistoryDbManager(EXPID_NONE, JOBDATA_DIR)
+        new_job_data.initialize()
+        max_empty_table_counter = new_job_data.get_job_data_max_counter()
+        self.assertTrue(max_empty_table_counter == 0)
+        max_existing_counter = self.experiment_database.get_job_data_max_counter()
+        self.assertTrue(max_existing_counter > 0)
+
+    def test_register_submitted_job_data_dc(self):
+        job_data_dc = self.experiment_database.get_job_data_dc_unique_latest_by_job_name("a29z_20000101_fc0_1_SIM")
+        max_counter = self.experiment_database.get_job_data_max_counter()
+        self.assertTrue(max_counter > 0)
+        self.assertTrue(job_data_dc.counter > 0)
+        next_counter = max(max_counter, job_data_dc.counter + 1)
+        self.assertTrue(next_counter >= max_counter)
+        self.assertTrue(next_counter >= job_data_dc.counter + 1)
+        job_data_dc.counter = next_counter
+        job_data_dc_current = self.experiment_database.register_submitted_job_data_dc(job_data_dc)
+        self.assertTrue(job_data_dc._id < job_data_dc_current._id)
+        job_data_last_list = self.experiment_database._get_job_data_last_by_name(job_data_dc.job_name)
+        self.assertTrue(len(job_data_last_list) == 1)
+        self.experiment_database.delete_job_data(job_data_last_list[0].id)
+        job_data_dc.last = 1
+        updated_job_data_dc = self.experiment_database.update_job_data_dc_by_id(job_data_dc)
+        self.assertTrue(job_data_dc._id == updated_job_data_dc._id)
+        job_data_dc = self.experiment_database.get_job_data_dc_unique_latest_by_job_name("a29z_20000101_fc0_1_SIM")
+        self.assertTrue(job_data_dc._id == updated_job_data_dc._id)
+
+    def test_update_children_and_platform_output(self):
+        job_data_dc = self.experiment_database.get_job_data_dc_unique_latest_by_job_name("a29z_20000101_fc0_1_SIM")
+        children_str = "a00, a01, a02"
+        platform_output_str = " SLURM OUTPUT "
+        job_data_dc.children = children_str
+        job_data_dc.platform_output = platform_output_str
+        self.experiment_database.update_job_data_dc_by_id(job_data_dc)
+        job_data_dc_updated = self.experiment_database.get_job_data_dc_unique_latest_by_job_name("a29z_20000101_fc0_1_SIM")
+        self.assertTrue(job_data_dc_updated.children == children_str)
+        self.assertTrue(job_data_dc_updated.platform_output == platform_output_str)
+        # Back to normal
+        job_data_dc.children = ""
+        job_data_dc.platform_output = "NO OUTPUT"
+        self.experiment_database.update_job_data_dc_by_id(job_data_dc)
+        job_data_dc_updated = self.experiment_database.get_job_data_dc_unique_latest_by_job_name("a29z_20000101_fc0_1_SIM")
+        self.assertTrue(job_data_dc_updated.children == "")
+        self.assertTrue(job_data_dc_updated.platform_output == "NO OUTPUT")
+
+    def test_experiment_run_dc(self):
+        experiment_run = self.experiment_database.get_experiment_run_dc_with_max_id()
+        self.assertIsNotNone(experiment_run)
+
+    def test_if_database_exists(self):
+        exp_manager = ExperimentHistoryDbManager("0000")
+        self.assertTrue(exp_manager.my_database_exists() == False)
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py
new file mode 100644
index 0000000000000000000000000000000000000000..f436cc53d701df28e71f694dc880bc119a891638
--- /dev/null
+++ b/autosubmit/history/experiment_history.py
@@ -0,0 +1,312 @@
+#!/usr/bin/python
+
+# Copyright 2015-2020 Earth Sciences Department, BSC-CNS
+# This file is part of Autosubmit.
+
+# Autosubmit is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# Autosubmit is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with Autosubmit. If not, see <http://www.gnu.org/licenses/>.
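+
+# Usage sketch, assuming a hypothetical experiment "a000" whose job list was built by
+# Autosubmit and whose chunk settings come from the experiment configuration:
+#
+#   exp_history = ExperimentHistory("a000")
+#   exp_history.initialize_database()
+#   exp_history.process_status_changes(job_list, chunk_unit="month", chunk_size=1)
+#   ...
+#   exp_history.finish_current_experiment_run()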
+
+import os
+import traceback
+import database_managers.database_models as Models
+import utils as HUtils
+from time import time, sleep
+from database_managers.experiment_history_db_manager import ExperimentHistoryDbManager, DEFAULT_JOBDATA_DIR
+from strategies import PlatformInformationHandler, SingleAssociationStrategy, StraightWrapperAssociationStrategy, TwoDimWrapperDistributionStrategy, GeneralizedWrapperDistributionStrategy
+from data_classes.job_data import JobData
+from data_classes.experiment_run import ExperimentRun
+from platform_monitor.slurm_monitor import SlurmMonitor
+from internal_logging import Logging
+
+SECONDS_WAIT_PLATFORM = 60
+
+class ExperimentHistory():
+    def __init__(self, expid, jobdata_dir_path=DEFAULT_JOBDATA_DIR):
+        self.expid = expid
+        self._log = Logging(expid)
+        self._job_data_dir_path = jobdata_dir_path
+        try:
+            self.manager = ExperimentHistoryDbManager(self.expid, jobdata_dir_path=jobdata_dir_path)
+        except Exception as exp:
+            self._log.log(str(exp), traceback.format_exc())
+            self.manager = None
+
+    def initialize_database(self):
+        try:
+            self.manager.initialize()
+        except Exception as exp:
+            self._log.log(str(exp), traceback.format_exc())
+            self.manager = None
+
+    def is_header_ready(self):
+        if self.manager:
+            return self.manager.is_header_ready_db_version()
+        return False
+
+    def write_submit_time(self, job_name, submit=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="",
+                          member="", section="", chunk=0, platform="NA", job_id=0, wrapper_queue=None, wrapper_code=None, children=""):
+        try:
+            next_counter = self._get_next_counter_by_job_name(job_name)
+            job_data_dc = JobData(_id=0,
+                                  counter=next_counter,
+                                  job_name=job_name,
+                                  submit=submit,
+                                  status=status,
+                                  rowtype=self._get_defined_rowtype(wrapper_code),
+                                  ncpus=ncpus,
+                                  wallclock=wallclock,
+                                  qos=self._get_defined_queue_name(wrapper_queue, wrapper_code, qos),
+                                  date=date,
+                                  member=member,
+                                  section=section,
+                                  chunk=chunk,
+                                  platform=platform,
+                                  job_id=job_id,
+                                  children=children)
+            return self.manager.register_submitted_job_data_dc(job_data_dc)
+        except Exception as exp:
+            self._log.log(str(exp), traceback.format_exc())
+            return None
+
+    def write_start_time(self, job_name, start=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="",
+                         member="", section="", chunk=0, platform="NA", job_id=0, wrapper_queue=None, wrapper_code=None, children=""):
+        try:
+            job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name)
+            if not job_data_dc_last:
+                job_data_dc_last = self.write_submit_time(job_name=job_name,
+                                                          status=status,
+                                                          ncpus=ncpus,
+                                                          wallclock=wallclock,
+                                                          qos=qos,
+                                                          date=date,
+                                                          member=member,
+                                                          section=section,
+                                                          chunk=chunk,
+                                                          platform=platform,
+                                                          job_id=job_id,
+                                                          wrapper_queue=wrapper_queue,
+                                                          wrapper_code=wrapper_code)
+                self._log.log("write_start_time {0} submit not found.".format(job_name))
+            job_data_dc_last.start = start
+            job_data_dc_last.qos = self._get_defined_queue_name(wrapper_queue, wrapper_code, qos)
+            job_data_dc_last.status = status
+            job_data_dc_last.rowtype = self._get_defined_rowtype(wrapper_code)
+            job_data_dc_last.job_id = job_id
+            job_data_dc_last.children = children
+            return self.manager.update_job_data_dc_by_id(job_data_dc_last)
+        except Exception as exp:
+            self._log.log(str(exp), traceback.format_exc())
+
+    def write_finish_time(self, job_name, finish=0, status="UNKNOWN", ncpus=0, wallclock="00:00", qos="debug", date="",
+                          member="", section="", chunk=0, platform="NA", job_id=0, out_file=None, err_file=None,
+                          wrapper_queue=None, wrapper_code=None, children=""):
+        try:
+            job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name)
+            if not job_data_dc_last:
+                job_data_dc_last = self.write_submit_time(job_name=job_name,
+                                                          status=status,
+                                                          ncpus=ncpus,
+                                                          wallclock=wallclock,
+                                                          qos=qos,
+                                                          date=date,
+                                                          member=member,
+                                                          section=section,
+                                                          chunk=chunk,
+                                                          platform=platform,
+                                                          job_id=job_id,
+                                                          wrapper_queue=wrapper_queue,
+                                                          wrapper_code=wrapper_code,
+                                                          children=children)
+                self._log.log("write_finish_time {0} submit not found.".format(job_name))
+            job_data_dc_last.finish = finish if finish > 0 else int(time())
+            job_data_dc_last.status = status
+            job_data_dc_last.job_id = job_id
+            job_data_dc_last.rowstatus = Models.RowStatus.PENDING_PROCESS
+            job_data_dc_last.out = out_file if out_file else ""
+            job_data_dc_last.err = err_file if err_file else ""
+            return self.manager.update_job_data_dc_by_id(job_data_dc_last)
+        except Exception as exp:
+            self._log.log(str(exp), traceback.format_exc())
+
+    def write_platform_data_after_finish(self, job_data_dc, platform_obj):
+        """
+        Queries Slurm for the finished job's energy data and distributes it. Call it in a thread.
+        """
+        try:
+            sleep(SECONDS_WAIT_PLATFORM)
+            ssh_output = platform_obj.check_job_energy(job_data_dc.job_id)
+            slurm_monitor = SlurmMonitor(ssh_output)
+            self._verify_slurm_monitor(slurm_monitor, job_data_dc)
+            job_data_dcs_in_wrapper = self.manager.get_job_data_dcs_last_by_wrapper_code(job_data_dc.wrapper_code)
+            job_data_dcs_to_update = []
+            if len(job_data_dcs_in_wrapper) > 0:
+                # Wrapped job: try increasingly general distribution strategies until one succeeds.
+                info_handler = PlatformInformationHandler(StraightWrapperAssociationStrategy())
+                job_data_dcs_to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor)
+                if len(job_data_dcs_to_update) == 0:
+                    info_handler.strategy = TwoDimWrapperDistributionStrategy()
+                    job_data_dcs_to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor)
+                if len(job_data_dcs_to_update) == 0:
+                    info_handler.strategy = GeneralizedWrapperDistributionStrategy()
+                    job_data_dcs_to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor)
+            else:
+                info_handler = PlatformInformationHandler(SingleAssociationStrategy())
+                job_data_dcs_to_update = info_handler.execute_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor)
+            return self.manager.update_list_job_data_dc_by_each_id(job_data_dcs_to_update)
+        except Exception as exp:
+            self._log.log(str(exp), traceback.format_exc())
+
+    def _verify_slurm_monitor(self, slurm_monitor, job_data_dc):
+        try:
+            if slurm_monitor.header.status not in ["COMPLETED", "FAILED"]:
+                self._log.log("Assertion Error on job {0} with ssh_output {1}".format(job_data_dc.job_name, slurm_monitor.original_input),
+                              "Slurm status {0} is not COMPLETED nor FAILED for ID {1}.\n".format(slurm_monitor.header.status, slurm_monitor.header.name))
+            if not slurm_monitor.steps_plus_extern_approximate_header_energy():
+                self._log.log("Assertion Error on job {0} with ssh_output {1}".format(job_data_dc.job_name, slurm_monitor.original_input),
+                              "Steps + extern != total energy for ID {0}. Number of steps {1}.\n".format(slurm_monitor.header.name, slurm_monitor.step_count))
+        except Exception as exp:
+            self._log.log(str(exp), traceback.format_exc())
+
+    def process_status_changes(self, job_list=None, chunk_unit="NA", chunk_size=0, current_config=""):
+        """ Detect status differences between job_list and the current job_data rows, and update. Creates a new run if necessary. """
+        try:
+            current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id()
+            update_these_changes = self._get_built_list_of_changes(job_list)
+            if len(update_these_changes) > 0:
+                self.manager.update_many_job_data_change_status(update_these_changes)
+            if self.should_we_create_a_new_run(job_list, len(update_these_changes), current_experiment_run_dc.total):
+                return self.create_new_experiment_run(chunk_unit, chunk_size, current_config, job_list)
+            return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list)
+        except Exception as exp:
+            self._log.log(str(exp), traceback.format_exc())
+
+    def _get_built_list_of_changes(self, job_list):
+        """ Return: list of (current datetime str, status, rowstatus, id in job_data) tuples; one tuple per change. """
+        job_data_dcs = self.detect_changes_in_job_list(job_list)
+        return [(HUtils.get_current_datetime(), job.status, Models.RowStatus.CHANGED, job._id) for job in job_data_dcs]
+
+    def process_job_list_changes_to_experiment_totals(self, job_list=None):
+        """ Updates the current experiment_run row with totals calculated from job_list. """
+        try:
+            current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id()
+            return self.update_counts_on_experiment_run_dc(current_experiment_run_dc, job_list)
+        except Exception as exp:
+            self._log.log(str(exp), traceback.format_exc())
+
+    def should_we_create_a_new_run(self, job_list, changes_count, total_count):
+        """ A new run is created when the job count changes, or when the number of status changes exceeds 90% of the date-member COMPLETED count. """
+        if len(job_list) != total_count:
+            return True
+        if changes_count > int(self._get_date_member_completed_count(job_list) * 0.9):
+            return True
+        return False
+
+    def update_counts_on_experiment_run_dc(self, experiment_run_dc, job_list=None):
+        """ Return the updated row as Models.ExperimentRun. """
+        status_counts = self.get_status_counts_from_job_list(job_list)
+        experiment_run_dc.completed = status_counts[HUtils.SupportedStatus.COMPLETED]
+        experiment_run_dc.failed = status_counts[HUtils.SupportedStatus.FAILED]
+        experiment_run_dc.queuing = status_counts[HUtils.SupportedStatus.QUEUING]
+        experiment_run_dc.submitted = status_counts[HUtils.SupportedStatus.SUBMITTED]
+        experiment_run_dc.running = status_counts[HUtils.SupportedStatus.RUNNING]
+        experiment_run_dc.suspended = status_counts[HUtils.SupportedStatus.SUSPENDED]
+        experiment_run_dc.total = status_counts["TOTAL"]
+        return self.manager.update_experiment_run_dc_by_id(experiment_run_dc)
+
+    def finish_current_experiment_run(self):
+        if self.manager.is_there_a_last_experiment_run():
+            current_experiment_run_dc = self.manager.get_experiment_run_dc_with_max_id()
+            current_experiment_run_dc.finish = int(time())
+            return self.manager.update_experiment_run_dc_by_id(current_experiment_run_dc)
+        return None
+
+    def create_new_experiment_run(self, chunk_unit="NA", chunk_size=0, current_config="", job_list=None):
+        """ Also writes the finish timestamp of the previous run. """
+        self.finish_current_experiment_run()
+        return self._create_new_experiment_run_dc_with_counts(chunk_unit=chunk_unit, chunk_size=chunk_size, current_config=current_config, job_list=job_list)
+
+    def _create_new_experiment_run_dc_with_counts(self, chunk_unit, chunk_size, current_config="", job_list=None):
+        """ Create a new experiment_run row and return it as a Models.ExperimentRun data class from the database. """
+        status_counts = self.get_status_counts_from_job_list(job_list)
+        experiment_run_dc = ExperimentRun(0,
+                                          chunk_unit=chunk_unit,
+                                          chunk_size=chunk_size,
+                                          metadata=current_config,
+                                          start=int(time()),
+                                          completed=status_counts[HUtils.SupportedStatus.COMPLETED],
+                                          total=status_counts["TOTAL"],
+                                          failed=status_counts[HUtils.SupportedStatus.FAILED],
+                                          queuing=status_counts[HUtils.SupportedStatus.QUEUING],
+                                          running=status_counts[HUtils.SupportedStatus.RUNNING],
+                                          submitted=status_counts[HUtils.SupportedStatus.SUBMITTED],
+                                          suspended=status_counts[HUtils.SupportedStatus.SUSPENDED])
+        return self.manager.register_experiment_run_dc(experiment_run_dc)
+
+    def detect_changes_in_job_list(self, job_list):
+        """ Detect changes in job_list compared to the current contents of the job_data table. Returns a list of JobData data classes where the status of each item is the new status. """
+        job_name_to_job = {job.name: job for job in job_list}
+        current_job_data_dcs = self.manager.get_all_last_job_data_dcs()
+        differences = []
+        for job_dc in current_job_data_dcs:
+            if job_dc.job_name in job_name_to_job and job_dc.status != job_name_to_job[job_dc.job_name].status_str:
+                job_dc.status = job_name_to_job[job_dc.job_name].status_str
+                differences.append(job_dc)
+        return differences
+
+    def _get_defined_rowtype(self, code):
+        if code:
+            return code
+        return Models.RowType.NORMAL
+
+    def _get_defined_queue_name(self, wrapper_queue, wrapper_code, qos):
+        if wrapper_code and wrapper_code > 2 and wrapper_queue is not None:
+            return wrapper_queue
+        return qos
+
+    def _get_next_counter_by_job_name(self, job_name):
+        """ Return the next counter for job_name, based on the latest job data row and the table-wide maximum counter. """
+        job_data_dc = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name)
+        max_counter = self.manager.get_job_data_max_counter()
+        if job_data_dc:
+            return max(max_counter, job_data_dc.counter + 1)
+        return max_counter
+
+    def _get_date_member_completed_count(self, job_list):
+        """ Each item in job_list must have the attributes: date, member, status_str. """
+        job_list = job_list if job_list else []
+        return sum(1 for job in job_list if job.date is not None and job.member is not None and job.status_str == HUtils.SupportedStatus.COMPLETED)
+
+    def get_status_counts_from_job_list(self, job_list):
+        """
+        Return a dict with keys COMPLETED, FAILED, QUEUING, SUBMITTED, RUNNING, SUSPENDED, TOTAL.
+        """
+        result = {
+            HUtils.SupportedStatus.COMPLETED: 0,
+            HUtils.SupportedStatus.FAILED: 0,
+            HUtils.SupportedStatus.QUEUING: 0,
+            HUtils.SupportedStatus.SUBMITTED: 0,
+            HUtils.SupportedStatus.RUNNING: 0,
+            HUtils.SupportedStatus.SUSPENDED: 0,
+            "TOTAL": 0
+        }
+        if not job_list:
+            job_list = []
+        for job in job_list:
+            if job.status_str in result:
+                result[job.status_str] += 1
+        result["TOTAL"] = len(job_list)
+        return result
\ No newline at end of file
diff --git a/autosubmit/history/experiment_status.py b/autosubmit/history/experiment_status.py
new file mode 100644
index 0000000000000000000000000000000000000000..8de57f8703fb74c2ba0aed7eadeec239ffe2b8c5
--- /dev/null
+++ b/autosubmit/history/experiment_status.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+
+# Copyright 2015-2020 Earth Sciences Department, BSC-CNS
+# This file is part of Autosubmit.
+
+# Autosubmit is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# Autosubmit is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with Autosubmit. If not, see <http://www.gnu.org/licenses/>.
+
+import traceback
+from database_managers.experiment_status_db_manager import ExperimentStatusDbManager, DEFAULT_LOCAL_ROOT_DIR
+from internal_logging import Logging
+
+class ExperimentStatus():
+    """ Represents the Experiment Status Mechanism that keeps track of currently active experiments. """
+    def __init__(self, expid, local_root_dir_path=DEFAULT_LOCAL_ROOT_DIR):
+        # type : (str, str) -> None
+        self.expid = expid  # type : str
+        try:
+            self.manager = ExperimentStatusDbManager(self.expid, local_root_dir_path=local_root_dir_path)
+        except Exception as exp:
+            message = "Error while trying to update {0} in experiment_status.".format(str(self.expid))
+            print(message)
+            Logging(self.expid).log(message, traceback.format_exc())
+            self.manager = None
+
+    def set_as_running(self):
+        # type : () -> None
+        """ Set the status of the experiment in experiment_status of as_times.db as RUNNING. Creates the database, table, and row if necessary. """
+        if self.manager:
+            exp_status_row = self.manager.get_experiment_status_row_by_expid(self.expid)
+            if exp_status_row:
+                self.manager.set_existing_experiment_status_as_running(exp_status_row.name)
+            else:
+                exp_row = self.manager.get_experiment_row_by_expid(self.expid)
+                self.manager.create_experiment_status_as_running(exp_row)
diff --git a/autosubmit/history/internal_logging.py b/autosubmit/history/internal_logging.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa379c61aa3ce733c74c86e7063168788bc7b7dc
--- /dev/null
+++ b/autosubmit/history/internal_logging.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+
+# Copyright 2015-2020 Earth Sciences Department, BSC-CNS
+# This file is part of Autosubmit.
+
+# Autosubmit is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# Autosubmit is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with Autosubmit. If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import utils as HUtils
+
+class Logging():
+    def __init__(self, expid):
+        self.expid = expid
+
+    def log(self, main_msg, traceback_msg=""):
+        try:
+            log_path = self.get_default_log_path(self.expid)
+            if not os.path.exists(log_path):
+                HUtils.create_file_with_full_permissions(log_path)
+            with open(log_path, "a") as exp_log:
+                exp_log.write(self.build_message(main_msg, traceback_msg))
+        except Exception as exp:
+            print(exp)
+            print("Logging failed. Please report it to the developers.")
+
+    def build_message(self, main_msg, traceback_msg):
+        return "{0} :: {1} :: {2}\n".format(HUtils.get_current_datetime(), main_msg, traceback_msg)
+
+    def get_default_log_path(self, expid):
+        return os.path.join("/esarchive", "autosubmit", "as_metadata", "logs", "{}_log.txt".format(expid))
\ No newline at end of file
diff --git a/autosubmit/history/platform_monitor/__init__.py b/autosubmit/history/platform_monitor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/autosubmit/history/platform_monitor/output_examples/pending.txt b/autosubmit/history/platform_monitor/output_examples/pending.txt
new file mode 100644
index 0000000000000000000000000000000000000000..007e88d0886e0514329881d82c47db9c4ae5109e
--- /dev/null
+++ b/autosubmit/history/platform_monitor/output_examples/pending.txt
@@ -0,0 +1 @@
+ 17838842 PENDING 4 1 2021-10-11T10:55:53 Unknown Unknown
diff --git a/autosubmit/history/platform_monitor/output_examples/wrapper1.txt b/autosubmit/history/platform_monitor/output_examples/wrapper1.txt
new file mode 100644
index 0000000000000000000000000000000000000000..61b855cd1b46e5c03a8e8298f333d8315a5b4b2e
--- /dev/null
+++ b/autosubmit/history/platform_monitor/output_examples/wrapper1.txt
@@ -0,0 +1,3 @@
+ 12535498 COMPLETED 2 1 2020-11-18T13:54:24 2020-11-18T13:55:55 2020-11-18T13:56:10 2.77K
+ 12535498.batch COMPLETED 2 1 2020-11-18T13:55:55 2020-11-18T13:55:55 2020-11-18T13:56:10 2.69K 659K 659K
+ 12535498.extern COMPLETED 2 1 2020-11-18T13:55:55 2020-11-18T13:55:55 2020-11-18T13:56:10 2.77K 24K 24K
\ No newline at end of file
diff --git a/autosubmit/history/platform_monitor/output_examples/wrapper2.txt b/autosubmit/history/platform_monitor/output_examples/wrapper2.txt
new file mode 100644
index 0000000000000000000000000000000000000000..082eb0105bf17788d6f9a058f6530b74490e0faa
--- /dev/null
+++ b/autosubmit/history/platform_monitor/output_examples/wrapper2.txt
@@ -0,0 +1,3 @@
+ 12535498 COMPLETED 2 1 2020-11-18T13:54:24 2020-11-18T13:55:55 2020-11-18T13:56:10 2.77K
+ 12535498.batch COMPLETED 2 1 2020-11-18T13:55:55 2020-11-18T13:55:55 2020-11-18T13:56:10 2.69K 659K 659K
+ 12535498.0 COMPLETED 2 1 2020-11-18T13:55:55 2020-11-18T13:55:55 2020-11-18T13:56:10 2.77K 24K 24K
\ No newline at end of file
diff --git a/autosubmit/history/platform_monitor/output_examples/wrapper_big.txt b/autosubmit/history/platform_monitor/output_examples/wrapper_big.txt
new file mode 100644
index 0000000000000000000000000000000000000000..65c6c119128b035bdb21cb2133838c5353ab9bbd
--- /dev/null
+++ b/autosubmit/history/platform_monitor/output_examples/wrapper_big.txt
@@ -0,0 +1,33 @@
+ 17857525 COMPLETED 10 1 2021-10-13T15:51:16 2021-10-13T15:51:17 2021-10-13T15:52:47 19.05K
+ 17857525.batch COMPLETED 10 1 2021-10-13T15:51:17 2021-10-13T15:51:17 2021-10-13T15:52:47 13.38K 6264K 6264K
+ 17857525.extern COMPLETED 10 1 2021-10-13T15:51:17 2021-10-13T15:51:17 2021-10-13T15:52:47 13.66K 473K 68K
+ 17857525.0 COMPLETED 10 1 2021-10-13T15:51:21 2021-10-13T15:51:21 2021-10-13T15:51:22 186 352K 312.30K
+ 17857525.1 COMPLETED 10 1 2021-10-13T15:51:23 2021-10-13T15:51:23 2021-10-13T15:51:24 186 420K 306.70K
+ 17857525.2 COMPLETED 10 1 2021-10-13T15:51:24 2021-10-13T15:51:24 2021-10-13T15:51:27 188 352K 325.80K
+ 17857525.3 COMPLETED 10 1 2021-10-13T15:51:28 2021-10-13T15:51:28 2021-10-13T15:51:29 192 352K 341.90K
+ 17857525.4 COMPLETED 10 1 2021-10-13T15:51:29 2021-10-13T15:51:29 2021-10-13T15:51:31 186 352K 335.20K
+ 17857525.5 COMPLETED 10 1 2021-10-13T15:51:31 2021-10-13T15:51:31 2021-10-13T15:51:32 186 352K 329.80K
+ 17857525.6 COMPLETED 10 1 2021-10-13T15:51:32 2021-10-13T15:51:32 2021-10-13T15:51:33 184 428K 311.10K
+ 17857525.7 COMPLETED 10 1 2021-10-13T15:51:34 2021-10-13T15:51:34 2021-10-13T15:51:35 185 416K 341.40K
+ 17857525.8 COMPLETED 10 1 2021-10-13T15:51:35 2021-10-13T15:51:35 2021-10-13T15:51:37 180 428K 317.40K
+ 17857525.9 COMPLETED 10 1 2021-10-13T15:51:39 2021-10-13T15:51:39 2021-10-13T15:51:42 17 424K 272.70K
+ 17857525.10 COMPLETED 10 1 2021-10-13T15:51:42 2021-10-13T15:51:42 2021-10-13T15:51:44 185 356K 304.20K
+ 17857525.11 COMPLETED 10 1 2021-10-13T15:51:44 2021-10-13T15:51:44 2021-10-13T15:51:45 189 352K 322.20K
+ 17857525.12 COMPLETED 10 1 2021-10-13T15:51:45 2021-10-13T15:51:45 2021-10-13T15:51:47 184 388K 310.70K
+ 17857525.13 COMPLETED 10 1 2021-10-13T15:51:48 2021-10-13T15:51:48 2021-10-13T15:51:49 183 352K 336.90K
+ 17857525.14 COMPLETED 10 1 2021-10-13T15:51:49 2021-10-13T15:51:49 2021-10-13T15:51:51 183 428K 346.60K
+ 17857525.15 COMPLETED 10 1 2021-10-13T15:51:51 2021-10-13T15:51:51 2021-10-13T15:51:53 187 352K 335.90K
+ 17857525.16 COMPLETED 10 1 2021-10-13T15:51:54 2021-10-13T15:51:54 2021-10-13T15:51:55 184 424K 270K
+ 17857525.17 COMPLETED 10 1 2021-10-13T15:51:55 2021-10-13T15:51:55 2021-10-13T15:51:57 186 352K 304.80K
+ 17857525.18 COMPLETED 10 1 2021-10-13T15:51:57 2021-10-13T15:51:57 2021-10-13T15:51:59 182 428K 357K
+ 17857525.19 COMPLETED 10 1 2021-10-13T15:51:59 2021-10-13T15:51:59 2021-10-13T15:52:01 185 420K 280.60K
+ 17857525.20 COMPLETED 10 1 2021-10-13T15:52:01 2021-10-13T15:52:01 2021-10-13T15:52:03 185 352K 339.90K
+ 17857525.21 COMPLETED 10 1 2021-10-13T15:52:04 2021-10-13T15:52:04 2021-10-13T15:52:05 188 356K 340.20K
+ 17857525.22 COMPLETED 10 1 2021-10-13T15:52:06 2021-10-13T15:52:06 2021-10-13T15:52:08 185 352K 287.50K
+ 17857525.23 COMPLETED 10 1 2021-10-13T15:52:08 2021-10-13T15:52:08 2021-10-13T15:52:11 187 420K 349.40K
+ 17857525.24 COMPLETED 10 1 2021-10-13T15:52:14 2021-10-13T15:52:14 2021-10-13T15:52:16 185 420K 353.70K
+ 17857525.25 COMPLETED 10 1 2021-10-13T15:52:20 2021-10-13T15:52:20 2021-10-13T15:52:22 187 352K 340.30K
+ 17857525.26 COMPLETED 10 1 2021-10-13T15:52:24 2021-10-13T15:52:24 2021-10-13T15:52:32 186 420K 345.80K
+ 17857525.27 COMPLETED 10 1 2021-10-13T15:52:37 2021-10-13T15:52:37 2021-10-13T15:52:39 184 352K 341K
+ 17857525.28 COMPLETED 10 1 2021-10-13T15:52:41 2021-10-13T15:52:41 2021-10-13T15:52:43 184 352K 326.20K
+ 17857525.29 COMPLETED 10 1 2021-10-13T15:52:44 2021-10-13T15:52:44 2021-10-13T15:52:47 183 352K 319.30K
diff --git a/autosubmit/history/platform_monitor/platform_monitor.py b/autosubmit/history/platform_monitor/platform_monitor.py
new file mode 100644
index 0000000000000000000000000000000000000000..8439190cc7b8c25108689d40d6838abf2dd38f76
--- /dev/null
+++ b/autosubmit/history/platform_monitor/platform_monitor.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+
+# Copyright 2015-2020 Earth Sciences Department, BSC-CNS
+# This file is part of Autosubmit.
+
+# Autosubmit is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# Autosubmit is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with Autosubmit. If not, see <http://www.gnu.org/licenses/>.
+
+from abc import ABCMeta, abstractmethod
+
+class PlatformMonitor():
+    __metaclass__ = ABCMeta
+    def __init__(self, platform_output):
+        self.original_input = platform_output
+        self.input = str(platform_output).strip()
+
+    @abstractmethod
+    def _identify_input_rows(self):
+        """ Parse self.input into monitor items. """
+
diff --git a/autosubmit/history/platform_monitor/platform_utils.py b/autosubmit/history/platform_monitor/platform_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..43a015ab9bf03302bb012eff24a16935e3a1a6f0
--- /dev/null
+++ b/autosubmit/history/platform_monitor/platform_utils.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+
+# Copyright 2015-2020 Earth Sciences Department, BSC-CNS
+# This file is part of Autosubmit.
+
+# Autosubmit is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# Autosubmit is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with Autosubmit. If not, see <http://www.gnu.org/licenses/>.
+
+import os
+from time import mktime
+from datetime import datetime
+
+SLURM_DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S"
+
+def parse_output_number(string_number):
+    """
+    Parses a number in the formats 1.0K, 1.0M, 1.0G.
+
+    :param string_number: String representation of number
+    :type string_number: str
+    :return: number in float format
+    :rtype: float
+    """
+    number = 0.0
+    if string_number:
+        last_letter = string_number.strip()[-1]
+        multiplier = 1.0
+        if last_letter == "G":
+            multiplier = 1000000000.0  # Billion
+            string_number = string_number[:-1]
+        elif last_letter == "M":
+            multiplier = 1000000.0  # Million
+            string_number = string_number[:-1]
+        elif last_letter == "K":
+            multiplier = 1000.0  # Thousand
+            string_number = string_number[:-1]
+        try:
+            number = float(string_number) * multiplier
+        except Exception:
+            number = 0.0
+    return number
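+
+# Examples, given the suffix rules above: parse_output_number("2.69K") -> 2690.0,
+# parse_output_number("1.5M") -> 1500000.0, parse_output_number("2G") -> 2000000000.0,
+# parse_output_number("186") -> 186.0; empty or unparsable input yields 0.0.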
+
+def try_parse_time_to_timestamp(input):
+    """
+    Receives a string in the format "%Y-%m-%dT%H:%M:%S" and tries to parse it to a timestamp; returns 0 on failure.
+    """
+    try:
+        return int(mktime(datetime.strptime(input, SLURM_DATETIME_FORMAT).timetuple()))
+    except Exception:
+        return 0
+
+def read_example(example_name):
+    source_path = "autosubmit/history/platform_monitor/output_examples/"
+    file_path = os.path.join(source_path, example_name)
+    with open(file_path, "r") as fp:
+        output_ssh = fp.read()
+    return output_ssh
\ No newline at end of file
diff --git a/autosubmit/history/platform_monitor/slurm_monitor.py b/autosubmit/history/platform_monitor/slurm_monitor.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c6d382004b44841daf677edf502140a885e6d98
--- /dev/null
+++ b/autosubmit/history/platform_monitor/slurm_monitor.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+
+# Copyright 2015-2020 Earth Sciences Department, BSC-CNS
+# This file is part of Autosubmit.
+
+# Autosubmit is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# Autosubmit is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with Autosubmit. If not, see <http://www.gnu.org/licenses/>.
+
+from platform_monitor import PlatformMonitor
+from slurm_monitor_item import SlurmMonitorItem
+
+class SlurmMonitor(PlatformMonitor):
+    """ Manages Slurm command output interpretation. """
+    def __init__(self, platform_output):
+        super(SlurmMonitor, self).__init__(platform_output)
+        self._identify_input_rows()
+
+    def _identify_input_rows(self):
+        lines = self.input.split("\n")
+        self.input_items = [SlurmMonitorItem.from_line(line) for line in lines]
+
+    @property
+    def steps_energy(self):
+        return sum([step.energy for step in self.input_items if step.is_step])
+
+    @property
+    def total_energy(self):
+        return max(self.header.energy, self.steps_energy + self.extern.energy)
+
+    @property
+    def step_count(self):
+        return len([step for step in self.input_items if step.is_step])
+
+    @property
+    def steps(self):
+        return [item for item in self.input_items if item.is_step]
+
+    @property
+    def header(self):
+        return next((header for header in self.input_items if header.is_header), None)
+
+    @property
+    def batch(self):
+        return next((batch for batch in self.input_items if batch.is_batch), None)
+
+    @property
+    def extern(self):
+        return next((extern for extern in self.input_items if extern.is_extern), None)
+
+    def steps_plus_extern_approximate_header_energy(self):
+        return abs(self.steps_energy + self.extern.energy - self.header.energy) <= 10
+
+    def print_items(self):
+        for item in self.input_items:
+            print(item)
diff --git a/autosubmit/history/platform_monitor/slurm_monitor_item.py b/autosubmit/history/platform_monitor/slurm_monitor_item.py
new file mode 100644
index 0000000000000000000000000000000000000000..a990315f9ce9e4436306b86eb152a49cdab6d6b4
--- /dev/null
+++ b/autosubmit/history/platform_monitor/slurm_monitor_item.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+
+# Copyright 2015-2020 Earth Sciences Department, BSC-CNS
+# This file is part of Autosubmit.
+
+# Autosubmit is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# Autosubmit is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with Autosubmit. If not, see <http://www.gnu.org/licenses/>.
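+
+# Positional layout assumed for each parsed sacct output line (see from_line below):
+#   name status ncpus nnodes submit start finish energy [MaxRSS AveRSS]
+# e.g. "12535498.batch COMPLETED 2 1 2020-11-18T13:55:55 2020-11-18T13:55:55 2020-11-18T13:56:10 2.69K 659K 659K"
+# is a ".batch" detail row; "2.69K" parses to 2690.0 via platform_utils.parse_output_number.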
+
+import platform_utils as utils
+
+class SlurmMonitorItem():
+    def __init__(self, name, status, ncpus, nnodes, submit, start, finish, energy="0", MaxRSS=0.0, AveRSS=0.0):
+        self.name = str(name)
+        self.status = str(status)
+        self.ncpus = int(ncpus)
+        self.nnodes = int(nnodes)
+        self.submit = utils.try_parse_time_to_timestamp(submit)
+        self.start = utils.try_parse_time_to_timestamp(start)
+        self.finish = utils.try_parse_time_to_timestamp(finish)
+        self.energy_str = energy
+        self.energy = utils.parse_output_number(energy)
+        self.MaxRSS = utils.parse_output_number(MaxRSS)
+        self.AveRSS = utils.parse_output_number(AveRSS)
+
+    @property
+    def is_header(self):
+        return not self.is_detail
+
+    @property
+    def is_detail(self):
+        return self.name.find(".") >= 0
+
+    @property
+    def is_extern(self):
+        return self.name.find(".ext") >= 0
+
+    @property
+    def is_batch(self):
+        return self.name.find(".bat") >= 0
+
+    @property
+    def step_number(self):
+        if self.is_step:
+            point_loc = self.name.find(".")
+            return int(self.name[point_loc + 1:])
+        return -1
+
+    @property
+    def is_step(self):
+        return self.is_detail and not self.is_batch and not self.is_extern
+
+    @classmethod
+    def from_line(cls, line):
+        line = line.strip().split()
+        if len(line) < 2:
+            raise Exception("Slurm parser found a line too short: {0}".format(line))
+        new_item = cls(line[0],
+                       line[1],
+                       str(line[2]) if len(line) > 2 else 0,
+                       str(line[3]) if len(line) > 3 else 0,
+                       str(line[4]) if len(line) > 4 else 0,
+                       str(line[5]) if len(line) > 5 else 0,
+                       str(line[6]) if len(line) > 6 else 0,
+                       str(line[7]) if len(line) > 7 else 0,
+                       str(line[8]) if len(line) > 8 else 0,
+                       str(line[9]) if len(line) > 9 else 0)
+        return new_item
+
+    def get_as_dict(self):
+        return {"ncpus": self.ncpus,
+                "nnodes": self.nnodes,
+                "submit": self.submit,
+                "start": self.start,
+                "finish": self.finish,
+                "energy": self.energy,
+                "MaxRSS": self.MaxRSS,
+                "AveRSS": self.AveRSS}
+
+    def __str__(self):
+        return "Name {0}, Status {1}, NCpus {2}, NNodes {3}, Submit {4}, Start {5}, Finish {6}, Energy {7}, MaxRSS {8}, AveRSS {9} [Energy Str {10}]".format(
+            self.name, self.status, self.ncpus, self.nnodes, self.submit, self.start, self.finish, self.energy, self.MaxRSS, self.AveRSS, self.energy_str)
\ No newline at end of file
diff --git a/autosubmit/history/platform_monitor/test.py b/autosubmit/history/platform_monitor/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f910037183766e7d8a6834ce1770985556f124c
--- /dev/null
+++ b/autosubmit/history/platform_monitor/test.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+
+# Copyright 2015-2020 Earth Sciences Department, BSC-CNS
+# This file is part of Autosubmit.
+
+# Autosubmit is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# Autosubmit is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with Autosubmit. If not, see <http://www.gnu.org/licenses/>.
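+
+# These tests parse the sample sacct outputs in output_examples/; platform_utils.read_example
+# resolves that directory relative to the current working directory, so run them from the
+# repository root, e.g.:
+#
+#   python autosubmit/history/platform_monitor/test.py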
+
+import unittest
+import platform_utils as utils
+from slurm_monitor import SlurmMonitor
+
+class TestSlurmMonitor(unittest.TestCase):
+    def test_reader_on_simple_wrapper_example_1(self):
+        ssh_output = utils.read_example("wrapper1.txt")
+        slurm_monitor = SlurmMonitor(ssh_output)
+        # Header
+        self.assertTrue(slurm_monitor.input_items[0].is_batch == False)
+        self.assertTrue(slurm_monitor.input_items[0].is_detail == False)
+        self.assertTrue(slurm_monitor.input_items[0].is_extern == False)
+        self.assertTrue(slurm_monitor.input_items[0].is_header == True)
+        # Batch
+        self.assertTrue(slurm_monitor.input_items[1].is_batch == True)
+        self.assertTrue(slurm_monitor.input_items[1].is_detail == True)
+        self.assertTrue(slurm_monitor.input_items[1].is_extern == False)
+        self.assertTrue(slurm_monitor.input_items[1].is_header == False)
+        # Extern
+        self.assertTrue(slurm_monitor.input_items[2].is_batch == False)
+        self.assertTrue(slurm_monitor.input_items[2].is_detail == True)
+        self.assertTrue(slurm_monitor.input_items[2].is_extern == True)
+        self.assertTrue(slurm_monitor.input_items[2].is_header == False)
+        header = slurm_monitor.header
+        batch = slurm_monitor.batch
+        extern = slurm_monitor.extern
+        self.assertIsNotNone(header)
+        self.assertIsNotNone(batch)
+        self.assertIsNotNone(extern)
+        self.assertTrue(slurm_monitor.steps_plus_extern_approximate_header_energy())
+
+    def test_reader_on_simple_wrapper_example_2(self):
+        ssh_output = utils.read_example("wrapper2.txt")  # not real data
+        slurm_monitor = SlurmMonitor(ssh_output)
+        # Header
+        self.assertTrue(slurm_monitor.input_items[0].is_batch == False)
+        self.assertTrue(slurm_monitor.input_items[0].is_detail == False)
+        self.assertTrue(slurm_monitor.input_items[0].is_step == False)
+        self.assertTrue(slurm_monitor.input_items[0].is_extern == False)
+        self.assertTrue(slurm_monitor.input_items[0].is_header == True)
+        # Batch
+        self.assertTrue(slurm_monitor.input_items[1].is_batch == True)
+        self.assertTrue(slurm_monitor.input_items[1].is_detail == True)
+        self.assertTrue(slurm_monitor.input_items[1].is_step == False)
+        self.assertTrue(slurm_monitor.input_items[1].is_extern == False)
+        self.assertTrue(slurm_monitor.input_items[1].is_header == False)
+        # Step 0
+        self.assertTrue(slurm_monitor.input_items[2].is_batch == False)
+        self.assertTrue(slurm_monitor.input_items[2].is_detail == True)
+        self.assertTrue(slurm_monitor.input_items[2].is_step == True)
+        self.assertTrue(slurm_monitor.input_items[2].is_extern == False)
+        self.assertTrue(slurm_monitor.input_items[2].is_header == False)
+        self.assertTrue(slurm_monitor.input_items[2].step_number >= 0)
+
+    def test_reader_on_big_wrapper(self):
+        ssh_output = utils.read_example("wrapper_big.txt")
+        slurm_monitor = SlurmMonitor(ssh_output)
+        self.assertTrue(slurm_monitor.step_count == 30)
+        header = slurm_monitor.header
+        batch = slurm_monitor.batch
+        extern = slurm_monitor.extern
+        self.assertIsNotNone(header)
+        self.assertIsNotNone(batch)
+        self.assertIsNotNone(extern)
+        self.assertTrue(slurm_monitor.steps_plus_extern_approximate_header_energy())
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/autosubmit/history/strategies.py b/autosubmit/history/strategies.py
new file mode 100644
index 0000000000000000000000000000000000000000..976904f7419118981505935dadf2b8b181136dbb
--- /dev/null
+++ b/autosubmit/history/strategies.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python
+
+# Copyright 2015-2020 Earth Sciences Department, BSC-CNS
+# This file is part of Autosubmit.
+
+# Autosubmit is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# Autosubmit is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with Autosubmit. If not, see <http://www.gnu.org/licenses/>.
+
+from abc import ABCMeta, abstractmethod
+import database_managers.database_models as Models
+import traceback
+from internal_logging import Logging
+
+class PlatformInformationHandler():
+    def __init__(self, strategy):
+        self._strategy = strategy
+
+    @property
+    def strategy(self):
+        return self._strategy
+
+    @strategy.setter
+    def strategy(self, strategy):
+        self._strategy = strategy
+
+    def execute_distribution(self, job_data_dc, job_data_dcs_in_wrapper, slurm_monitor):
+        return self._strategy.apply_distribution(job_data_dc, job_data_dcs_in_wrapper, slurm_monitor)
+
+class Strategy():
+    """ Strategy interface. """
+    __metaclass__ = ABCMeta
+
+    @abstractmethod
+    def apply_distribution(self, job_data_dc, job_data_dcs_in_wrapper, slurm_monitor):
+        pass
+
+    def set_job_data_dc_as_processed(self, job_data_dc, original_ssh_output):
+        job_data_dc.platform_output = original_ssh_output
+        job_data_dc.rowstatus = Models.RowStatus.PROCESSED
+        return job_data_dc
+
+    def set_job_data_dc_as_process_failed(self, job_data_dc, original_ssh_output):
+        job_data_dc.platform_output = original_ssh_output
+        job_data_dc.rowstatus = Models.RowStatus.FAULTY
+        return job_data_dc
+
+    def get_calculated_weights_of_jobs_in_wrapper(self, job_data_dcs_in_wrapper):
+        """ Based on computational weight: running time in seconds times number of cpus. """
+        total_weight = sum(job.computational_weight for job in job_data_dcs_in_wrapper)
+        return {job.job_name: round(job.computational_weight / total_weight, 4) for job in job_data_dcs_in_wrapper}
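+
+    # Example: two wrapped jobs with computational weights 120.0 (60 s x 2 cpus) and
+    # 60.0 yield {"job_a": 0.6667, "job_b": 0.3333} (hypothetical job names).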
""" + total_weight = sum(job.computational_weight for job in job_data_dcs_in_wrapper) + return {job.job_name: round(job.computational_weight/total_weight, 4) for job in job_data_dcs_in_wrapper} + + +class SingleAssociationStrategy(Strategy): + def apply_distribution(self, job_data_dc, job_data_dcs_in_wrapper, slurm_monitor): + try: + if len(job_data_dcs_in_wrapper) > 0: + return [] + job_data_dc.submit = slurm_monitor.header.submit + job_data_dc.start = slurm_monitor.header.start + job_data_dc.finish = slurm_monitor.header.finish + job_data_dc.ncpus = slurm_monitor.header.ncpus + job_data_dc.nnodes = slurm_monitor.header.nnodes + job_data_dc.energy = slurm_monitor.header.energy + job_data_dc.MaxRSS = max(slurm_monitor.header.MaxRSS, slurm_monitor.batch.MaxRSS if slurm_monitor.batch else 0, slurm_monitor.extern.MaxRSS if slurm_monitor.extern else 0) # TODO: Improve this rule + job_data_dc.AveRSS = max(slurm_monitor.header.AveRSS, slurm_monitor.batch.AveRSS if slurm_monitor.batch else 0, slurm_monitor.extern.AveRSS if slurm_monitor.extern else 0) + job_data_dc = self.set_job_data_dc_as_processed(job_data_dc, slurm_monitor.original_input) + return [job_data_dc] + except Exception as exp: + Logging("strategies").log("SingleAssociationStrategy failed for {0}. Using ssh_output: {1}. Exception message: {2}".format(job_data_dc.job_name, slurm_monitor.original_input, str(exp)), + traceback.format_exc()) + job_data_dc = self.set_job_data_dc_as_process_failed(job_data_dc, slurm_monitor.original_input) + return [job_data_dc] + +class StraightWrapperAssociationStrategy(Strategy): + def apply_distribution(self, job_data_dc, job_data_dcs_in_wrapper, slurm_monitor): + """ """ + try: + if len(job_data_dcs_in_wrapper) != slurm_monitor.step_count: + return [] + result = [] + computational_weights = self.get_calculated_weights_of_jobs_in_wrapper(job_data_dcs_in_wrapper) + for job_dc, step in zip(job_data_dcs_in_wrapper, slurm_monitor.steps): + job_dc.energy = step.energy + computational_weights.get(job_dc.job_name, 0) * slurm_monitor.extern.energy + job_dc.AveRSS = step.AveRSS + job_dc.MaxRSS = step.MaxRSS + job_dc.platform_output = "" + result.append(job_dc) + job_data_dc = self.set_job_data_dc_as_processed(job_data_dc, slurm_monitor.original_input) + result.append(job_data_dc) + return result + except Exception as exp: + Logging("strategies").log("StraightWrapperAssociationStrategy failed for {0}. Using ssh_output: {1}. Exception message: {2}".format(job_data_dc.job_name, slurm_monitor.original_input, str(exp)), + traceback.format_exc()) + job_data_dc = self.set_job_data_dc_as_process_failed(job_data_dc, slurm_monitor.original_input) + return [job_data_dc] + +class GeneralizedWrapperDistributionStrategy(Strategy): + def apply_distribution(self, job_data_dc, job_data_dcs_in_wrapper, slurm_monitor): + try: + result = [] + computational_weights = self.get_calculated_weights_of_jobs_in_wrapper(job_data_dcs_in_wrapper) + for job_dc in job_data_dcs_in_wrapper: + job_dc.energy = round(computational_weights.get(job_dc.job_name, 0) * slurm_monitor.total_energy,2) + job_dc.platform_output = "" + result.append(job_dc) + job_data_dc = self.set_job_data_dc_as_processed(job_data_dc, slurm_monitor.original_input) + result.append(job_data_dc) + return result + except Exception as exp: + Logging("strategies").log("GeneralizedWrapperDistributionStrategy failed for {0}. Using ssh_output: {1}. 
Exception message: {2}".format(job_data_dc.job_name, slurm_monitor.original_input, str(exp)), + traceback.format_exc()) + job_data_dc = self.set_job_data_dc_as_process_failed(job_data_dc, slurm_monitor.original_input) + return [job_data_dc] + +class TwoDimWrapperDistributionStrategy(Strategy): + def apply_distribution(self, job_data_dc, job_data_dcs_in_wrapper, slurm_monitor): + try: + result = [] + self.jobs_per_level = self.get_jobs_per_level(job_data_dcs_in_wrapper) + if len(self.jobs_per_level) != slurm_monitor.step_count: + return [] + comp_weight_per_level = self.get_comp_weight_per_level(self.jobs_per_level) + level_energy = [] + for i, step in enumerate(slurm_monitor.steps): + level_energy.append(step.energy + comp_weight_per_level[i] * slurm_monitor.extern.energy) + for i, jobs in enumerate(self.jobs_per_level): + weights = self.get_comp_weight_per_group_of_job_dcs(jobs) + for j, job_dc in enumerate(jobs): + job_dc.energy = round(level_energy[i] * weights[j], 2) + result.append(job_dc) + job_data_dc = self.set_job_data_dc_as_processed(job_data_dc, slurm_monitor.original_input) + result.append(job_data_dc) + return result + except Exception as exp: + Logging("strategies").log("TwoDimWrapperDistributionStrategy failed for {0}. Using ssh_output: {1}. Exception message: {2}".format(job_data_dc.job_name, slurm_monitor.original_input, str(exp)), + traceback.format_exc()) + job_data_dc = self.set_job_data_dc_as_process_failed(job_data_dc, slurm_monitor.original_input) + return [job_data_dc] + + def get_jobs_per_level(self, job_data_dcs_in_wrapper): + """ List of Lists, index of list is the level. """ + job_name_to_object = {job.job_name: job for job in job_data_dcs_in_wrapper} + levels = [] + roots_dcs = self._get_roots(job_data_dcs_in_wrapper) + levels.append(roots_dcs) + next_level = self.get_level(roots_dcs, job_name_to_object) + while len(next_level) > 0: + levels.append([job for job in next_level]) + next_level = self.get_level(next_level, job_name_to_object) + return levels + + def _get_roots(self, job_data_dcs_in_wrapper): + children_names = self._get_all_children(job_data_dcs_in_wrapper) + return [job for job in job_data_dcs_in_wrapper if job.job_name not in children_names] + + def _get_all_children(self, job_data_dcs_in_wrapper): + result = [] + for job_dc in job_data_dcs_in_wrapper: + result.extend(job_dc.children_list) + return result + + def get_comp_weight_per_group_of_job_dcs(self, jobs): + total = sum(job.computational_weight for job in jobs) + return [round(job.computational_weight/total, 4) for job in jobs] + + def get_comp_weight_per_level(self, jobs_per_level): + level_weight = [] + total_weight = 0 + for jobs in jobs_per_level: + computational_weight = sum(job.computational_weight for job in jobs) + total_weight += computational_weight + level_weight.append(computational_weight) + return [round(weight/total_weight, 4) for weight in level_weight] + + def get_level(self, previous_level_dcs, job_name_to_object): + children_names = [] + for job_dc in previous_level_dcs: + children_names.extend(job_dc.children_list) + level_dcs = [job_name_to_object[job_name] for job_name in children_names if job_name in job_name_to_object] + return level_dcs + + \ No newline at end of file diff --git a/autosubmit/history/test.py b/autosubmit/history/test.py new file mode 100644 index 0000000000000000000000000000000000000000..b12112e716763d4a5cbf6b0cf863fffdb0ce7bf4 --- /dev/null +++ b/autosubmit/history/test.py @@ -0,0 +1,305 @@ +#!/usr/bin/python + +# Copyright 2015-2020 Earth Sciences 
Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . + +import unittest +import traceback +import os +import time +from shutil import copy2 +from collections import namedtuple +from experiment_history import ExperimentHistory +from internal_logging import Logging +from strategies import StraightWrapperAssociationStrategy, GeneralizedWrapperDistributionStrategy, PlatformInformationHandler +from autosubmit.config.basicConfig import BasicConfig +from platform_monitor.slurm_monitor import SlurmMonitor +EXPID_TT00_SOURCE = "test_database.db~" +EXPID_TT01_SOURCE = "test_database_no_run.db~" +EXPID = "tt00" +EXPID_NONE = "tt01" +BasicConfig.read() +JOBDATA_DIR = BasicConfig.JOBDATA_DIR +LOCAL_ROOT_DIR = BasicConfig.LOCAL_ROOT_DIR +job = namedtuple("Job", ["name", "date", "member", "status_str", "children"]) + +class TestExperimentHistory(unittest.TestCase): + # @classmethod + # def setUpClass(cls): + # cls.exp = ExperimentHistory("tt00") # example database + def setUp(self): + source_path_tt00 = os.path.join(JOBDATA_DIR, EXPID_TT00_SOURCE) + self.target_path_tt00 = os.path.join(JOBDATA_DIR, "job_data_{0}.db".format(EXPID)) + copy2(source_path_tt00, self.target_path_tt00) + source_path_tt01 = os.path.join(JOBDATA_DIR, EXPID_TT01_SOURCE) + self.target_path_tt01 = os.path.join(JOBDATA_DIR, "job_data_{0}.db".format(EXPID_NONE)) + copy2(source_path_tt01, self.target_path_tt01) + self.job_list = [ + job("a29z_20000101_fc2_1_POST", "2000-01-01 00:00:00", "POST", "COMPLETED", ""), + job("a29z_20000101_fc1_1_CLEAN", "2000-01-01 00:00:00", "CLEAN", "COMPLETED", ""), + job("a29z_20000101_fc3_1_POST", "2000-01-01 00:00:00", "POST", "RUNNING", ""), + job("a29z_20000101_fc2_1_CLEAN", "2000-01-01 00:00:00", "CLEAN", "COMPLETED", ""), + job("a29z_20000101_fc0_3_SIM", "2000-01-01 00:00:00", "SIM", "COMPLETED", ""), + job("a29z_20000101_fc1_2_POST", "2000-01-01 00:00:00", "POST", "QUEUING", ""), + ] # 2 differences, all COMPLETED + self.job_list_large = [ + job("a29z_20000101_fc2_1_POST", "2000-01-01 00:00:00", "POST", "COMPLETED", ""), + job("a29z_20000101_fc1_1_CLEAN", "2000-01-01 00:00:00", "CLEAN", "COMPLETED", ""), + job("a29z_20000101_fc3_1_POST", "2000-01-01 00:00:00", "POST", "RUNNING", ""), + job("a29z_20000101_fc2_1_CLEAN", "2000-01-01 00:00:00", "CLEAN", "COMPLETED", ""), + job("a29z_20000101_fc0_3_SIM", "2000-01-01 00:00:00", "SIM", "COMPLETED", ""), + job("a29z_20000101_fc1_2_POST", "2000-01-01 00:00:00", "POST", "QUEUING", ""), + job("a29z_20000101_fc1_5_POST", "2000-01-01 00:00:00", "POST", "SUSPENDED", ""), + job("a29z_20000101_fc1_4_POST", "2000-01-01 00:00:00", "POST", "FAILED", ""), + job("a29z_20000101_fc2_5_CLEAN", "2000-01-01 00:00:00", "CLEAN", "SUBMITTED", ""), + job("a29z_20000101_fc0_1_POST", "2000-01-01 00:00:00", "POST", "RUNNING", ""), + ] + + def tearDown(self): + os.remove(self.target_path_tt00) + os.remove(self.target_path_tt01) + + def test_db_exists(self): + exp_history = 
ExperimentHistory("tt00") + exp_history.initialize_database() + self.assertTrue(exp_history.manager.my_database_exists() == True) + exp_history = ExperimentHistory("tt99") + self.assertTrue(exp_history.manager.my_database_exists() == False) + + def test_is_header_ready(self): + exp_history = ExperimentHistory("tt00") + self.assertTrue(exp_history.is_header_ready() == True) + + def test_detect_differences_job_list(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + differences = exp_history.detect_changes_in_job_list(self.job_list) + expected_differences = ["a29z_20000101_fc3_1_POST", "a29z_20000101_fc1_2_POST"] + for job_dc in differences: + self.assertTrue(job_dc.job_name in expected_differences) + self.assertTrue(len(differences) == 2) + + def test_built_list_of_changes(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + built_differences = exp_history._get_built_list_of_changes(self.job_list) + expected_ids_differences = [90, 101] + for item in built_differences: + self.assertTrue(item[3] in expected_ids_differences) + + def test_get_date_member_count(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + dm_count = exp_history._get_date_member_completed_count(self.job_list) + self.assertTrue(dm_count > 0) + + def test_should_we_create_new_run(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + CHANGES_COUNT = 1 + TOTAL_COUNT = 6 + should_we = exp_history.should_we_create_a_new_run(self.job_list, CHANGES_COUNT, TOTAL_COUNT) + self.assertTrue(should_we == False) + TOTAL_COUNT_DIFF = 5 + should_we = exp_history.should_we_create_a_new_run(self.job_list, CHANGES_COUNT, TOTAL_COUNT_DIFF) + self.assertTrue(should_we == True) + CHANGES_COUNT = 5 + should_we = exp_history.should_we_create_a_new_run(self.job_list, CHANGES_COUNT, TOTAL_COUNT) + self.assertTrue(should_we == True) + + def test_status_counts(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + result = exp_history.get_status_counts_from_job_list(self.job_list_large) + self.assertTrue(result["COMPLETED"] == 4) + self.assertTrue(result["QUEUING"] == 1) + self.assertTrue(result["RUNNING"] == 2) + self.assertTrue(result["FAILED"] == 1) + self.assertTrue(result["SUSPENDED"] == 1) + self.assertTrue(result["TOTAL"] == len(self.job_list_large)) + + def test_create_new_experiment_run_with_counts(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + exp_run = exp_history.create_new_experiment_run(job_list=self.job_list) + self.assertTrue(exp_run.chunk_size == 0) + self.assertTrue(exp_run.chunk_unit == "NA") + self.assertTrue(exp_run.total == len(self.job_list)) + self.assertTrue(exp_run.completed == 4) + + def test_finish_current_run(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + exp_run = exp_history.finish_current_experiment_run() + self.assertTrue(len(exp_run.modified) > 0) + self.assertTrue(exp_run.finish > 0) + + def test_process_job_list_changes(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + exp_run = exp_history.process_job_list_changes_to_experiment_totals(self.job_list) + self.assertTrue(exp_run.total == len(self.job_list)) + self.assertTrue(exp_run.completed == 4) + self.assertTrue(exp_run.running == 1) + self.assertTrue(exp_run.queuing == 1) + + def test_calculated_weights(self): + exp_history = ExperimentHistory("tt00") + 
exp_history.initialize_database() + job_data_dcs = exp_history.manager.get_all_last_job_data_dcs() + calculated_weights = GeneralizedWrapperDistributionStrategy().get_calculated_weights_of_jobs_in_wrapper(job_data_dcs) + sum_comp_weight = 0 + for job_name in calculated_weights: + sum_comp_weight += calculated_weights[job_name] + self.assertTrue(abs(sum_comp_weight - 1) <= 0.01) + + def test_distribute_energy_in_wrapper_1_to_1(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + ssh_output = ''' 17857525 COMPLETED 10 1 2021-10-13T15:51:16 2021-10-13T15:51:17 2021-10-13T15:52:47 2.41K + 17857525.batch COMPLETED 10 1 2021-10-13T15:51:17 2021-10-13T15:51:17 2021-10-13T15:52:47 1.88K 6264K 6264K + 17857525.extern COMPLETED 10 1 2021-10-13T15:51:17 2021-10-13T15:51:17 2021-10-13T15:52:47 1.66K 473K 68K + 17857525.0 COMPLETED 10 1 2021-10-13T15:51:21 2021-10-13T15:51:21 2021-10-13T15:51:22 186 352K 312.30K + 17857525.1 COMPLETED 10 1 2021-10-13T15:51:23 2021-10-13T15:51:23 2021-10-13T15:51:24 186 420K 306.70K + 17857525.2 COMPLETED 10 1 2021-10-13T15:51:24 2021-10-13T15:51:24 2021-10-13T15:51:27 188 352K 325.80K + 17857525.3 COMPLETED 10 1 2021-10-13T15:51:28 2021-10-13T15:51:28 2021-10-13T15:51:29 192 352K 341.90K + ''' + slurm_monitor = SlurmMonitor(ssh_output) + job_data_dcs = exp_history.manager.get_all_last_job_data_dcs()[:4] # Get me 4 jobs + weights = StraightWrapperAssociationStrategy().get_calculated_weights_of_jobs_in_wrapper(job_data_dcs) + info_handler = PlatformInformationHandler(StraightWrapperAssociationStrategy()) + job_data_dcs_with_data = info_handler.execute_distribution(job_data_dcs[0], job_data_dcs, slurm_monitor) + self.assertTrue(job_data_dcs_with_data[0].energy == round(slurm_monitor.steps[0].energy + weights[job_data_dcs_with_data[0].job_name]*slurm_monitor.extern.energy, 2)) + self.assertTrue(job_data_dcs_with_data[0].MaxRSS == slurm_monitor.steps[0].MaxRSS) + self.assertTrue(job_data_dcs_with_data[2].energy == round(slurm_monitor.steps[2].energy + weights[job_data_dcs_with_data[2].job_name]*slurm_monitor.extern.energy, 2)) + self.assertTrue(job_data_dcs_with_data[2].AveRSS == slurm_monitor.steps[2].AveRSS) + + def test_distribute_energy_in_wrapper_general_case(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + ssh_output = ''' 17857525 COMPLETED 10 1 2021-10-13T15:51:16 2021-10-13T15:51:17 2021-10-13T15:52:47 2.41K + 17857525.batch COMPLETED 10 1 2021-10-13T15:51:17 2021-10-13T15:51:17 2021-10-13T15:52:47 1.88K 6264K 6264K + 17857525.extern COMPLETED 10 1 2021-10-13T15:51:17 2021-10-13T15:51:17 2021-10-13T15:52:47 1.66K 473K 68K + 17857525.0 COMPLETED 10 1 2021-10-13T15:51:21 2021-10-13T15:51:21 2021-10-13T15:51:22 186 352K 312.30K + 17857525.1 COMPLETED 10 1 2021-10-13T15:51:23 2021-10-13T15:51:23 2021-10-13T15:51:24 186 420K 306.70K + 17857525.2 COMPLETED 10 1 2021-10-13T15:51:24 2021-10-13T15:51:24 2021-10-13T15:51:27 188 352K 325.80K + 17857525.3 COMPLETED 10 1 2021-10-13T15:51:28 2021-10-13T15:51:28 2021-10-13T15:51:29 192 352K 341.90K + ''' + slurm_monitor = SlurmMonitor(ssh_output) + job_data_dcs = exp_history.manager.get_all_last_job_data_dcs()[:5] # Get me 5 jobs + weights = GeneralizedWrapperDistributionStrategy().get_calculated_weights_of_jobs_in_wrapper(job_data_dcs) + # print(sum(weights[k] for k in weights)) + info_handler = PlatformInformationHandler(GeneralizedWrapperDistributionStrategy()) + job_data_dcs_with_data = info_handler.execute_distribution(job_data_dcs[0], job_data_dcs, 
slurm_monitor) + self.assertTrue(job_data_dcs_with_data[0].energy == round(slurm_monitor.total_energy * weights[job_data_dcs_with_data[0].job_name], 2)) + self.assertTrue(job_data_dcs_with_data[1].energy == round(slurm_monitor.total_energy * weights[job_data_dcs_with_data[1].job_name], 2)) + self.assertTrue(job_data_dcs_with_data[2].energy == round(slurm_monitor.total_energy * weights[job_data_dcs_with_data[2].job_name], 2)) + self.assertTrue(job_data_dcs_with_data[3].energy == round(slurm_monitor.total_energy * weights[job_data_dcs_with_data[3].job_name], 2)) + self.assertTrue(job_data_dcs_with_data[4].energy == round(slurm_monitor.total_energy * weights[job_data_dcs_with_data[4].job_name], 2)) + sum_energy = sum(job.energy for job in job_data_dcs_with_data[:5]) # Last 1 is original job_data_dc + self.assertTrue(abs(sum_energy - slurm_monitor.total_energy) <= 10) + + def test_process_status_changes(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + CHUNK_UNIT = "month" + CHUNK_SIZE = 20 + CURRENT_CONFIG = "CURRENT CONFIG" + current_experiment_run_dc = exp_history.manager.get_experiment_run_dc_with_max_id() + exp_run = exp_history.process_status_changes(job_list=self.job_list, chunk_unit=CHUNK_UNIT, chunk_size=CHUNK_SIZE, current_config=CURRENT_CONFIG) # Generates new run + self.assertTrue(current_experiment_run_dc.run_id != exp_run.run_id) + self.assertTrue(exp_run.chunk_unit == CHUNK_UNIT) + self.assertTrue(exp_run.metadata == CURRENT_CONFIG) + self.assertTrue(exp_run.total == len(self.job_list)) + current_experiment_run_dc = exp_history.manager.get_experiment_run_dc_with_max_id() + exp_run = exp_history.process_status_changes(job_list=self.job_list, chunk_unit=CHUNK_UNIT, chunk_size=CHUNK_SIZE, current_config=CURRENT_CONFIG) # Same run + self.assertTrue(current_experiment_run_dc.run_id == exp_run.run_id) + new_job_list = [ + job("a29z_20000101_fc2_1_POST", "2000-01-01 00:00:00", "POST", "FAILED", ""), + job("a29z_20000101_fc1_1_CLEAN", "2000-01-01 00:00:00", "CLEAN", "FAILED", ""), + job("a29z_20000101_fc3_1_POST", "2000-01-01 00:00:00", "POST", "RUNNING", ""), + job("a29z_20000101_fc2_1_CLEAN", "2000-01-01 00:00:00", "CLEAN", "FAILED", ""), + job("a29z_20000101_fc0_3_SIM", "2000-01-01 00:00:00", "SIM", "FAILED", ""), + job("a29z_20000101_fc1_2_POST", "2000-01-01 00:00:00", "POST", "QUEUING", ""), + ] + current_experiment_run_dc = exp_history.manager.get_experiment_run_dc_with_max_id() + exp_run = exp_history.process_status_changes(job_list=new_job_list, chunk_unit=CHUNK_UNIT, chunk_size=CHUNK_SIZE, current_config=CURRENT_CONFIG) # Generates new run + self.assertTrue(current_experiment_run_dc.run_id != exp_run.run_id) + self.assertTrue(exp_run.total == len(new_job_list)) + self.assertTrue(exp_run.failed == 4) + + def test_write_submit_time(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + JOB_NAME = "a29z_20000101_fc2_1_SIM" + NCPUS = 128 + PLATFORM_NAME = "marenostrum4" + JOB_ID = 101 + inserted_job_data_dc = exp_history.write_submit_time(JOB_NAME, time.time(), "SUBMITTED", NCPUS, "00:30", "debug", "20000101", "fc2", "SIM", 1, PLATFORM_NAME, JOB_ID, "bsc_es", 1, "") + self.assertTrue(inserted_job_data_dc.job_name == JOB_NAME) + self.assertTrue(inserted_job_data_dc.ncpus == NCPUS) + self.assertTrue(inserted_job_data_dc.children == "") + self.assertTrue(inserted_job_data_dc.energy == 0) + self.assertTrue(inserted_job_data_dc.platform == PLATFORM_NAME) + self.assertTrue(inserted_job_data_dc.job_id == JOB_ID) + 
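+        # A freshly submitted row carries no platform metrics yet: energy stays 0
+        # until platform data is gathered after the job finishes, and children is
+        # empty because it was supplied as "" above.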
self.assertTrue(inserted_job_data_dc.qos == "debug") + + + def test_write_start_time(self): + exp_history = ExperimentHistory("tt00") + exp_history.initialize_database() + JOB_NAME = "a29z_20000101_fc2_1_SIM" + NCPUS = 128 + PLATFORM_NAME = "marenostrum4" + JOB_ID = 101 + inserted_job_data_dc_submit = exp_history.write_submit_time(JOB_NAME, time.time(), "SUBMITTED", NCPUS, "00:30", "debug", "20000101", "fc2", "SIM", 1, PLATFORM_NAME, JOB_ID, "bsc_es", 1, "") + inserted_job_data_dc = exp_history.write_start_time(JOB_NAME, time.time(), "RUNNING", NCPUS, "00:30", "debug", "20000101", "fc2", "SIM", 1, PLATFORM_NAME, JOB_ID, "bsc_es", 1, "") + self.assertTrue(inserted_job_data_dc.job_name == JOB_NAME) + self.assertTrue(inserted_job_data_dc.ncpus == NCPUS) + self.assertTrue(inserted_job_data_dc.children == "") + self.assertTrue(inserted_job_data_dc.energy == 0) + self.assertTrue(inserted_job_data_dc.platform == PLATFORM_NAME) + self.assertTrue(inserted_job_data_dc.job_id == JOB_ID) + self.assertTrue(inserted_job_data_dc.status == "RUNNING") + self.assertTrue(inserted_job_data_dc.qos == "debug") + + + +class TestLogging(unittest.TestCase): + + def setUp(self): + message = "No Message" + try: + raise Exception("Setup test exception") + except: + message = traceback.format_exc() + self.log = Logging("tt00") + self.exp_message = "Exception message" + self.trace_message = message + + def test_build_message(self): + message = self.log.build_message(self.exp_message, self.trace_message) + # print(message) + self.assertIsNotNone(message) + self.assertTrue(len(message) > 0) + + def test_log(self): + self.log.log(self.exp_message, self.trace_message) + + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/autosubmit/history/test_strategies.py b/autosubmit/history/test_strategies.py new file mode 100644 index 0000000000000000000000000000000000000000..d84d46b1557f38e2223c74815ebb671e84458906 --- /dev/null +++ b/autosubmit/history/test_strategies.py @@ -0,0 +1,94 @@ +#!/usr/bin/python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . 
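+
+# Reference sketch of the level grouping performed by TwoDimWrapperDistributionStrategy,
+# included as an illustration only (the tests below exercise the real implementation, and
+# nothing in this module calls this helper): jobs are grouped into dependency levels by
+# walking children_list from the roots, i.e. the jobs that appear in no other job's
+# children_list; the strategy later splits each Slurm step's energy across one level.
+def _reference_jobs_per_level(jobs):
+    by_name = {j.job_name: j for j in jobs}
+    child_names = {name for j in jobs for name in j.children_list}
+    level = [j for j in jobs if j.job_name not in child_names]  # start from the roots
+    levels = []
+    while level:
+        levels.append(level)
+        names = [n for j in level for n in j.children_list]
+        level = [by_name[n] for n in names if n in by_name]  # next level: resolvable children
+    return levels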
+
+import unittest
+from collections import namedtuple
+from data_classes.job_data import JobData
+from strategies import StraightWrapperAssociationStrategy, GeneralizedWrapperDistributionStrategy, PlatformInformationHandler, TwoDimWrapperDistributionStrategy
+from platform_monitor.slurm_monitor import SlurmMonitor
+job_dc = namedtuple("Job", ["job_name", "date", "member", "status_str", "children", "children_list"])
+
+class Test2DWrapperDistributionStrategy(unittest.TestCase):
+    def setUp(self):
+        self.strategy = TwoDimWrapperDistributionStrategy()
+        self.job_data_dcs_in_wrapper = [
+            JobData(0, job_name="a29z_20000101_fc2_1_POSTR", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children="a29z_20000101_fc1_1_CLEAN, a29z_20000101_fc3_1_POST"),
+            JobData(0, job_name="a29z_20000101_fc1_1_CLEAN", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children="a29z_20000101_fc2_1_CLEAN"),
+            JobData(0, job_name="a29z_20000101_fc3_1_POST", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children="a29z_20000101_fc0_3_SIM"),
+            JobData(0, job_name="a29z_20000101_fc2_1_CLEAN", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children=""),
+            JobData(0, job_name="a29z_20000101_fc0_3_SIM", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children=""),
+            JobData(0, job_name="a29z_20000101_fc1_2_POSTR1", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children="a29z_20000101_fc1_5_POST2"),
+            JobData(0, job_name="a29z_20000101_fc1_5_POST2", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children="a29z_20000101_fc1_4_POST3"),
+            JobData(0, job_name="a29z_20000101_fc1_4_POST3", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children="a29z_20000101_fc2_5_CLEAN4"),
+            JobData(0, job_name="a29z_20000101_fc2_5_CLEAN4", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children="a29z_20000101_fc0_1_POST5"),
+            JobData(0, job_name="a29z_20000101_fc0_1_POST5", status="COMPLETED", submit=10, start=100, finish=200, ncpus=100, energy=0, children=""),
+        ]
+
+    def test_get_all_children(self):
+        children = self.strategy._get_all_children(self.job_data_dcs_in_wrapper)
+        self.assertTrue(len(children) == 8)
+
+    def test_get_roots(self):
+        roots = self.strategy._get_roots(self.job_data_dcs_in_wrapper)
+        self.assertTrue(len(roots) == 2)
+
+    def test_get_level(self):
+        roots = self.strategy._get_roots(self.job_data_dcs_in_wrapper)
+        job_name_to_object = {job.job_name: job for job in self.job_data_dcs_in_wrapper}  # get_level expects a name-to-object map, not a name-to-children map
+        next_level = self.strategy.get_level(roots, job_name_to_object)
+        self.assertTrue(len(next_level) == 3)
+
+    def test_get_jobs_per_level(self):
+        levels = self.strategy.get_jobs_per_level(self.job_data_dcs_in_wrapper)
+        for level in levels:
+            print([job.job_name for job in level])
+        self.assertTrue(len(levels) == 5)
+        self.assertTrue("a29z_20000101_fc0_1_POST5" in [job.job_name for job in levels[4]])
+
+    def test_energy_distribution(self):
+        ssh_output = ''' 17857525 COMPLETED 10 1 2021-10-13T15:51:16 2021-10-13T15:51:17 2021-10-13T15:52:47 2.62K
+ 17857525.batch COMPLETED 10 1 2021-10-13T15:51:17 2021-10-13T15:51:17 2021-10-13T15:52:47 1.88K 6264K 6264K
+ 17857525.extern COMPLETED 10 1 2021-10-13T15:51:17 2021-10-13T15:51:17 2021-10-13T15:52:47 1.66K 473K 68K
+ 17857525.0 COMPLETED 10 1 2021-10-13T15:51:21 2021-10-13T15:51:21 2021-10-13T15:51:22 186 352K 
312.30K + 17857525.1 COMPLETED 10 1 2021-10-13T15:51:23 2021-10-13T15:51:23 2021-10-13T15:51:24 186 420K 306.70K + 17857525.2 COMPLETED 10 1 2021-10-13T15:51:24 2021-10-13T15:51:24 2021-10-13T15:51:27 188 352K 325.80K + 17857525.3 COMPLETED 10 1 2021-10-13T15:51:28 2021-10-13T15:51:28 2021-10-13T15:51:29 192 352K 341.90K + 17857525.4 COMPLETED 10 1 2021-10-13T15:51:28 2021-10-13T15:51:28 2021-10-13T15:51:29 210 352K 341.90K + ''' + slurm_monitor = SlurmMonitor(ssh_output) + info_handler = PlatformInformationHandler(TwoDimWrapperDistributionStrategy()) + job_dcs = info_handler.execute_distribution(self.job_data_dcs_in_wrapper[0], self.job_data_dcs_in_wrapper, slurm_monitor) + for job in job_dcs: + print("{0} -> {1} and {2} : ncpus {3} running {4}".format(job.job_name, job.energy, job.rowstatus, job.ncpus, job.running_time)) + for level in info_handler.strategy.jobs_per_level: + print([job.job_name for job in level]) + total_in_jobs = sum(job.energy for job in job_dcs[:-1]) # ignore last + self.assertTrue(abs(total_in_jobs - slurm_monitor.total_energy) <= 10) + self.assertTrue(abs(job_dcs[0].energy - 259) < 1) + self.assertTrue(abs(job_dcs[1].energy - 259) < 1) + self.assertTrue(abs(job_dcs[2].energy - 228) < 1) + self.assertTrue(abs(job_dcs[3].energy - 228) < 1) + self.assertTrue(abs(job_dcs[4].energy - 228) < 1) + self.assertTrue(abs(job_dcs[5].energy - 228.67) < 1) + self.assertTrue(abs(job_dcs[6].energy - 228.67) < 1) + self.assertTrue(abs(job_dcs[7].energy - 228.67) < 1) + self.assertTrue(abs(job_dcs[8].energy - 358) < 1) + self.assertTrue(abs(job_dcs[9].energy - 376) < 1) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/autosubmit/history/utils.py b/autosubmit/history/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..831695fbd48d9e6539375311b4eed8066fc8e76a --- /dev/null +++ b/autosubmit/history/utils.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python + +# Copyright 2015-2020 Earth Sciences Department, BSC-CNS + +# This file is part of Autosubmit. + +# Autosubmit is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# Autosubmit is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Autosubmit. If not, see . + +import time +import os +from datetime import datetime + +DATETIME_FORMAT = '%Y-%m-%d-%H:%M:%S' + +def get_fields_as_comma_str(model): + """ Get the fields of a namedtumple as a comma separated string. """ + return ",".join(model._fields) + +def calculate_queue_time_in_seconds(submit_time, start_time): + # type : (float, float) -> int + """ Calculates queue time in seconds based on submit and start timestamps. """ + if submit_time > 0 and start_time > 0 and (start_time - submit_time) > 0: + return int(start_time - submit_time) + return 0 + +def calculate_run_time_in_seconds(start_time, finish_time): + # type : (float, float) -> int + """ Calculates run time in seconds based on start and finish timestamps. 
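+    Returns 0 when either timestamp is missing (<= 0) or the span is not positive,
+    e.g. calculate_run_time_in_seconds(100.0, 160.5) == 60.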
""" + if finish_time > 0 and start_time > 0 and (finish_time - start_time) > 0: + return int(finish_time - start_time) + return 0 + +def get_current_datetime(): + # type : () -> str + """ Returns the current time in format '%Y-%m-%d-%H:%M:%S' """ + return datetime.today().strftime(DATETIME_FORMAT) + +def get_current_datetime_if_none(argument): + # type : (Any) -> Union[Any, str] + """ Returns the current time in format '%Y-%m-%d-%H:%M:%S' if the supplied argument is None, else return argument. """ + if argument is None: + return get_current_datetime() + else: + return argument + +def create_file_with_full_permissions(path): + # type : (str) -> None + """ creates a database files with full permissions """ + os.umask(0) + os.open(path, os.O_WRONLY | os.O_CREAT, 0o777) + +class SupportedStatus: + COMPLETED = "COMPLETED" + FAILED = "FAILED" + QUEUING = "QUEUING" + SUBMITTED = "SUBMITTED" + RUNNING = "RUNNING" + SUSPENDED = "SUSPENDED" + +# if __name__ == "__main__": +# print(get_fields_as_comma_str()) \ No newline at end of file diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 94e1a58380e5e037813621e9545e964b30b32f86..2cd12f080efba190e04062254a3b8f9e70d404ee 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -36,8 +36,9 @@ from autosubmit.config.config_common import AutosubmitConfig from autosubmit.job.job_common import Status, Type, increase_wallclock_by_chunk from autosubmit.job.job_common import StatisticsSnippetBash, StatisticsSnippetPython from autosubmit.job.job_common import StatisticsSnippetR, StatisticsSnippetEmpty +from autosubmit.job.job_utils import get_job_package_code from autosubmit.config.basicConfig import BasicConfig -from autosubmit.database.db_jobdata import JobDataStructure +from autosubmit.history.experiment_history import ExperimentHistory from bscearth.utils.date import date2str, parse_date, previous_day, chunk_end_date, chunk_start_date, Log, subs_dates from time import sleep from threading import Thread @@ -173,7 +174,7 @@ class Job(object): :rtype: set """ return self._parents - + @parents.setter def parents(self, parents): """ @@ -181,6 +182,20 @@ class Job(object): """ self._parents = parents + @property + def status_str(self): + """ + String representation of the current status + """ + return Status.VALUE_TO_KEY.get(self.status, "UNKNOWN") + + @property + def children_names_str(self): + """ + Comma separated list of children's names + """ + return ",".join([str(child.name) for child in self._children]) + @property def is_serial(self): return str(self.processors) == '1' @@ -1238,9 +1253,11 @@ class Job(object): # Get # Writing database if self.wrapper_type != "vertical" or enabled: - JobDataStructure(self.expid).write_submit_time(self.name, data_time[1], Status.VALUE_TO_KEY[self.status] if self.status in Status.VALUE_TO_KEY.keys() else "UNKNOWN", self.processors, - self.wallclock, self.queue, self.date, self.member, self.section, self.chunk, self.platform_name, self.id, self.packed, self._wrapper_queue) - + exp_history = ExperimentHistory(self.expid) + exp_history.write_submit_time(self.name, submit=data_time[1], status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, + wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, + platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.name), + children=self.children_names_str) def write_start_time(self, enabled = False): """ @@ -1266,8 
+1283,11 @@ class Job(object): # noinspection PyTypeChecker f.write(date2str(datetime.datetime.fromtimestamp(start_time), 'S')) # Writing database - JobDataStructure(self.expid).write_start_time(self.name, start_time, Status.VALUE_TO_KEY[self.status] if self.status in Status.VALUE_TO_KEY.keys() else "UNKNOWN", self.processors, - self.wallclock, self._queue, self.date, self.member, self.section, self.chunk, self.platform_name, self.id, self.packed, self._wrapper_queue) + exp_history = ExperimentHistory(self.expid) + exp_history.write_start_time(self.name, start=start_time, status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, + wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, + platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.name), + children=self.children_names_str) return True def write_end_time(self, completed,enabled = False): @@ -1302,13 +1322,17 @@ class Job(object): out, err = self.local_logs path_out = os.path.join(self._tmp_path, 'LOG_' + str(self.expid), out) # Launch first as simple non-threaded function - JobDataStructure(self.expid).write_finish_time(self.name, finish_time, final_status, self.processors, self.wallclock, self._queue, self.date, - self.member, self.section, self.chunk, self.platform_name, self.id, self.platform, self.packed, [job.id for job in self._parents], True, None, out, err, self._wrapper_queue) - # Launch second as threaded function - thread_write_finish = Thread(target=JobDataStructure(self.expid).write_finish_time, args=(self.name, finish_time, final_status, self.processors, - self.wallclock, self._queue, self.date, self.member, self.section, self.chunk, self.platform_name, self.id, self.platform, self.packed, [job.id for job in self._parents], False, path_out, out, err, self._wrapper_queue)) - thread_write_finish.name = "JOB_data_{}".format(self.name) - thread_write_finish.start() + exp_history = ExperimentHistory(self.expid) + job_data_dc = exp_history.write_finish_time(self.name, finish=finish_time, status=final_status, ncpus=self.processors, + wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, + platform=self.platform_name, job_id=self.id, out_file=out, err_file=err, wrapper_queue=self._wrapper_queue, + wrapper_code=get_job_package_code(self.name), children=self.children_names_str) + + # Launch second as threaded function only for slurm + if job_data_dc and type(self.platform) is not str and self.platform.type == "slurm": + thread_write_finish = Thread(target=ExperimentHistory(self.expid).write_platform_data_after_finish, args=(job_data_dc, self.platform)) + thread_write_finish.name = "JOB_data_{}".format(self.name) + thread_write_finish.start() def write_total_stat_by_retries_fix_newline(self): path = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS') @@ -1334,22 +1358,27 @@ class Job(object): path_out = os.path.join(self._tmp_path, 'LOG_' + str(self.expid), out) # Launch first as simple non-threaded function if not first_retrial: - JobDataStructure(self.expid).write_submit_time(self.name, total_stats[0], Status.VALUE_TO_KEY[ - self.status] if self.status in Status.VALUE_TO_KEY.keys() else "UNKNOWN", self.processors, - self.wallclock, self.queue, self.date, self.member, self.section, - self.chunk, self.platform_name, self.id, self.packed, - self._wrapper_queue) - 
JobDataStructure(self.expid).write_start_time(self.name, total_stats[0], Status.VALUE_TO_KEY[ - self.status] if self.status in Status.VALUE_TO_KEY.keys() else "UNKNOWN", self.processors, - self.wallclock, self._queue, self.date, self.member, - self.section, self.chunk, self.platform_name, self.id, - self.packed, self._wrapper_queue) - JobDataStructure(self.expid).write_finish_time(self.name, total_stats[1], total_stats[2], self.processors, - self.wallclock, self._queue, self.date, - self.member, self.section, self.chunk, self.platform_name, - self.id, self.platform, self.packed, - [job.id for job in self._parents], True, None, out, err, - self._wrapper_queue) + exp_history = ExperimentHistory(self.expid) + exp_history.write_submit_time(self.name, submit=total_stats[0], status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, + wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, + platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.name), + children=self.children_names_str) + exp_history = ExperimentHistory(self.expid) + exp_history.write_start_time(self.name, start=total_stats[0], status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, + wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, + platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.name), + children=self.children_names_str) + + exp_history = ExperimentHistory(self.expid) + job_data_dc = exp_history.write_finish_time(self.name, finish=total_stats[1], status=total_stats[2], ncpus=self.processors, + wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, + platform=self.platform_name, job_id=self.id, out_file=out, err_file=err, wrapper_queue=self._wrapper_queue, + wrapper_code=get_job_package_code(self.name), children=self.children_names_str) + # Launch second as threaded function only for slurm + if job_data_dc and type(self.platform) is not str and self.platform.type == "slurm": + thread_write_finish = Thread(target=ExperimentHistory(self.expid).write_platform_data_after_finish, args=(job_data_dc, self.platform)) + thread_write_finish.name = "JOB_data_{}".format(self.name) + thread_write_finish.start() def check_started_after(self, date_limit): """ diff --git a/autosubmit/job/job_utils.py b/autosubmit/job/job_utils.py index 0d90766905148a92883769d9914edb554a3a7734..a83b76d0f1cae5f8681db23dd6b1953bead77e22 100644 --- a/autosubmit/job/job_utils.py +++ b/autosubmit/job/job_utils.py @@ -18,11 +18,13 @@ # along with Autosubmit. If not, see . import networkx +import os from networkx.algorithms.dag import is_directed_acyclic_graph from networkx import DiGraph from networkx import dfs_edges from networkx import NetworkXError +from autosubmit.job.job_package_persistence import JobPackagePersistence def transitive_reduction(graph): @@ -41,6 +43,31 @@ def transitive_reduction(graph): reduced_graph.add_edges_from((u, v) for v in u_edges) return reduced_graph +def get_job_package_code(job_name): + """ + Finds the package code and retrieves it. None if no package. 
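+    In practice 0 (rather than None) is returned when the job is not packed; the
+    package code itself is the third underscore-separated token of the package name.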
+
+    :param job_name: Name of the job
+    :type job_name: String
+    :return: package code of the job, 0 if the job does not belong to a package
+    :rtype: int
+    """
+    try:
+        from autosubmit.config.basicConfig import BasicConfig  # local import; provides LOCAL_ROOT_DIR
+        # Assumes standard Autosubmit job names, which are prefixed by the experiment id:
+        # <expid>_<date>_<member>_<chunk>_<section>
+        expid = job_name.split("_")[0]
+        packages_wrapper = JobPackagePersistence(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid, "pkl"), "job_packages_" + expid).load(wrapper=True)
+        packages_wrapper_plus = JobPackagePersistence(os.path.join(BasicConfig.LOCAL_ROOT_DIR, expid, "pkl"), "job_packages_" + expid).load(wrapper=False)
+        if packages_wrapper or packages_wrapper_plus:
+            packages = packages_wrapper if len(packages_wrapper) > len(packages_wrapper_plus) else packages_wrapper_plus
+            for exp, package_name, _job_name in packages:
+                if job_name == _job_name:
+                    return int(package_name.split("_")[2])
+    except Exception:
+        pass
+    return 0
 
 class Dependency(object):
     """
diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py
index 0d2b387c7b3e5f1ed5c48b7f242da3ecf0e17bcd..f0dc578a11bf2122b5042a85014732f584f68eed 100644
--- a/autosubmit/platforms/paramiko_platform.py
+++ b/autosubmit/platforms/paramiko_platform.py
@@ -47,6 +47,7 @@ class ParamikoPlatform(Platform):
         self.channels = {}
         self.poller = select.poll()
         self.local_x11_display = xlib_connect.get_display(os.environ['DISPLAY'])
+
     @property
     def header(self):
         """
@@ -85,13 +86,13 @@ class ParamikoPlatform(Platform):
         """
         Test if the connection is still alive, reconnect if not.
         """
-        try:
-            self.reset()
+        try:
+            self.reset()
             try:
                 self.restore_connection()
             except:
-                pass
-            transport = self._ssh.get_transport()
+                pass
+            transport = self._ssh.get_transport()
             transport.send_ignore()
         except EOFError as e:
             raise AutosubmitError("[{0}] not alive. Host: {1}".format(
@@ -395,7 +396,7 @@ class ParamikoPlatform(Platform):
         else:
             return None
 
-    def check_job_energy(self, job_id, packed=False):
+    def check_job_energy(self, job_id):
         """
         Checks job energy and return values.
         Defined in child classes.
@@ -407,8 +408,7 @@ class ParamikoPlatform(Platform):
         """
         check_energy_cmd = self.get_job_energy_cmd(job_id)
         self.send_command(check_energy_cmd)
-        return self.parse_job_finish_data(
-            self.get_ssh_output(), packed)
+        return self.get_ssh_output()
 
     def submit_Script(self, hold=False):
         """