From f3c79bcf3e4353710c10c577f6bc34a4b9944ee8 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 21 Aug 2024 15:41:55 +0200 Subject: [PATCH 01/25] WIP: Zombies/orphan processors logs fix --- autosubmit/autosubmit.py | 29 +---- autosubmit/helpers/utils.py | 15 --- autosubmit/job/job_list.py | 45 ++++--- autosubmit/platforms/ecplatform.py | 18 +-- autosubmit/platforms/locplatform.py | 10 +- autosubmit/platforms/paramiko_platform.py | 11 +- autosubmit/platforms/platform.py | 142 +++++++++++++--------- autosubmit/platforms/sgeplatform.py | 9 +- bin/autosubmit | 9 +- 9 files changed, 135 insertions(+), 153 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 6bc39f862..a64adfafb 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -86,7 +86,7 @@ from typing import List import autosubmit.history.utils as HUtils import autosubmit.helpers.autosubmit_helper as AutosubmitHelper import autosubmit.statistics.utils as StatisticsUtils -from autosubmit.helpers.utils import proccess_id, terminate_child_process, check_jobs_file_exists +from autosubmit.helpers.utils import proccess_id, check_jobs_file_exists from contextlib import suppress @@ -1686,25 +1686,6 @@ class Autosubmit: for job in job_list.get_job_list(): job.status = Status.WAITING - - @staticmethod - def terminate_child_process(expid, platform = None): - # get pid of the main process - pid = os.getpid() - # In case some one used 4.1.6 or 4.1.5 - process_ids = proccess_id(expid,"run", single_instance = False, platform = platform) - if process_ids: - for process_id in [ process_id for process_id in process_ids if process_id != pid]: - # force kill - os.kill(process_id, signal.SIGKILL) - process_ids = proccess_id(expid,"log", single_instance = False, platform = platform) - # 4.1.7 + - if process_ids: - for process_id in [ process_id for process_id in process_ids if process_id != pid]: - # force kill - os.kill(process_id, signal.SIGKILL) - - @staticmethod def terminate(all_threads): # Closing threads on Ctrl+C @@ -2171,7 +2152,6 @@ class Autosubmit: did_run = True try: if Autosubmit.exit: - terminate_child_process(expid) Autosubmit.terminate(threading.enumerate()) if job_list.get_failed(): return 1 @@ -2346,7 +2326,7 @@ class Autosubmit: # get all threads threads = threading.enumerate() # print name - timeout = as_conf.experiment_data.get("CONFIG",{}).get("LAST_LOGS_TIMEOUT", 180) + timeout = as_conf.experiment_data.get("CONFIG",{}).get("LAST_LOGS_TIMEOUT", 1) for remaining in range(timeout, 0, -1): if len(job_list.get_completed_without_logs()) == 0: break @@ -2368,7 +2348,6 @@ class Autosubmit: Autosubmit.database_fix(expid) except Exception as e: pass - terminate_child_process(expid) for platform in platforms_to_test: platform.closeConnection() if len(job_list.get_failed()) > 0: @@ -2383,13 +2362,10 @@ class Autosubmit: Log.warning("Database is locked") except (portalocker.AlreadyLocked, portalocker.LockException) as e: message = "We have detected that there is another Autosubmit instance using the experiment\n. 
Stop other Autosubmit instances that are using the experiment or delete autosubmit.lock file located on tmp folder" - terminate_child_process(expid) raise AutosubmitCritical(message, 7000) except AutosubmitCritical as e: - terminate_child_process(expid) raise except BaseException as e: - terminate_child_process(expid) raise finally: if profile: @@ -6094,7 +6070,6 @@ class Autosubmit: if status in Status.VALUE_TO_KEY.values(): job.status = Status.KEY_TO_VALUE[status] job_list.save() - terminate_child_process(expid) diff --git a/autosubmit/helpers/utils.py b/autosubmit/helpers/utils.py index 7ccca9cf8..4f05c7676 100644 --- a/autosubmit/helpers/utils.py +++ b/autosubmit/helpers/utils.py @@ -34,21 +34,6 @@ def check_jobs_file_exists(as_conf, current_section_name=None): if missing_files: raise AutosubmitCritical(f"Templates not found:\n{missing_files}", 7011) -def terminate_child_process(expid, platform=None): - # get pid of the main process - pid = os.getpid() - # In case someone used 4.1.6 or 4.1.5 - process_ids = proccess_id(expid, "run", single_instance=False, platform=platform) - if process_ids: - for process_id in [process_id for process_id in process_ids if process_id != pid]: - # force kill - os.kill(process_id, signal.SIGKILL) - process_ids = proccess_id(expid, "log", single_instance=False, platform=platform) - # 4.1.7 + - if process_ids: - for process_id in [process_id for process_id in process_ids if process_id != pid]: - # force kill - os.kill(process_id, signal.SIGKILL) def proccess_id(expid=None, command="run", single_instance=True, platform=None): # Retrieve the process id of the autosubmit process diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 22245c23a..de63452b4 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -2587,30 +2587,39 @@ class JobList(object): """ Updates the log err and log out. """ - if not hasattr(job,"updated_log") or not job.updated_log: # hasattr for backward compatibility (job.updated_logs is only for newer jobs, as the loaded ones may not have this set yet) - # order path_to_logs by name and get the two last element - log_file = False - if hasattr(job, "x11") and job.x11: - job.updated_log = True - return + if not hasattr(job, "updated_log"): # hasattr for backward compatibility (job.updated_logs is only for newer jobs, as the loaded ones may not have this set yet) + job.updated_log = False + if hasattr(job, "x11") and job.x11: # X11 has it log writted in the run.out file. No need to check for log files as there are none + job.updated_log = True + return + log_recovered = self.check_if_log_is_recovered(job) + + if log_recovered: # hasattr for backward compatibility + job.local_logs = (log_recovered.name, log_recovered.name[:-4] + ".err") # we only want the last one + job.updated_log = True + elif not job.updated_log and str(as_conf.platforms_data.get(job.platform.name, {}).get('DISABLE_RECOVERY_THREADS', "false")).lower() == "false": + job.platform.add_job_to_log_recover(job) + + def check_if_log_is_recovered(self, job): + """ + Check if the log is recovered. + Conditions: + - File must exist + - File timestamp should be greater than the job ready_start_date, otherwise is from a previous run. 
+ + """ + if not hasattr(job, "updated_log") or not job.updated_log: if job.wrapper_type == "vertical" and job.fail_count > 0: for log_recovered in self.path_to_logs.glob(f"{job.name}.*._{job.fail_count}.out"): if job.local_logs[0][-4] in log_recovered.name: - log_file = True - break + if log_recovered.stat().st_mtime > job.ready_start_date: + return log_recovered else: for log_recovered in self.path_to_logs.glob(f"{job.name}.*.out"): if job.local_logs[0] == log_recovered.name: - log_file = True - break - - if log_file: - if not hasattr(job, "ready_start_date") or not job.ready_start_date or job.local_logs[0] >= job.ready_start_date: # hasattr for backward compatibility - job.local_logs = (log_recovered.name, log_recovered.name[:-4] + ".err") - job.updated_log = True - if not job.updated_log and str(as_conf.platforms_data.get(job.platform.name, {}).get('DISABLE_RECOVERY_THREADS', "false")).lower() == "false": - job.platform.add_job_to_log_recover(job) - + if log_recovered.stat().st_mtime > job.ready_start_date: + return log_recovered + return None def update_list(self, as_conf, store_change=True, fromSetStatus=False, submitter=None, first_time=False): # type: (AutosubmitConfig, bool, bool, object, bool) -> bool """ diff --git a/autosubmit/platforms/ecplatform.py b/autosubmit/platforms/ecplatform.py index 713cb17eb..5f12c1ec9 100644 --- a/autosubmit/platforms/ecplatform.py +++ b/autosubmit/platforms/ecplatform.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 - +import atexit # Copyright 2017-2020 Earth Sciences Department, BSC-CNS # This file is part of Autosubmit. @@ -169,12 +169,8 @@ class EcPlatform(ParamikoPlatform): self.connected = False except: self.connected = False - if not self.log_retrieval_process_active and ( - as_conf is None or str(as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', - "false")).lower() == "false"): - self.log_retrieval_process_active = True - if as_conf and as_conf.misc_data.get("AS_COMMAND","").lower() == "run": - self.recover_job_logs() + self.spawn_log_retrieval_process(as_conf) + def restore_connection(self,as_conf): """ @@ -208,12 +204,8 @@ class EcPlatform(ParamikoPlatform): try: if output.lower().find("yes") != -1: self.connected = True - if not self.log_retrieval_process_active and ( - as_conf is None or str(as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', - "false")).lower() == "false"): - self.log_retrieval_process_active = True - if as_conf and as_conf.misc_data.get("AS_COMMAND", "").lower() == "run": - self.recover_job_logs() + self.spawn_log_retrieval_process(as_conf) + return "OK" else: self.connected = False diff --git a/autosubmit/platforms/locplatform.py b/autosubmit/platforms/locplatform.py index 3d9296705..520a4b0f3 100644 --- a/autosubmit/platforms/locplatform.py +++ b/autosubmit/platforms/locplatform.py @@ -111,15 +111,9 @@ class LocalPlatform(ParamikoPlatform): def get_checkjob_cmd(self, job_id): return self.get_pscall(job_id) - def connect(self, as_conf, reconnect=False): + def connect(self, as_conf={}, reconnect=False): self.connected = True - if not self.log_retrieval_process_active and ( - as_conf is None or str(as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS',"false")).lower() == "false"): - self.log_retrieval_process_active = True - if as_conf and as_conf.misc_data.get("AS_COMMAND","").lower() == "run": - self.recover_job_logs() - - + self.spawn_log_retrieval_process(as_conf) def test_connection(self,as_conf): self.main_process_id = os.getpid() if not self.connected: 
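The connect() override above, like the analogous ones in ecplatform.py and sgeplatform.py, now delegates to a single spawn_log_retrieval_process() helper on the Platform base class instead of repeating the DISABLE_RECOVERY_THREADS check inline. For orientation, below is a minimal, self-contained sketch of the pattern the rest of this series converges on: a per-platform log-recovery worker process fed by a queue, stopped via an Event registered with atexit, and guarded against orphaning by checking the parent PID. The class and attribute names are illustrative only, not the actual Autosubmit API, and psutil is assumed to be available as in the later commits of this series.

import atexit
import os
import queue  # only for the Empty exception
from multiprocessing import Event, Process, Queue

import psutil  # third-party; assumed available, as in the later revisions of this series


class LogRecoveryWorker:
    """Hypothetical per-platform log-recovery worker sketching the pattern used in this series."""

    def __init__(self, name):
        self.name = name
        self.recovery_queue = Queue()  # jobs whose log files still need to be fetched
        self.stop_event = Event()      # set by the main process on a clean shutdown
        self.parent_pid = os.getpid()  # remembered so the child can detect an orphaned run
        self.process = None

    def spawn(self):
        self.process = Process(target=self._run, name=f"{self.name}_log_recovery", daemon=True)
        self.process.start()
        atexit.register(self._shutdown)  # covers orderly exits only, not kill -9

    def _shutdown(self):
        self.stop_event.set()
        self.process.join()

    def _run(self):
        # Loop until asked to stop, or until the parent process disappears (protection against kill -9).
        while not self.stop_event.wait(timeout=60) and psutil.pid_exists(self.parent_pid):
            try:
                while not self.recovery_queue.empty():
                    job = self.recovery_queue.get(timeout=1)
                    # here the real code calls job.retrieve_logfiles(...) and retries failures later
            except queue.Empty:
                pass

The atexit hook only covers orderly shutdowns; the psutil.pid_exists() check on the remembered parent PID is what keeps the worker from lingering as an orphan after a kill -9, which is the zombie/orphan scenario this patch series addresses.
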
diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 288a4f00b..8ad9983b1 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -1,3 +1,4 @@ +import atexit import copy import threading @@ -23,7 +24,6 @@ from threading import Thread import threading import getpass from paramiko.agent import Agent -from autosubmit.helpers.utils import terminate_child_process def threaded(fn): def wrapper(*args, **kwargs): @@ -116,7 +116,6 @@ class ParamikoPlatform(Platform): display = "localhost:0" self.local_x11_display = xlib_connect.get_display(display) self.log_retrieval_process_active = False - terminate_child_process(self.expid, self.name) def test_connection(self,as_conf): """ Test if the connection is still alive, reconnect if not. @@ -151,6 +150,7 @@ class ParamikoPlatform(Platform): raise AutosubmitCritical(str(e),7051) #raise AutosubmitError("[{0}] connection failed for host: {1}".format(self.name, self.host), 6002, e.message) + def restore_connection(self, as_conf): try: self.connected = False @@ -306,10 +306,9 @@ class ParamikoPlatform(Platform): self._ftpChannel = paramiko.SFTPClient.from_transport(self.transport,window_size=pow(4, 12) ,max_packet_size=pow(4, 12) ) self._ftpChannel.get_channel().settimeout(120) self.connected = True - if not self.log_retrieval_process_active and (as_conf is None or str(as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', "false")).lower() == "false"): - self.log_retrieval_process_active = True - if as_conf and as_conf.misc_data.get("AS_COMMAND", "").lower() == "run": - self.recover_job_logs() + self.spawn_log_retrieval_process(as_conf) + + except SSHException: raise except IOError as e: diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index 80a2d4971..54c830740 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -1,38 +1,31 @@ -import atexit - -import queue +import queue # only for the exception +import psutil import setproctitle import locale import os - import traceback from autosubmit.job.job_common import Status from typing import List, Union - from autosubmit.helpers.parameters import autosubmit_parameter from log.log import AutosubmitCritical, AutosubmitError, Log from multiprocessing import Process, Queue, Event - +import atexit import time +def processed(*decorator_args, **decorator_kwargs): + def decorator(fn): + def wrapper(*args, **kwargs): + stop_event = decorator_kwargs.get('stop_event', Event()) + args = (args[0], stop_event) + process = Process(target=fn, args=args, kwargs=kwargs, name=f"{args[0].name}_platform") + if decorator_kwargs.get('daemon', False): + process.daemon = True + process.start() + + return process + return wrapper + return decorator + -# stop the background task gracefully before exit -def stop_background(stop_event, process): - # request the background thread stop - stop_event.set() - # wait for the background thread to stop - process.join() - -def processed(fn): - def wrapper(*args, **kwargs): - stop_event = Event() - args = (args[0], stop_event) - process = Process(target=fn, args=args, kwargs=kwargs, name=f"{args[0].name}_platform") - process.daemon = True # Set the process as a daemon process - process.start() - atexit.register(stop_background, stop_event, process) - return process - - return wrapper class Platform(object): """ @@ -107,6 +100,9 @@ class Platform(object): self.recovery_queue = Queue() self.log_retrieval_process_active 
= False self.main_process_id = None + self.stop_event = Event() + + @property @autosubmit_parameter(name='current_arch') @@ -228,6 +224,10 @@ class Platform(object): def root_dir(self, value): self._root_dir = value + def send_stop_signal(self,stop_event): + Log.info(f"Sending stop signal to {self.name} platform to stop log recovery process") + stop_event.set() + time.sleep(60) # matching the time of the log recovery process def get_exclusive_directive(self, job): """ Returns exclusive directive for the specified job @@ -819,7 +819,7 @@ class Platform(object): raise NotImplementedError def add_job_to_log_recover(self, job): - self.recovery_queue.put((job,job.children)) + self.recovery_queue.put(job) def connect(self, as_conf, reconnect=False): raise NotImplementedError @@ -827,46 +827,76 @@ class Platform(object): def restore_connection(self,as_conf): raise NotImplementedError - @processed - def recover_job_logs(self, event): + def spawn_log_retrieval_process(self,as_conf): + if not self.log_retrieval_process_active and ( + as_conf is None or str(as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', + "false")).lower() == "false"): + self.log_retrieval_process_active = True + if as_conf and as_conf.misc_data.get("AS_COMMAND", "").lower() == "run": + process = self.recover_job_logs_parent(self.stop_event) + process.join() + atexit.register(self.send_stop_signal, self.stop_event) + + @processed(stop_event=Event(),daemon=False) + def recover_job_logs_parent(self, stop_event): + """ + This function, spawn some processes to recover the logs of the jobs that have been submitted and exits. + The main_process call to this function and waits until all process are spawn. ( which avoids to this process to zombify ) + This ensures that the recover_job_logs process is inherited by the init process, which reaps it automatically thus avoiding zombies. + """ + Log.get_logger("Autosubmit") + self.recover_job_logs(stop_event) + return self + + @processed(stop_event=Event(),daemon=False) + def recover_job_logs(self, stop_event): + """ + This function, recovers the logs of the jobs that have been submitted. + This is an independent process that will be spawned by the recover_job_logs_parent function. + The exit of this process is controlled by the stop_event or if the main process is killed. + """ setproctitle.setproctitle(f"autosubmit log {self.expid} recovery {self.name.lower()}") - job_names_processed = set() + Log.get_logger("Autosubmit") + Log.info(f"Thread: Starting log recovery for {self.name} platform") + job = None + jobs_pending_to_process = set() self.connected = False self.restore_connection(None) # check if id of self.main_process exists with ps ax | grep self.main_process_id - max_logs_to_process = 60 - while not event.is_set() and os.system(f"ps ax | grep {str(self.main_process_id)} | grep -v grep > /dev/null 2>&1") == 0: - time.sleep(60) - logs_processed = 0 # avoid deadlocks just in case + max_logs_without_waiting_to_process = 60 + Log.info(f"Thread: stop_event: {stop_event.is_set()} - main_process_id: {psutil.pid_exists(int(self.main_process_id))}") + while not stop_event.is_set() and psutil.pid_exists(int(self.main_process_id)): # psutil, is a protection against kill -9. 
+ Log.info(f"Thread: stop_event: {stop_event.is_set()} - main_process_id: {psutil.pid_exists(int(self.main_process_id))}") try: - while not self.recovery_queue.empty() and logs_processed < max_logs_to_process: - logs_processed += 1 - job,children = self.recovery_queue.get(block=False) - if job.wrapper_type != "vertical": - if f'{job.name}_{job.fail_count}' in job_names_processed: - continue - else: - if f'{job.name}' in job_names_processed: - continue - job.children = children - job.platform = self - if job.x11: - Log.debug("Job {0} is an X11 job, skipping log retrieval as they're written in the ASLOGS".format(job.name)) - continue + logs_tried_to_retrieve = 0 + while not self.recovery_queue.empty() and logs_tried_to_retrieve < max_logs_without_waiting_to_process: try: + logs_tried_to_retrieve += 1 + job = self.recovery_queue.get(timeout=1) # Should be non-empty, but added a timeout for other possible errors. + job.children = set() + job.platform = self job.retrieve_logfiles(self, raise_error=True) - if job.wrapper_type != "vertical": - job_names_processed.add(f'{job.name}_{job.fail_count}') - else: - job_names_processed.add(f'{job.name}') - except: + except queue.Empty: pass - except queue.Empty: - pass - except (IOError, OSError): - pass + while len(jobs_pending_to_process) > 0: # jobs that had any issue during the log retrieval + job = jobs_pending_to_process.pop() + job.children = set() + job.platform = self + job.retrieve_logfiles(self, raise_error=True) + time.sleep(1) + except Exception as e: + Log.warning(f"Error in log recovery: {e}") try: - self.restore_connection(None) + if job: + jobs_pending_to_process.add(job) + if not stop_event.is_set() and psutil.pid_exists(int(self.main_process_id)): + self.connected = False + self.restore_connection(None) except: pass + self.recovery_queue.close() + Log.info( + f"Thread: stop_event: {stop_event.is_set()} - main_process_id: {psutil.pid_exists(int(self.main_process_id))}") + + Log.info(f"Thread: Exiting log recovery for {self.name} platform") \ No newline at end of file diff --git a/autosubmit/platforms/sgeplatform.py b/autosubmit/platforms/sgeplatform.py index f44fb6629..44b8a1b84 100644 --- a/autosubmit/platforms/sgeplatform.py +++ b/autosubmit/platforms/sgeplatform.py @@ -122,12 +122,9 @@ class SgePlatform(ParamikoPlatform): :rtype: bool """ self.connected = True - if not self.log_retrieval_process_active and ( - as_conf is None or str(as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', - "false")).lower() == "false"): - self.log_retrieval_process_active = True - if as_conf and as_conf.misc_data.get("AS_COMMAND","").lower() == "run": - self.recover_job_logs() + self.spawn_log_retrieval_process(as_conf) + + def restore_connection(self,as_conf): """ In this case, it does nothing because connection is established for each command diff --git a/bin/autosubmit b/bin/autosubmit index d08f47593..b60ce0a74 100755 --- a/bin/autosubmit +++ b/bin/autosubmit @@ -51,7 +51,7 @@ def exit_from_error(e: BaseException): args = [str(e)] Log.critical(msg.format(*args)) Log.info("More info at https://autosubmit.readthedocs.io/en/master/troubleshooting/error-codes.html") - os._exit(1) + return 1 # noinspection PyProtectedMember def main(): @@ -60,8 +60,8 @@ def main(): if os.path.exists(os.path.join(Log.file_path, "autosubmit.lock")): os.remove(os.path.join(Log.file_path, "autosubmit.lock")) if type(return_value) is int: - os._exit(return_value) - os._exit(0) + return return_value + return 0 except AutosubmitError as e: 
exit_from_error(e) except AutosubmitCritical as e: @@ -70,4 +70,5 @@ def main(): exit_from_error(e) if __name__ == "__main__": - main() + exit_code = main() + sys.exit(exit_code) # Sys.exit ensures a proper cleanup of the program, while os._exit() does not. \ No newline at end of file -- GitLab From 97767c8b718cb9a982337028260594e54a7f8f3a Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 22 Aug 2024 12:07:28 +0200 Subject: [PATCH 02/25] WIP: Zombies/orphan processors logs fix --- autosubmit/platforms/platform.py | 50 ++++++++++++++------------------ 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index 54c830740..366ec1847 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -11,21 +11,12 @@ from log.log import AutosubmitCritical, AutosubmitError, Log from multiprocessing import Process, Queue, Event import atexit import time -def processed(*decorator_args, **decorator_kwargs): - def decorator(fn): - def wrapper(*args, **kwargs): - stop_event = decorator_kwargs.get('stop_event', Event()) - args = (args[0], stop_event) - process = Process(target=fn, args=args, kwargs=kwargs, name=f"{args[0].name}_platform") - if decorator_kwargs.get('daemon', False): - process.daemon = True - process.start() - - return process - return wrapper - return decorator - - +def processed(fn): + def wrapper(*args, **kwargs): + process = Process(target=fn, args=args, kwargs=kwargs, name=f"{args[0].name}_platform") + process.start() + return process + return wrapper class Platform(object): """ @@ -833,23 +824,24 @@ class Platform(object): "false")).lower() == "false"): self.log_retrieval_process_active = True if as_conf and as_conf.misc_data.get("AS_COMMAND", "").lower() == "run": - process = self.recover_job_logs_parent(self.stop_event) + process = self.recover_job_logs_parent() process.join() - atexit.register(self.send_stop_signal, self.stop_event) - - @processed(stop_event=Event(),daemon=False) - def recover_job_logs_parent(self, stop_event): + self.stop_event.set() + @processed + def recover_job_logs_parent(self): """ This function, spawn some processes to recover the logs of the jobs that have been submitted and exits. The main_process call to this function and waits until all process are spawn. ( which avoids to this process to zombify ) This ensures that the recover_job_logs process is inherited by the init process, which reaps it automatically thus avoiding zombies. """ Log.get_logger("Autosubmit") - self.recover_job_logs(stop_event) - return self + self.recover_job_logs() + #self.stop_event.set() + return 0 + - @processed(stop_event=Event(),daemon=False) - def recover_job_logs(self, stop_event): + @processed + def recover_job_logs(self): """ This function, recovers the logs of the jobs that have been submitted. This is an independent process that will be spawned by the recover_job_logs_parent function. @@ -864,9 +856,9 @@ class Platform(object): self.restore_connection(None) # check if id of self.main_process exists with ps ax | grep self.main_process_id max_logs_without_waiting_to_process = 60 - Log.info(f"Thread: stop_event: {stop_event.is_set()} - main_process_id: {psutil.pid_exists(int(self.main_process_id))}") - while not stop_event.is_set() and psutil.pid_exists(int(self.main_process_id)): # psutil, is a protection against kill -9. 
- Log.info(f"Thread: stop_event: {stop_event.is_set()} - main_process_id: {psutil.pid_exists(int(self.main_process_id))}") + Log.info(f"Thread: stop_event: {self.stop_event.is_set()} - main_process_id: {psutil.pid_exists(int(self.main_process_id))}") + while not self.stop_event.is_set() and psutil.pid_exists(int(self.main_process_id)): # psutil, is a protection against kill -9. + Log.info(f"Thread: stop_event: {self.stop_event.is_set()} - main_process_id: {psutil.pid_exists(int(self.main_process_id))}") try: logs_tried_to_retrieve = 0 while not self.recovery_queue.empty() and logs_tried_to_retrieve < max_logs_without_waiting_to_process: @@ -890,13 +882,13 @@ class Platform(object): try: if job: jobs_pending_to_process.add(job) - if not stop_event.is_set() and psutil.pid_exists(int(self.main_process_id)): + if not self.stop_event.is_set() and psutil.pid_exists(int(self.main_process_id)): self.connected = False self.restore_connection(None) except: pass self.recovery_queue.close() Log.info( - f"Thread: stop_event: {stop_event.is_set()} - main_process_id: {psutil.pid_exists(int(self.main_process_id))}") + f"Thread: stop_event: {self.stop_event.is_set()} - main_process_id: {psutil.pid_exists(int(self.main_process_id))}") Log.info(f"Thread: Exiting log recovery for {self.name} platform") \ No newline at end of file -- GitLab From a8b77bf47a5bc755c1102ecdda1880f4944fd9e4 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 22 Aug 2024 16:28:50 +0200 Subject: [PATCH 03/25] WIP: Looking good, some more fixes neccesaries and ready to be pushed --- autosubmit/autosubmit.py | 7 ++- autosubmit/job/job_list.py | 11 ++++- autosubmit/platforms/ecplatform.py | 2 - autosubmit/platforms/locplatform.py | 1 - autosubmit/platforms/paramiko_platform.py | 5 -- autosubmit/platforms/platform.py | 57 +++++++++++++---------- autosubmit/platforms/sgeplatform.py | 1 - 7 files changed, 45 insertions(+), 39 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index a64adfafb..b4e98d803 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1679,6 +1679,7 @@ class Autosubmit: for job in job_list.get_active(): if job.status != Status.WAITING: job.status = Status.READY + job.ready_start_date = strftime("%Y%m%d%H%M%S") while job_list.get_active(): Autosubmit.submit_ready_jobs(as_conf, job_list, platforms_to_test, packages_persistence, True, only_wrappers, hold=False) @@ -2323,10 +2324,8 @@ class Autosubmit: # Wait for all remaining threads of I/O, close remaining connections # search hint - finished run Log.info("Waiting for all logs to be updated") - # get all threads - threads = threading.enumerate() # print name - timeout = as_conf.experiment_data.get("CONFIG",{}).get("LAST_LOGS_TIMEOUT", 1) + timeout = as_conf.experiment_data.get("CONFIG",{}).get("LAST_LOGS_TIMEOUT", 20) for remaining in range(timeout, 0, -1): if len(job_list.get_completed_without_logs()) == 0: break @@ -2335,7 +2334,7 @@ class Autosubmit: job_list.update_log_status(job, as_conf) sleep(1) if remaining % 10 == 0: - Log.info(f"Timeout: {remaining}") + Log.info(f"Still logs remaining, Timeout: {remaining}") # Updating job data header with current information when experiment ends try: diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index de63452b4..4616c809e 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -305,6 +305,7 @@ class JobList(object): job.parameters = parameters if not job.has_parents(): job.status = Status.READY + job.ready_start_date = 
strftime("%Y%m%d%H%M%S") job.packed = False else: job.status = Status.WAITING @@ -2617,7 +2618,7 @@ class JobList(object): else: for log_recovered in self.path_to_logs.glob(f"{job.name}.*.out"): if job.local_logs[0] == log_recovered.name: - if log_recovered.stat().st_mtime > job.ready_start_date: + if job.ready_start_date and log_recovered.stat().st_mtime > job.ready_start_date: return log_recovered return None def update_list(self, as_conf, store_change=True, fromSetStatus=False, submitter=None, first_time=False): @@ -2679,6 +2680,7 @@ class JobList(object): "Resetting job: {0} status to: DELAYED for retrial...".format(job.name)) else: job.status = Status.READY + job.ready_start_date = strftime("%Y%m%d%H%M%S") job.packed = False Log.debug( "Resetting job: {0} status to: READY for retrial...".format(job.name)) @@ -2744,6 +2746,7 @@ class JobList(object): for job in self.get_delayed(): if datetime.datetime.now() >= job.delay_end: job.status = Status.READY + job.ready_start_date = strftime("%Y%m%d%H%M%S") job.packed = False for job in self.get_waiting(): tmp = [parent for parent in job.parents if @@ -2755,6 +2758,7 @@ class JobList(object): failed_ones = [parent for parent in job.parents if parent.status == Status.FAILED] if job.parents is None or len(tmp) == len(job.parents): job.status = Status.READY + job.ready_start_date = strftime("%Y%m%d%H%M%S") job.packed = False job.hold = False Log.debug( @@ -2775,6 +2779,8 @@ class JobList(object): break if not strong_dependencies_failure and weak_dependencies_failure: job.status = Status.READY + job.ready_start_date = strftime("%Y%m%d%H%M%S") + job.packed = False job.hold = False Log.debug( @@ -2788,6 +2794,7 @@ class JobList(object): for parent in job.parents: if parent.name in job.edge_info and job.edge_info[parent.name].get('optional', False): job.status = Status.READY + job.ready_start_date = strftime("%Y%m%d%H%M%S") job.packed = False job.hold = False Log.debug( @@ -2804,9 +2811,9 @@ class JobList(object): parent.status == Status.SKIPPED or parent.status == Status.FAILED] if len(tmp2) == len(job.parents) and len(tmp3) != len(job.parents): job.status = Status.READY + job.ready_start_date = strftime("%Y%m%d%H%M%S") job.packed = False # Run start time in format (YYYYMMDDHH:MM:SS) from current time - job.ready_start_date = strftime("%Y%m%d%H%M%S") job.packed = False job.hold = False save = True diff --git a/autosubmit/platforms/ecplatform.py b/autosubmit/platforms/ecplatform.py index 5f12c1ec9..36f694c3c 100644 --- a/autosubmit/platforms/ecplatform.py +++ b/autosubmit/platforms/ecplatform.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -import atexit # Copyright 2017-2020 Earth Sciences Department, BSC-CNS # This file is part of Autosubmit. 
@@ -197,7 +196,6 @@ class EcPlatform(ParamikoPlatform): :return: True :rtype: bool """ - self.main_process_id = os.getpid() output = subprocess.check_output(self._checkvalidcert_cmd, shell=True).decode(locale.getlocale()[1]) if not output: output = "" diff --git a/autosubmit/platforms/locplatform.py b/autosubmit/platforms/locplatform.py index 520a4b0f3..8e884bde9 100644 --- a/autosubmit/platforms/locplatform.py +++ b/autosubmit/platforms/locplatform.py @@ -115,7 +115,6 @@ class LocalPlatform(ParamikoPlatform): self.connected = True self.spawn_log_retrieval_process(as_conf) def test_connection(self,as_conf): - self.main_process_id = os.getpid() if not self.connected: self.connect(as_conf) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 8ad9983b1..bd4c96cde 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -1,7 +1,3 @@ -import atexit -import copy -import threading - import locale from contextlib import suppress from time import sleep @@ -120,7 +116,6 @@ class ParamikoPlatform(Platform): """ Test if the connection is still alive, reconnect if not. """ - self.main_process_id = os.getpid() try: if not self.connected: diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index 366ec1847..4fbf4ec58 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -1,3 +1,4 @@ +import atexit import queue # only for the exception import psutil import setproctitle @@ -5,14 +6,13 @@ import locale import os import traceback from autosubmit.job.job_common import Status -from typing import List, Union +from typing import List, Union, Callable from autosubmit.helpers.parameters import autosubmit_parameter from log.log import AutosubmitCritical, AutosubmitError, Log from multiprocessing import Process, Queue, Event -import atexit import time -def processed(fn): - def wrapper(*args, **kwargs): +def processed(fn: Callable) -> Callable[..., Process]: + def wrapper(*args, **kwargs) -> Process: process = Process(target=fn, args=args, kwargs=kwargs, name=f"{args[0].name}_platform") process.start() return process @@ -218,7 +218,7 @@ class Platform(object): def send_stop_signal(self,stop_event): Log.info(f"Sending stop signal to {self.name} platform to stop log recovery process") stop_event.set() - time.sleep(60) # matching the time of the log recovery process + def get_exclusive_directive(self, job): """ Returns exclusive directive for the specified job @@ -819,6 +819,8 @@ class Platform(object): raise NotImplementedError def spawn_log_retrieval_process(self,as_conf): + if not self.log_retrieval_process_active: + self.main_process_id = os.getpid() if not self.log_retrieval_process_active and ( as_conf is None or str(as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', "false")).lower() == "false"): @@ -826,7 +828,8 @@ class Platform(object): if as_conf and as_conf.misc_data.get("AS_COMMAND", "").lower() == "run": process = self.recover_job_logs_parent() process.join() - self.stop_event.set() + atexit.register(self.send_stop_signal, self.stop_event) # Register the stop signal to be sent when the program ends. ( Doesn't help with kill -9 ) + @processed def recover_job_logs_parent(self): """ @@ -834,11 +837,9 @@ class Platform(object): The main_process call to this function and waits until all process are spawn. 
( which avoids to this process to zombify ) This ensures that the recover_job_logs process is inherited by the init process, which reaps it automatically thus avoiding zombies. """ - Log.get_logger("Autosubmit") - self.recover_job_logs() - #self.stop_event.set() - return 0 - + Log.get_logger("Autosubmit") # Log needs to be initialized in the new process + self.recover_job_logs() # will run without block + os._exit(0) # needed so the event/queue is not closed. ( no cleanup ) @processed def recover_job_logs(self): @@ -847,48 +848,56 @@ class Platform(object): This is an independent process that will be spawned by the recover_job_logs_parent function. The exit of this process is controlled by the stop_event or if the main process is killed. """ + setproctitle.setproctitle(f"autosubmit log {self.expid} recovery {self.name.lower()}") - Log.get_logger("Autosubmit") - Log.info(f"Thread: Starting log recovery for {self.name} platform") + identifier = f"{self.name.lower()}(log_recovery):" + Log.get_logger("Autosubmit") # Log needs to be initialized in the new process + Log.info(f"{identifier} Starting...") job = None jobs_pending_to_process = set() self.connected = False self.restore_connection(None) + Log.info(f"{identifier} Sucesfully connected.") # check if id of self.main_process exists with ps ax | grep self.main_process_id max_logs_without_waiting_to_process = 60 - Log.info(f"Thread: stop_event: {self.stop_event.is_set()} - main_process_id: {psutil.pid_exists(int(self.main_process_id))}") while not self.stop_event.is_set() and psutil.pid_exists(int(self.main_process_id)): # psutil, is a protection against kill -9. - Log.info(f"Thread: stop_event: {self.stop_event.is_set()} - main_process_id: {psutil.pid_exists(int(self.main_process_id))}") try: logs_tried_to_retrieve = 0 while not self.recovery_queue.empty() and logs_tried_to_retrieve < max_logs_without_waiting_to_process: try: logs_tried_to_retrieve += 1 job = self.recovery_queue.get(timeout=1) # Should be non-empty, but added a timeout for other possible errors. + if job.platform_name != self.name: + Log.warning(f"{identifier} Job {job.name} is not from {self.name}. Skipping log retrieval.") + continue job.children = set() job.platform = self + Log.info(f"{identifier} Recovering log files for job {job.name}") job.retrieve_logfiles(self, raise_error=True) + Log.result(f"{identifier} Sucessfully recovered log files for job {job.name}") except queue.Empty: pass while len(jobs_pending_to_process) > 0: # jobs that had any issue during the log retrieval job = jobs_pending_to_process.pop() job.children = set() job.platform = self + Log.info(f"{identifier} (Retrial) Recovering log files for job {job.name}") job.retrieve_logfiles(self, raise_error=True) - time.sleep(1) + Log.result(f"{identifier} (Retrial) Sucesfully recovered log files for job {job.name}") + time.sleep(1) # TODO set a parameter for this except Exception as e: - Log.warning(f"Error in log recovery: {e}") + Log.warning(f"{identifier} Error while recovering logs: {str(e)}") try: - if job: + if job: # If log retrieval failed, add it to the pending jobs to process. jobs_pending_to_process.add(job) + if not self.stop_event.is_set() and psutil.pid_exists(int(self.main_process_id)): self.connected = False - self.restore_connection(None) + Log.info(f"{identifier} Attempting to restore connection") + self.restore_connection(None) # Always restore the connection on a failure. 
+ Log.result(f"{identifier} Sucessfully reconnected.") except: pass - self.recovery_queue.close() - Log.info( - f"Thread: stop_event: {self.stop_event.is_set()} - main_process_id: {psutil.pid_exists(int(self.main_process_id))}") - - Log.info(f"Thread: Exiting log recovery for {self.name} platform") \ No newline at end of file + os._exit(0) + self.recovery_queue.close() # diff --git a/autosubmit/platforms/sgeplatform.py b/autosubmit/platforms/sgeplatform.py index 44b8a1b84..1b956ff99 100644 --- a/autosubmit/platforms/sgeplatform.py +++ b/autosubmit/platforms/sgeplatform.py @@ -141,7 +141,6 @@ class SgePlatform(ParamikoPlatform): :return: True :rtype: bool """ - self.main_process_id = os.getpid() self.connected = True self.connected(as_conf,True) -- GitLab From 89aed473e77dca8f3b01d5df9d8577baf7041c40 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Fri, 23 Aug 2024 12:59:45 +0200 Subject: [PATCH 04/25] A new orphan/zombies code approach --- autosubmit/autosubmit.py | 18 +++++---- autosubmit/job/job.py | 15 ++++++- autosubmit/job/job_list.py | 35 +++++++++-------- autosubmit/platforms/platform.py | 67 +++++++++++++++++++++----------- 4 files changed, 85 insertions(+), 50 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index b4e98d803..b4ddcb85e 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1679,7 +1679,7 @@ class Autosubmit: for job in job_list.get_active(): if job.status != Status.WAITING: job.status = Status.READY - job.ready_start_date = strftime("%Y%m%d%H%M%S") + job.set_ready_date() while job_list.get_active(): Autosubmit.submit_ready_jobs(as_conf, job_list, platforms_to_test, packages_persistence, True, only_wrappers, hold=False) @@ -2324,17 +2324,19 @@ class Autosubmit: # Wait for all remaining threads of I/O, close remaining connections # search hint - finished run Log.info("Waiting for all logs to be updated") - # print name - timeout = as_conf.experiment_data.get("CONFIG",{}).get("LAST_LOGS_TIMEOUT", 20) + # One cycle log recovery is sys_timeout seconds ( default 60 ) + # The 20 seconds are a grace period to download these logs if the cycle just started. + sys_timeout = int(as_conf.experiment_data.get("LOG_RECOVERY_TIMEOUT",60)) + 20 # 20 is a gracetime period to download the last logs + timeout = as_conf.experiment_data.get("CONFIG",{}).get("LAST_LOGS_TIMEOUT", sys_timeout) + for job in job_list.get_completed_without_logs(): + job.platform = submitter.platforms[job.platform_name.upper()] + job_list.update_log_status(job, as_conf) # request an update of the log status for remaining in range(timeout, 0, -1): if len(job_list.get_completed_without_logs()) == 0: break - for job in job_list.get_completed_without_logs(): - job.platform = submitter.platforms[job.platform_name.upper()] - job_list.update_log_status(job, as_conf) - sleep(1) + sleep(1) # sleeps of one to ensure the fastest exit possible if remaining % 10 == 0: - Log.info(f"Still logs remaining, Timeout: {remaining}") + Log.info(f"Logs are being recovered... max time left: {remaining}") # Updating job data header with current information when experiment ends try: diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 0f5085f1f..de891bd95 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -22,6 +22,7 @@ Main module for Autosubmit. 
Only contains an interface class to all functionalit """ from collections import OrderedDict +from pathlib import Path from autosubmit.job import job_utils from contextlib import suppress @@ -248,12 +249,13 @@ class Job(object): # hetjobs self.het = None self.updated_log = False - self.ready_start_date = None + self.ready_start_date = {} self.log_retrieved = False self.start_time_written = False self.submit_time_timestamp = None # for wrappers, all jobs inside a wrapper are submitted at the same time self.finish_time_timestamp = None # for wrappers, with inner_retrials, the submission time should be the last finish_time of the previous retrial self._script = None # Inline code to be executed + self._log_recovery_retries = None def _init_runtime_parameters(self): # hetjobs self.het = {'HETSIZE': 0} @@ -1132,6 +1134,15 @@ class Job(object): except BaseException: log_retrieved = False self.log_retrieved = log_retrieved + + def set_ready_date(self): + """ + Sets the ready start date for the job + """ + if self.fail_count == 0: + self.ready_start_date = {} + self.ready_start_date[self.fail_count] = int(time.strftime("%Y%m%d%H%M%S")) + def retrieve_logfiles(self, platform, raise_error=False): """ Retrieves log files from remote host meant to be used inside a process. @@ -1139,8 +1150,8 @@ class Job(object): :param raise_error: boolean to raise an error if the logs are not retrieved :return: """ - backup_logname = copy.copy(self.local_logs) + backup_logname = copy.copy(self.local_logs) if self.wrapper_type == "vertical": stat_file = self.script_name[:-4] + "_STAT_" self.retrieve_internal_retrials_logfiles(platform) diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 4616c809e..1b69b69e3 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -37,7 +37,7 @@ import math import networkx as nx from bscearth.utils.date import date2str, parse_date from networkx import DiGraph -from time import localtime, strftime, mktime, time +from time import localtime, mktime, time import autosubmit.database.db_structure as DbStructure from autosubmit.helpers.data_transfer import JobRow @@ -305,7 +305,7 @@ class JobList(object): job.parameters = parameters if not job.has_parents(): job.status = Status.READY - job.ready_start_date = strftime("%Y%m%d%H%M%S") + job.set_ready_date() job.packed = False else: job.status = Status.WAITING @@ -2593,9 +2593,9 @@ class JobList(object): if hasattr(job, "x11") and job.x11: # X11 has it log writted in the run.out file. No need to check for log files as there are none job.updated_log = True return - log_recovered = self.check_if_log_is_recovered(job) - if log_recovered: # hasattr for backward compatibility + log_recovered = self.check_if_log_is_recovered(job) + if log_recovered: job.local_logs = (log_recovered.name, log_recovered.name[:-4] + ".err") # we only want the last one job.updated_log = True elif not job.updated_log and str(as_conf.platforms_data.get(job.platform.name, {}).get('DISABLE_RECOVERY_THREADS', "false")).lower() == "false": @@ -2609,18 +2609,19 @@ class JobList(object): - File timestamp should be greater than the job ready_start_date, otherwise is from a previous run. 
""" + if type(job.ready_start_date) is not dict: # retrocompatibility + job.ready_start_date[job.fail_count] = int(job.ready_date) if not hasattr(job, "updated_log") or not job.updated_log: if job.wrapper_type == "vertical" and job.fail_count > 0: for log_recovered in self.path_to_logs.glob(f"{job.name}.*._{job.fail_count}.out"): - if job.local_logs[0][-4] in log_recovered.name: - if log_recovered.stat().st_mtime > job.ready_start_date: - return log_recovered + if job.ready_start_date[job.fail_count] and int(datetime.datetime.fromtimestamp(log_recovered.stat().st_mtime).strftime("%Y%m%d%H%M%S")) > int(job.ready_start_date[job.fail_count]): + return log_recovered else: for log_recovered in self.path_to_logs.glob(f"{job.name}.*.out"): - if job.local_logs[0] == log_recovered.name: - if job.ready_start_date and log_recovered.stat().st_mtime > job.ready_start_date: - return log_recovered + if job.ready_start_date[job.fail_count] and int(datetime.datetime.fromtimestamp(log_recovered.stat().st_mtime).strftime("%Y%m%d%H%M%S")) > int(job.ready_start_date[job.fail_count]): + return log_recovered return None + def update_list(self, as_conf, store_change=True, fromSetStatus=False, submitter=None, first_time=False): # type: (AutosubmitConfig, bool, bool, object, bool) -> bool """ @@ -2680,7 +2681,7 @@ class JobList(object): "Resetting job: {0} status to: DELAYED for retrial...".format(job.name)) else: job.status = Status.READY - job.ready_start_date = strftime("%Y%m%d%H%M%S") + job.set_ready_date() job.packed = False Log.debug( "Resetting job: {0} status to: READY for retrial...".format(job.name)) @@ -2706,7 +2707,7 @@ class JobList(object): for job in self.check_special_status(): job.status = Status.READY # Run start time in format (YYYYMMDDHH:MM:SS) from current time - job.ready_start_date = strftime("%Y%m%d%H%M%S") + job.set_ready_date() job.id = None job.packed = False job.wrapper_type = None @@ -2746,7 +2747,7 @@ class JobList(object): for job in self.get_delayed(): if datetime.datetime.now() >= job.delay_end: job.status = Status.READY - job.ready_start_date = strftime("%Y%m%d%H%M%S") + job.set_ready_date() job.packed = False for job in self.get_waiting(): tmp = [parent for parent in job.parents if @@ -2758,7 +2759,7 @@ class JobList(object): failed_ones = [parent for parent in job.parents if parent.status == Status.FAILED] if job.parents is None or len(tmp) == len(job.parents): job.status = Status.READY - job.ready_start_date = strftime("%Y%m%d%H%M%S") + job.set_ready_date() job.packed = False job.hold = False Log.debug( @@ -2779,7 +2780,7 @@ class JobList(object): break if not strong_dependencies_failure and weak_dependencies_failure: job.status = Status.READY - job.ready_start_date = strftime("%Y%m%d%H%M%S") + job.set_ready_date() job.packed = False job.hold = False @@ -2794,7 +2795,7 @@ class JobList(object): for parent in job.parents: if parent.name in job.edge_info and job.edge_info[parent.name].get('optional', False): job.status = Status.READY - job.ready_start_date = strftime("%Y%m%d%H%M%S") + job.set_ready_date() job.packed = False job.hold = False Log.debug( @@ -2811,7 +2812,7 @@ class JobList(object): parent.status == Status.SKIPPED or parent.status == Status.FAILED] if len(tmp2) == len(job.parents) and len(tmp3) != len(job.parents): job.status = Status.READY - job.ready_start_date = strftime("%Y%m%d%H%M%S") + job.set_ready_date() job.packed = False # Run start time in format (YYYYMMDDHH:MM:SS) from current time job.packed = False diff --git a/autosubmit/platforms/platform.py 
b/autosubmit/platforms/platform.py index 4fbf4ec58..aed37a7f3 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -1,4 +1,5 @@ import atexit +import multiprocessing import queue # only for the exception import psutil import setproctitle @@ -9,8 +10,10 @@ from autosubmit.job.job_common import Status from typing import List, Union, Callable from autosubmit.helpers.parameters import autosubmit_parameter from log.log import AutosubmitCritical, AutosubmitError, Log -from multiprocessing import Process, Queue, Event +from multiprocessing import Process, Event +from multiprocessing.queues import Queue import time + def processed(fn: Callable) -> Callable[..., Process]: def wrapper(*args, **kwargs) -> Process: process = Process(target=fn, args=args, kwargs=kwargs, name=f"{args[0].name}_platform") @@ -18,6 +21,21 @@ def processed(fn: Callable) -> Callable[..., Process]: return process return wrapper + +class UniqueQueue(Queue): # The reason of this class is to avoid duplicates in the queue during the same run. That can happen if the log retrieval process didn't process it yet. + + def __init__(self, maxsize=-1, block=True, timeout=None): + self.block = block + self.timeout = timeout + self.all_items = set() # Won't be popped, so even if it is being processed by the log retrieval process, it won't be added again. + super().__init__(maxsize, ctx=multiprocessing.get_context()) + + def put(self, job, block=True, timeout=None): + if job.name not in self.all_items: + self.all_items.add(job.name) + Queue.put(self, job, block, timeout) + + class Platform(object): """ Class to manage the connections to the different platforms. @@ -88,7 +106,7 @@ class Platform(object): self.pw = auth_password else: self.pw = None - self.recovery_queue = Queue() + self.recovery_queue = UniqueQueue() self.log_retrieval_process_active = False self.main_process_id = None self.stop_event = Event() @@ -819,6 +837,9 @@ class Platform(object): raise NotImplementedError def spawn_log_retrieval_process(self,as_conf): + """ + This function, spawn a process that spawns another process to recover the logs of the jobs that have been submitted in this platform. + """ if not self.log_retrieval_process_active: self.main_process_id = os.getpid() if not self.log_retrieval_process_active and ( @@ -827,8 +848,8 @@ class Platform(object): self.log_retrieval_process_active = True if as_conf and as_conf.misc_data.get("AS_COMMAND", "").lower() == "run": process = self.recover_job_logs_parent() - process.join() - atexit.register(self.send_stop_signal, self.stop_event) # Register the stop signal to be sent when the program ends. ( Doesn't help with kill -9 ) + process.join() # Wait until the process is spawned. ( Avoids to zombify the process ) + atexit.register(self.send_stop_signal, self.stop_event) # Register the stop signal to be sent when Autosubmit ends correctly. ( Doesn't help with kill -9 ) @processed def recover_job_logs_parent(self): @@ -837,9 +858,9 @@ class Platform(object): The main_process call to this function and waits until all process are spawn. ( which avoids to this process to zombify ) This ensures that the recover_job_logs process is inherited by the init process, which reaps it automatically thus avoiding zombies. """ - Log.get_logger("Autosubmit") # Log needs to be initialized in the new process - self.recover_job_logs() # will run without block - os._exit(0) # needed so the event/queue is not closed. 
( no cleanup ) + Log.get_logger("Autosubmit") # Log needs to be initialized in the new process + self.recover_job_logs() # will run without block + os._exit(0) # needed so the event/queue is not closed. ( no cleanup ) @processed def recover_job_logs(self): @@ -847,57 +868,57 @@ class Platform(object): This function, recovers the logs of the jobs that have been submitted. This is an independent process that will be spawned by the recover_job_logs_parent function. The exit of this process is controlled by the stop_event or if the main process is killed. + Once a job is get from the queue, it will try to recover these log files until the end of the execution. """ setproctitle.setproctitle(f"autosubmit log {self.expid} recovery {self.name.lower()}") identifier = f"{self.name.lower()}(log_recovery):" - Log.get_logger("Autosubmit") # Log needs to be initialized in the new process + Log.get_logger("Autosubmit") # Log needs to be initialized in the new process Log.info(f"{identifier} Starting...") job = None jobs_pending_to_process = set() self.connected = False self.restore_connection(None) - Log.info(f"{identifier} Sucesfully connected.") + Log.result(f"{identifier} Sucessfully connected.") # check if id of self.main_process exists with ps ax | grep self.main_process_id max_logs_without_waiting_to_process = 60 - while not self.stop_event.is_set() and psutil.pid_exists(int(self.main_process_id)): # psutil, is a protection against kill -9. + while not self.stop_event.is_set() and psutil.pid_exists(int(self.main_process_id)): # stop_event is set by atexit, whenever Autosubmit ends correctly. psutil, is a protection against kill -9. try: logs_tried_to_retrieve = 0 while not self.recovery_queue.empty() and logs_tried_to_retrieve < max_logs_without_waiting_to_process: try: logs_tried_to_retrieve += 1 job = self.recovery_queue.get(timeout=1) # Should be non-empty, but added a timeout for other possible errors. - if job.platform_name != self.name: - Log.warning(f"{identifier} Job {job.name} is not from {self.name}. Skipping log retrieval.") - continue - job.children = set() - job.platform = self - Log.info(f"{identifier} Recovering log files for job {job.name}") + job.children = set() # Children can't be serialized, so we set it to an empty set for this process. + job.platform = self # change original platform to this process platform. + job._log_recovery_retries = 0 # reset the log recovery retries. + Log.debug(f"{identifier} Recovering log files for job {job.name}") job.retrieve_logfiles(self, raise_error=True) Log.result(f"{identifier} Sucessfully recovered log files for job {job.name}") except queue.Empty: pass + # This second while is to keep retring the failed jobs. 
while len(jobs_pending_to_process) > 0: # jobs that had any issue during the log retrieval job = jobs_pending_to_process.pop() job.children = set() job.platform = self - Log.info(f"{identifier} (Retrial) Recovering log files for job {job.name}") + job._log_recovery_retries += 1 + Log.debug(f"{identifier} (Retrial number: {job._log_recovery_retries}) Recovering log files for job {job.name}") job.retrieve_logfiles(self, raise_error=True) - Log.result(f"{identifier} (Retrial) Sucesfully recovered log files for job {job.name}") - time.sleep(1) # TODO set a parameter for this + Log.result(f"{identifier} (Retrial) Successfully recovered log files for job {job.name}") + time.sleep(self.config.get("LOG_RECOVERY_TIMEOUT", 60)) except Exception as e: Log.warning(f"{identifier} Error while recovering logs: {str(e)}") try: - if job: # If log retrieval failed, add it to the pending jobs to process. + if job and job._log_recovery_retries < 5: # If log retrieval failed, add it to the pending jobs to process. Avoids to keep trying the same job forever. jobs_pending_to_process.add(job) - if not self.stop_event.is_set() and psutil.pid_exists(int(self.main_process_id)): + if not self.stop_event.is_set() and psutil.pid_exists(int(self.main_process_id)): # The main process may be terminated already. self.connected = False Log.info(f"{identifier} Attempting to restore connection") self.restore_connection(None) # Always restore the connection on a failure. Log.result(f"{identifier} Sucessfully reconnected.") except: pass - os._exit(0) - self.recovery_queue.close() # + #os._exit(0) -- GitLab From 9a6ca265b99a64b827e896ae7bb70c39fc0b671c Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 26 Aug 2024 16:21:35 +0200 Subject: [PATCH 05/25] changes to signals and waitpid --- autosubmit/autosubmit.py | 2 + autosubmit/helpers/utils.py | 12 +++++ autosubmit/job/job.py | 4 +- autosubmit/job/job_list.py | 7 ++- autosubmit/job/job_packager.py | 10 +++- autosubmit/platforms/platform.py | 81 ++++++++++++++------------------ 6 files changed, 61 insertions(+), 55 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index b4ddcb85e..193ef60ba 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2148,6 +2148,8 @@ class Autosubmit: max_recovery_retrials = as_conf.experiment_data.get("CONFIG",{}).get("RECOVERY_RETRIALS",3650) # (72h - 122h ) recovery_retrials = 0 while job_list.get_active(): + for platform in platforms_to_test: # This can be a long process, so we need to tell to the log workers to keep alive. + platform.work_event.set() for job in [job for job in job_list.get_job_list() if job.status == Status.READY]: job.update_parameters(as_conf, {}) did_run = True diff --git a/autosubmit/helpers/utils.py b/autosubmit/helpers/utils.py index 4f05c7676..f706f6a63 100644 --- a/autosubmit/helpers/utils.py +++ b/autosubmit/helpers/utils.py @@ -145,3 +145,15 @@ def restore_platforms(platform_to_test, mail_notify=False, as_conf=None, expid=N raise AutosubmitCritical("Issues while checking the connectivity of platforms.", 7010, issues + "\n" + ssh_config_issues) +class AutosubmitUtils: + def __init__(self): + self.active_platforms = None + + def set_work_pending(self): + """ + This method will set the work_pending event to True for all active platforms. To indicate to the child processes that there is work to be done. 
+ """ + for platform in self.active_platforms: + platform.send_work_signal() + + diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index de891bd95..c72806d09 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1139,9 +1139,7 @@ class Job(object): """ Sets the ready start date for the job """ - if self.fail_count == 0: - self.ready_start_date = {} - self.ready_start_date[self.fail_count] = int(time.strftime("%Y%m%d%H%M%S")) + self.ready_start_date = int(time.strftime("%Y%m%d%H%M%S")) def retrieve_logfiles(self, platform, raise_error=False): """ diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 1b69b69e3..1e541a874 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -2609,16 +2609,15 @@ class JobList(object): - File timestamp should be greater than the job ready_start_date, otherwise is from a previous run. """ - if type(job.ready_start_date) is not dict: # retrocompatibility - job.ready_start_date[job.fail_count] = int(job.ready_date) + if not hasattr(job, "updated_log") or not job.updated_log: if job.wrapper_type == "vertical" and job.fail_count > 0: for log_recovered in self.path_to_logs.glob(f"{job.name}.*._{job.fail_count}.out"): - if job.ready_start_date[job.fail_count] and int(datetime.datetime.fromtimestamp(log_recovered.stat().st_mtime).strftime("%Y%m%d%H%M%S")) > int(job.ready_start_date[job.fail_count]): + if job.ready_start_date and int(datetime.datetime.fromtimestamp(log_recovered.stat().st_mtime).strftime("%Y%m%d%H%M%S")) > int(job.ready_start_date): return log_recovered else: for log_recovered in self.path_to_logs.glob(f"{job.name}.*.out"): - if job.ready_start_date[job.fail_count] and int(datetime.datetime.fromtimestamp(log_recovered.stat().st_mtime).strftime("%Y%m%d%H%M%S")) > int(job.ready_start_date[job.fail_count]): + if job.ready_start_date and int(datetime.datetime.fromtimestamp(log_recovered.stat().st_mtime).strftime("%Y%m%d%H%M%S")) > int(job.ready_start_date): return log_recovered return None diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py index 16e78bb87..47967143e 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -726,8 +726,11 @@ class JobPackagerVertical(object): stack = [(job, 1)] while stack: job, level = stack.pop() - if level % 10 == 0 and level > 0: + if level % 50 == 0 and level > 0: Log.info(f"Wrapper package creation is still ongoing. So far {level} jobs have been wrapped.") + for event in job.platform.worker_events: # This can be a long process, so we need to tell to the log workers to keep alive. + if not event.is_set(): + event.set() if len(self.jobs_list) >= self.wrapper_limits["max_v"] or len(self.jobs_list) >= \ self.wrapper_limits["max_by_section"][job.section] or len(self.jobs_list) >= self.wrapper_limits[ "max"]: @@ -919,8 +922,11 @@ class JobPackagerHorizontal(object): for section in jobs_by_section: current_package_by_section[section] = 0 for job in jobs_by_section[section]: - if jobs_processed % 10 == 0 and jobs_processed > 0: + if jobs_processed % 50 == 0 and jobs_processed > 0: Log.info(f"Wrapper package creation is still ongoing. So far {jobs_processed} jobs have been wrapped.") + for event in job.platform.worker_events: # This can be a long process, so we need to tell to the log workers to keep alive. 
+ if not event.is_set(): + event.set() job.update_parameters(wrapper_info[-1], {}) if str(job.processors).isdigit() and str(job.nodes).isdigit() and int(job.nodes) > 0 and int(job.processors) <= 1: job.processors = 0 diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index aed37a7f3..a9d638dd9 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -14,12 +14,6 @@ from multiprocessing import Process, Event from multiprocessing.queues import Queue import time -def processed(fn: Callable) -> Callable[..., Process]: - def wrapper(*args, **kwargs) -> Process: - process = Process(target=fn, args=args, kwargs=kwargs, name=f"{args[0].name}_platform") - process.start() - return process - return wrapper class UniqueQueue(Queue): # The reason of this class is to avoid duplicates in the queue during the same run. That can happen if the log retrieval process didn't process it yet. @@ -40,7 +34,7 @@ class Platform(object): """ Class to manage the connections to the different platforms. """ - + worker_events = list() def __init__(self, expid, name, config, auth_password = None): """ :param config: @@ -109,9 +103,12 @@ class Platform(object): self.recovery_queue = UniqueQueue() self.log_retrieval_process_active = False self.main_process_id = None - self.stop_event = Event() - + self.work_event = Event() + self.log_recovery_process = None + @classmethod + def update_workers(cls, event_worker): + cls.worker_events.append(event_worker) @property @autosubmit_parameter(name='current_arch') @@ -233,9 +230,8 @@ class Platform(object): def root_dir(self, value): self._root_dir = value - def send_stop_signal(self,stop_event): - Log.info(f"Sending stop signal to {self.name} platform to stop log recovery process") - stop_event.set() + def send_work_signal(self): + self.work_event.set() def get_exclusive_directive(self, job): """ @@ -840,37 +836,31 @@ class Platform(object): """ This function, spawn a process that spawns another process to recover the logs of the jobs that have been submitted in this platform. """ - if not self.log_retrieval_process_active: - self.main_process_id = os.getpid() if not self.log_retrieval_process_active and ( as_conf is None or str(as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', "false")).lower() == "false"): self.log_retrieval_process_active = True if as_conf and as_conf.misc_data.get("AS_COMMAND", "").lower() == "run": - process = self.recover_job_logs_parent() - process.join() # Wait until the process is spawned. ( Avoids to zombify the process ) - atexit.register(self.send_stop_signal, self.stop_event) # Register the stop signal to be sent when Autosubmit ends correctly. ( Doesn't help with kill -9 ) - - @processed - def recover_job_logs_parent(self): - """ - This function, spawn some processes to recover the logs of the jobs that have been submitted and exits. - The main_process call to this function and waits until all process are spawn. ( which avoids to this process to zombify ) - This ensures that the recover_job_logs process is inherited by the init process, which reaps it automatically thus avoiding zombies. - """ - Log.get_logger("Autosubmit") # Log needs to be initialized in the new process - self.recover_job_logs() # will run without block - os._exit(0) # needed so the event/queue is not closed. 
( no cleanup ) + Platform.update_workers(self.work_event) + self.log_recovery_process = Process(target=self.recover_job_logs, args=(), name=f"{self.name}_log_recovery") + self.log_recovery_process.daemon = True + self.log_recovery_process.start() + time.sleep(1) + os.waitpid(self.log_recovery_process.pid, os.WNOHANG) + Log.result(f"Process {self.log_recovery_process.name} started with pid {self.log_recovery_process.pid}") + atexit.register(self.send_stop_signal) + + def send_stop_signal(self): + self.work_event.clear() + self.log_recovery_process.join() - @processed def recover_job_logs(self): """ This function, recovers the logs of the jobs that have been submitted. This is an independent process that will be spawned by the recover_job_logs_parent function. - The exit of this process is controlled by the stop_event or if the main process is killed. + The exit of this process is controlled by the work_event. Once a job is get from the queue, it will try to recover these log files until the end of the execution. """ - setproctitle.setproctitle(f"autosubmit log {self.expid} recovery {self.name.lower()}") identifier = f"{self.name.lower()}(log_recovery):" Log.get_logger("Autosubmit") # Log needs to be initialized in the new process @@ -880,14 +870,10 @@ class Platform(object): self.connected = False self.restore_connection(None) Log.result(f"{identifier} Sucessfully connected.") - # check if id of self.main_process exists with ps ax | grep self.main_process_id - max_logs_without_waiting_to_process = 60 - while not self.stop_event.is_set() and psutil.pid_exists(int(self.main_process_id)): # stop_event is set by atexit, whenever Autosubmit ends correctly. psutil, is a protection against kill -9. + while not self.work_event.wait(60): try: - logs_tried_to_retrieve = 0 - while not self.recovery_queue.empty() and logs_tried_to_retrieve < max_logs_without_waiting_to_process: + while not self.recovery_queue.empty(): try: - logs_tried_to_retrieve += 1 job = self.recovery_queue.get(timeout=1) # Should be non-empty, but added a timeout for other possible errors. job.children = set() # Children can't be serialized, so we set it to an empty set for this process. job.platform = self # change original platform to this process platform. @@ -906,19 +892,22 @@ class Platform(object): Log.debug(f"{identifier} (Retrial number: {job._log_recovery_retries}) Recovering log files for job {job.name}") job.retrieve_logfiles(self, raise_error=True) Log.result(f"{identifier} (Retrial) Successfully recovered log files for job {job.name}") - time.sleep(self.config.get("LOG_RECOVERY_TIMEOUT", 60)) - except Exception as e: + time.sleep(10) Log.warning(f"{identifier} Error while recovering logs: {str(e)}") try: if job and job._log_recovery_retries < 5: # If log retrieval failed, add it to the pending jobs to process. Avoids to keep trying the same job forever. jobs_pending_to_process.add(job) - - if not self.stop_event.is_set() and psutil.pid_exists(int(self.main_process_id)): # The main process may be terminated already. - self.connected = False - Log.info(f"{identifier} Attempting to restore connection") - self.restore_connection(None) # Always restore the connection on a failure. - Log.result(f"{identifier} Sucessfully reconnected.") + self.connected = False + Log.info(f"{identifier} Attempting to restore connection") + self.restore_connection(None) # Always restore the connection on a failure. 
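# Aside (illustrative, not part of the diff): the handler above caps each job at a
# few log-recovery retrials and re-opens the connection after every failure. A
# minimal standalone sketch of that pattern; recover_one and reconnect are
# made-up stand-ins, not the real Autosubmit API.
MAX_LOG_RECOVERY_RETRIES = 5

def recover_pending(pending, recover_one, reconnect):
    while pending:
        job = pending.pop()
        try:
            recover_one(job)
        except Exception:
            job.retries = getattr(job, "retries", 0) + 1
            if job.retries < MAX_LOG_RECOVERY_RETRIES:
                pending.add(job)   # re-queue for another attempt, but not forever
            reconnect()            # always restore the connection after a failure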
+ Log.result(f"{identifier} Sucessfully reconnected.") except: pass - #os._exit(0) + self.work_event.clear() # Clear, if the event is never set again it means that the main process has finished and is waiting for this process to finish. + if self.recovery_queue.empty(): + if not self.work_event.wait(120): # While main process is active, it will set this signal to keep this process alive. + Log.info(f"{identifier} Exiting...") + break + time.sleep(self.config.get("LOG_RECOVERY_TIMEOUT", 1)) + Log.info(f"{identifier} Waiting for new jobs to recover logs...") \ No newline at end of file -- GitLab From 7e492a671111347e4dceeda958120efec43772ab Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 27 Aug 2024 16:32:42 +0200 Subject: [PATCH 06/25] Added cleanup signal --- autosubmit/autosubmit.py | 41 ++++--- autosubmit/job/job.py | 24 +++-- autosubmit/job/job_list.py | 26 ++--- autosubmit/job/job_packager.py | 8 +- autosubmit/platforms/platform.py | 176 +++++++++++++++++-------------- 5 files changed, 142 insertions(+), 133 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 193ef60ba..1c7a7a09b 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1679,7 +1679,6 @@ class Autosubmit: for job in job_list.get_active(): if job.status != Status.WAITING: job.status = Status.READY - job.set_ready_date() while job_list.get_active(): Autosubmit.submit_ready_jobs(as_conf, job_list, platforms_to_test, packages_persistence, True, only_wrappers, hold=False) @@ -2148,7 +2147,7 @@ class Autosubmit: max_recovery_retrials = as_conf.experiment_data.get("CONFIG",{}).get("RECOVERY_RETRIALS",3650) # (72h - 122h ) recovery_retrials = 0 while job_list.get_active(): - for platform in platforms_to_test: # This can be a long process, so we need to tell to the log workers to keep alive. + for platform in platforms_to_test: # Main loop, there will be logs to recover platform.work_event.set() for job in [job for job in job_list.get_job_list() if job.status == Status.READY]: job.update_parameters(as_conf, {}) @@ -2313,34 +2312,34 @@ class Autosubmit: raise AutosubmitCritical(message, 7000) except BaseException as e: raise # If this happens, there is a bug in the code or an exception not-well caught + time.sleep(5) Log.result("No more jobs to run.") + # search hint - finished run + for job in job_list.get_completed_without_logs(): + job_list.update_log_status(job, as_conf) + job_list.save() if not did_run and len(job_list.get_completed_without_logs()) > 0: - #connect to platforms Log.info(f"Connecting to the platforms, to recover missing logs") submitter = Autosubmit._get_submitter(as_conf) submitter.load_platforms(as_conf) if submitter.platforms is None: raise AutosubmitCritical("No platforms configured!!!", 7014) - platforms = [value for value in submitter.platforms.values()] - Autosubmit.restore_platforms(platforms, as_conf=as_conf, expid=expid) - # Wait for all remaining threads of I/O, close remaining connections - # search hint - finished run + platforms_to_test = [value for value in submitter.platforms.values()] + Autosubmit.restore_platforms(platforms_to_test, as_conf=as_conf, expid=expid) Log.info("Waiting for all logs to be updated") - # One cycle log recovery is sys_timeout seconds ( default 60 ) - # The 20 seconds are a grace period to download these logs if the cycle just started. 
- sys_timeout = int(as_conf.experiment_data.get("LOG_RECOVERY_TIMEOUT",60)) + 20 # 20 is a gracetime period to download the last logs - timeout = as_conf.experiment_data.get("CONFIG",{}).get("LAST_LOGS_TIMEOUT", sys_timeout) + if len(job_list.get_completed_without_logs()) > 0: + for p in platforms_to_test: + if p.log_recovery_process: + p.cleanup_event.set() + for p in platforms_to_test: + p.log_recovery_process.join() for job in job_list.get_completed_without_logs(): - job.platform = submitter.platforms[job.platform_name.upper()] - job_list.update_log_status(job, as_conf) # request an update of the log status - for remaining in range(timeout, 0, -1): - if len(job_list.get_completed_without_logs()) == 0: - break - sleep(1) # sleeps of one to ensure the fastest exit possible - if remaining % 10 == 0: - Log.info(f"Logs are being recovered... max time left: {remaining}") - - # Updating job data header with current information when experiment ends + job_list.update_log_status(job, as_conf) + job_list.save() + if len(job_list.get_completed_without_logs()) == 0: + Log.result(f"Autosubmit recovered all job logs.") + else: + Log.warning(f"Autosubmit couldn't recover the following job logs: {[job.name for job in job_list.get_completed_without_logs()]}") try: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index c72806d09..415552324 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -249,13 +249,14 @@ class Job(object): # hetjobs self.het = None self.updated_log = False - self.ready_start_date = {} self.log_retrieved = False self.start_time_written = False self.submit_time_timestamp = None # for wrappers, all jobs inside a wrapper are submitted at the same time self.finish_time_timestamp = None # for wrappers, with inner_retrials, the submission time should be the last finish_time of the previous retrial self._script = None # Inline code to be executed self._log_recovery_retries = None + self.ready_date = None + def _init_runtime_parameters(self): # hetjobs self.het = {'HETSIZE': 0} @@ -873,6 +874,13 @@ class Job(object): """Number of processors per node that the job can use.""" self._processors_per_node = value + def set_ready_date(self): + """ + Sets the ready start date for the job + """ + self.updated_log = False + self.ready_date = int(time.strftime("%Y%m%d%H%M%S")) + def inc_fail_count(self): """ Increments fail count @@ -1135,12 +1143,6 @@ class Job(object): log_retrieved = False self.log_retrieved = log_retrieved - def set_ready_date(self): - """ - Sets the ready start date for the job - """ - self.ready_start_date = int(time.strftime("%Y%m%d%H%M%S")) - def retrieve_logfiles(self, platform, raise_error=False): """ Retrieves log files from remote host meant to be used inside a process. @@ -1285,8 +1287,7 @@ class Job(object): # after checking the jobs , no job should have the status "submitted" Log.printlog("Job {0} in SUBMITTED status. 
This should never happen on this step..".format( self.name), 6008) - if self.status in [Status.COMPLETED, Status.FAILED]: - self.updated_log = False + # # Write start_time() if not already written and job is running, completed or failed # if self.status in [Status.RUNNING, Status.COMPLETED, Status.FAILED] and not self.start_time_written: @@ -1963,6 +1964,11 @@ class Job(object): """ as_conf.reload() self._init_runtime_parameters() + if hasattr(self, "start_time"): + self.start_time = time.time() + for event in self.platform.worker_events: # keep alive log retrieval workers. + if not event.is_set(): + event.set() # Parameters that affect to all the rest of parameters self.update_dict_parameters(as_conf) parameters = parameters.copy() diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 1e541a874..5cc9383d4 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -305,7 +305,6 @@ class JobList(object): job.parameters = parameters if not job.has_parents(): job.status = Status.READY - job.set_ready_date() job.packed = False else: job.status = Status.WAITING @@ -2590,10 +2589,11 @@ class JobList(object): """ if not hasattr(job, "updated_log"): # hasattr for backward compatibility (job.updated_logs is only for newer jobs, as the loaded ones may not have this set yet) job.updated_log = False + elif job.updated_log: + return if hasattr(job, "x11") and job.x11: # X11 has it log writted in the run.out file. No need to check for log files as there are none job.updated_log = True return - log_recovered = self.check_if_log_is_recovered(job) if log_recovered: job.local_logs = (log_recovered.name, log_recovered.name[:-4] + ".err") # we only want the last one @@ -2606,19 +2606,15 @@ class JobList(object): Check if the log is recovered. Conditions: - File must exist - - File timestamp should be greater than the job ready_start_date, otherwise is from a previous run. + - File timestamp should be greater than the job ready_date, otherwise is from a previous run. 
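# Aside (illustrative, not part of the diff): a standalone sketch of the condition
# described above, assuming ready_date is the integer YYYYMMDDHHMMSS stamp that
# set_ready_date stores; the helper name is made up.
from datetime import datetime
from pathlib import Path

def log_written_after_ready(log_file: Path, ready_date: int) -> bool:
    mtime = int(datetime.fromtimestamp(log_file.stat().st_mtime).strftime("%Y%m%d%H%M%S"))
    return bool(ready_date) and mtime > int(ready_date)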
""" if not hasattr(job, "updated_log") or not job.updated_log: - if job.wrapper_type == "vertical" and job.fail_count > 0: - for log_recovered in self.path_to_logs.glob(f"{job.name}.*._{job.fail_count}.out"): - if job.ready_start_date and int(datetime.datetime.fromtimestamp(log_recovered.stat().st_mtime).strftime("%Y%m%d%H%M%S")) > int(job.ready_start_date): - return log_recovered - else: - for log_recovered in self.path_to_logs.glob(f"{job.name}.*.out"): - if job.ready_start_date and int(datetime.datetime.fromtimestamp(log_recovered.stat().st_mtime).strftime("%Y%m%d%H%M%S")) > int(job.ready_start_date): - return log_recovered + for log_recovered in self.path_to_logs.glob(f"{job.name}.*.out"): + date = int(datetime.datetime.fromtimestamp(log_recovered.stat().st_mtime).strftime("%Y%m%d%H%M%S")) + if job.ready_date and int(datetime.datetime.fromtimestamp(log_recovered.stat().st_mtime).strftime("%Y%m%d%H%M%S")) > int(job.ready_date): + return log_recovered return None def update_list(self, as_conf, store_change=True, fromSetStatus=False, submitter=None, first_time=False): @@ -2680,7 +2676,6 @@ class JobList(object): "Resetting job: {0} status to: DELAYED for retrial...".format(job.name)) else: job.status = Status.READY - job.set_ready_date() job.packed = False Log.debug( "Resetting job: {0} status to: READY for retrial...".format(job.name)) @@ -2706,7 +2701,6 @@ class JobList(object): for job in self.check_special_status(): job.status = Status.READY # Run start time in format (YYYYMMDDHH:MM:SS) from current time - job.set_ready_date() job.id = None job.packed = False job.wrapper_type = None @@ -2746,7 +2740,6 @@ class JobList(object): for job in self.get_delayed(): if datetime.datetime.now() >= job.delay_end: job.status = Status.READY - job.set_ready_date() job.packed = False for job in self.get_waiting(): tmp = [parent for parent in job.parents if @@ -2758,7 +2751,6 @@ class JobList(object): failed_ones = [parent for parent in job.parents if parent.status == Status.FAILED] if job.parents is None or len(tmp) == len(job.parents): job.status = Status.READY - job.set_ready_date() job.packed = False job.hold = False Log.debug( @@ -2779,8 +2771,6 @@ class JobList(object): break if not strong_dependencies_failure and weak_dependencies_failure: job.status = Status.READY - job.set_ready_date() - job.packed = False job.hold = False Log.debug( @@ -2794,7 +2784,6 @@ class JobList(object): for parent in job.parents: if parent.name in job.edge_info and job.edge_info[parent.name].get('optional', False): job.status = Status.READY - job.set_ready_date() job.packed = False job.hold = False Log.debug( @@ -2811,7 +2800,6 @@ class JobList(object): parent.status == Status.SKIPPED or parent.status == Status.FAILED] if len(tmp2) == len(job.parents) and len(tmp3) != len(job.parents): job.status = Status.READY - job.set_ready_date() job.packed = False # Run start time in format (YYYYMMDDHH:MM:SS) from current time job.packed = False diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py index 47967143e..8472501c4 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -526,6 +526,8 @@ class JobPackager(object): for package in packages_to_submit: self.max_jobs = self.max_jobs - 1 package.hold = self.hold + for job in package.jobs: + job.set_ready_date() return packages_to_submit @@ -728,9 +730,6 @@ class JobPackagerVertical(object): job, level = stack.pop() if level % 50 == 0 and level > 0: Log.info(f"Wrapper package creation is still ongoing. 
So far {level} jobs have been wrapped.") - for event in job.platform.worker_events: # This can be a long process, so we need to tell to the log workers to keep alive. - if not event.is_set(): - event.set() if len(self.jobs_list) >= self.wrapper_limits["max_v"] or len(self.jobs_list) >= \ self.wrapper_limits["max_by_section"][job.section] or len(self.jobs_list) >= self.wrapper_limits[ "max"]: @@ -924,9 +923,6 @@ class JobPackagerHorizontal(object): for job in jobs_by_section[section]: if jobs_processed % 50 == 0 and jobs_processed > 0: Log.info(f"Wrapper package creation is still ongoing. So far {jobs_processed} jobs have been wrapped.") - for event in job.platform.worker_events: # This can be a long process, so we need to tell to the log workers to keep alive. - if not event.is_set(): - event.set() job.update_parameters(wrapper_info[-1], {}) if str(job.processors).isdigit() and str(job.nodes).isdigit() and int(job.nodes) > 0 and int(job.processors) <= 1: job.processors = 0 diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index a9d638dd9..2bf1ee9f7 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -1,6 +1,6 @@ import atexit import multiprocessing -import queue # only for the exception +import queue # only for the exception import psutil import setproctitle import locale @@ -15,13 +15,13 @@ from multiprocessing.queues import Queue import time - -class UniqueQueue(Queue): # The reason of this class is to avoid duplicates in the queue during the same run. That can happen if the log retrieval process didn't process it yet. +class UniqueQueue( + Queue): # The reason of this class is to avoid duplicates in the queue during the same run. That can happen if the log retrieval process didn't process it yet. def __init__(self, maxsize=-1, block=True, timeout=None): self.block = block self.timeout = timeout - self.all_items = set() # Won't be popped, so even if it is being processed by the log retrieval process, it won't be added again. + self.all_items = set() # Won't be popped, so even if it is being processed by the log retrieval process, it won't be added again. super().__init__(maxsize, ctx=multiprocessing.get_context()) def put(self, job, block=True, timeout=None): @@ -35,15 +35,16 @@ class Platform(object): Class to manage the connections to the different platforms. 
""" worker_events = list() - def __init__(self, expid, name, config, auth_password = None): + + def __init__(self, expid, name, config, auth_password=None): """ :param config: :param expid: :param name: """ self.connected = False - self.expid = expid # type: str - self._name = name # type: str + self.expid = expid # type: str + self._name = name # type: str self.config = config self.tmp_path = os.path.join( self.config.get("LOCAL_ROOT_DIR"), self.expid, self.config.get("LOCAL_TMP_DIR")) @@ -88,9 +89,9 @@ class Platform(object): self.cancel_cmd = None self.otp_timeout = None self.two_factor_auth = None - self.otp_timeout = self.config.get("PLATFORMS", {}).get(self.name.upper(),{}).get("2FA_TIMEOUT", 60*5) - self.two_factor_auth = self.config.get("PLATFORMS", {}).get(self.name.upper(),{}).get("2FA", False) - self.two_factor_method = self.config.get("PLATFORMS", {}).get(self.name.upper(),{}).get("2FA_METHOD", "token") + self.otp_timeout = self.config.get("PLATFORMS", {}).get(self.name.upper(), {}).get("2FA_TIMEOUT", 60 * 5) + self.two_factor_auth = self.config.get("PLATFORMS", {}).get(self.name.upper(), {}).get("2FA", False) + self.two_factor_method = self.config.get("PLATFORMS", {}).get(self.name.upper(), {}).get("2FA_METHOD", "token") if not self.two_factor_auth: self.pw = None elif auth_password is not None and self.two_factor_auth: @@ -104,6 +105,7 @@ class Platform(object): self.log_retrieval_process_active = False self.main_process_id = None self.work_event = Event() + self.cleanup_event = Event() self.log_recovery_process = None @classmethod @@ -230,9 +232,6 @@ class Platform(object): def root_dir(self, value): self._root_dir = value - def send_work_signal(self): - self.work_event.set() - def get_exclusive_directive(self, job): """ Returns exclusive directive for the specified job @@ -243,9 +242,10 @@ class Platform(object): """ # only implemented for slurm return "" - def get_multiple_jobids(self,job_list,valid_packages_to_submit,failed_packages,error_message="",hold=False): - return False,valid_packages_to_submit - #raise NotImplementedError + + def get_multiple_jobids(self, job_list, valid_packages_to_submit, failed_packages, error_message="", hold=False): + return False, valid_packages_to_submit + # raise NotImplementedError def process_batch_ready_jobs(self, valid_packages_to_submit, failed_packages, error_message="", hold=False): return True, valid_packages_to_submit @@ -475,7 +475,6 @@ class Platform(object): parameters['{0}EC_QUEUE'.format(prefix)] = self.ec_queue parameters['{0}PARTITION'.format(prefix)] = self.partition - parameters['{0}USER'.format(prefix)] = self.user parameters['{0}PROJ'.format(prefix)] = self.project parameters['{0}BUDG'.format(prefix)] = self.budget @@ -578,10 +577,12 @@ class Platform(object): if job.current_checkpoint_step < job.max_checkpoint_step: remote_checkpoint_path = f'{self.get_files_path()}/CHECKPOINT_' self.get_file(f'{remote_checkpoint_path}{str(job.current_checkpoint_step)}', False, ignore_log=True) - while self.check_file_exists(f'{remote_checkpoint_path}{str(job.current_checkpoint_step)}') and job.current_checkpoint_step < job.max_checkpoint_step: + while self.check_file_exists( + f'{remote_checkpoint_path}{str(job.current_checkpoint_step)}') and job.current_checkpoint_step < job.max_checkpoint_step: self.remove_checkpoint_file(f'{remote_checkpoint_path}{str(job.current_checkpoint_step)}') job.current_checkpoint_step += 1 self.get_file(f'{remote_checkpoint_path}{str(job.current_checkpoint_step)}', False, ignore_log=True) + def 
get_completed_files(self, job_name, retries=0, recovery=False, wrapper_failed=False): """ Get the COMPLETED file of the given job @@ -653,6 +654,7 @@ class Platform(object): Log.debug('{0} been removed', filename) return True return False + def remove_checkpoint_file(self, filename): """ Removes *CHECKPOINT* files from remote @@ -666,7 +668,7 @@ class Platform(object): def check_file_exists(self, src, wrapper_failed=False, sleeptime=5, max_retries=3, first=True): return True - def get_stat_file(self, job_name, retries=0, count = -1): + def get_stat_file(self, job_name, retries=0, count=-1): """ Copies *STAT* files from remote to local @@ -677,7 +679,7 @@ class Platform(object): :return: True if successful, False otherwise :rtype: bool """ - if count == -1: # No internal retrials + if count == -1: # No internal retrials filename = job_name + '_STAT' else: filename = job_name + '_STAT_{0}'.format(str(count)) @@ -692,7 +694,6 @@ class Platform(object): Log.debug('{0}_STAT file not found', job_name) return False - @autosubmit_parameter(name='current_logdir') def get_files_path(self): """ @@ -724,9 +725,11 @@ class Platform(object): :rtype: int """ raise NotImplementedError + def check_Alljobs(self, job_list, as_conf, retries=5): - for job,job_prev_status in job_list: + for job, job_prev_status in job_list: self.check_job(job) + def check_job(self, job, default_status=Status.COMPLETED, retries=5, submit_hold_check=False, is_wrapper=False): """ Checks job running status @@ -815,7 +818,7 @@ class Platform(object): # type: () -> None """ Opens Submit script file """ raise NotImplementedError - + def submit_Script(self, hold=False): # type: (bool) -> Union[List[str], str] """ @@ -829,12 +832,12 @@ class Platform(object): def connect(self, as_conf, reconnect=False): raise NotImplementedError - def restore_connection(self,as_conf): + def restore_connection(self, as_conf): raise NotImplementedError - def spawn_log_retrieval_process(self,as_conf): + def spawn_log_retrieval_process(self, as_conf): """ - This function, spawn a process that spawns another process to recover the logs of the jobs that have been submitted in this platform. + This function, spawn a process to recover the logs of the jobs that have been submitted in this platform. 
""" if not self.log_retrieval_process_active and ( as_conf is None or str(as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', @@ -842,72 +845,89 @@ class Platform(object): self.log_retrieval_process_active = True if as_conf and as_conf.misc_data.get("AS_COMMAND", "").lower() == "run": Platform.update_workers(self.work_event) - self.log_recovery_process = Process(target=self.recover_job_logs, args=(), name=f"{self.name}_log_recovery") + self.log_recovery_process = Process(target=self.recover_platform_job_logs, args=(), + name=f"{self.name}_log_recovery") self.log_recovery_process.daemon = True self.log_recovery_process.start() time.sleep(1) os.waitpid(self.log_recovery_process.pid, os.WNOHANG) Log.result(f"Process {self.log_recovery_process.name} started with pid {self.log_recovery_process.pid}") - atexit.register(self.send_stop_signal) + atexit.register(self.send_cleanup_signal) - def send_stop_signal(self): - self.work_event.clear() - self.log_recovery_process.join() + def send_cleanup_signal(self): + if self.log_recovery_process and self.log_recovery_process.is_alive(): + self.work_event.clear() + self.cleanup_event.set() + self.log_recovery_process.join() + + def wait_for_work(self, timeout=60): + """ + This function, waits for the work_event to be set by the main process. If it is not set, it returns False and the log recovery process ends. + """ + finish = False + for remaining in range(timeout, 0, -2): # Wait the full timeout to not hammer the CPU. + if not finish and not self.work_event.is_set(): + finish = True + time.sleep(2) + return finish - def recover_job_logs(self): + def recover_job_log(self, identifier, jobs_pending_to_process): + job = None + try: + while not self.recovery_queue.empty(): + try: + job = self.recovery_queue.get( + timeout=1) # Should be non-empty, but added a timeout for other possible errors. + job.children = set() # Children can't be serialized, so we set it to an empty set for this process. + job.platform = self # change the original platform to this process platform. + job._log_recovery_retries = 0 # reset the log recovery retries. + Log.debug(f"{identifier} Recovering log files for job {job.name}") + job.retrieve_logfiles(self, raise_error=True) + Log.result(f"{identifier} Sucessfully recovered log files for job {job.name}") + except queue.Empty: + pass + # This second while is to keep retring the failed jobs. + while len(jobs_pending_to_process) > 0: # jobs that had any issue during the log retrieval + job = jobs_pending_to_process.pop() + job.children = set() + job.platform = self + job._log_recovery_retries += 1 + Log.debug( + f"{identifier} (Retrial number: {job._log_recovery_retries}) Recovering log files for job {job.name}") + job.retrieve_logfiles(self, raise_error=True) + Log.result(f"{identifier} (Retrial) Successfully recovered log files for job {job.name}") + except Exception as e: + Log.warning(f"{identifier} Error while recovering logs: {str(e)}") + try: + if job and job._log_recovery_retries < 5: # If log retrieval failed, add it to the pending jobs to process. Avoids to keep trying the same job forever. + jobs_pending_to_process.add(job) + self.connected = False + Log.info(f"{identifier} Attempting to restore connection") + self.restore_connection(None) # Always restore the connection on a failure. + Log.result(f"{identifier} Sucessfully reconnected.") + except: + pass + return jobs_pending_to_process + def recover_platform_job_logs(self): """ This function, recovers the logs of the jobs that have been submitted. 
- This is an independent process that will be spawned by the recover_job_logs_parent function. - The exit of this process is controlled by the work_event. - Once a job is get from the queue, it will try to recover these log files until the end of the execution. + The exit of this process is controlled by the work_event and cleanup_events of the main process. """ setproctitle.setproctitle(f"autosubmit log {self.expid} recovery {self.name.lower()}") identifier = f"{self.name.lower()}(log_recovery):" - Log.get_logger("Autosubmit") # Log needs to be initialized in the new process Log.info(f"{identifier} Starting...") - job = None jobs_pending_to_process = set() self.connected = False self.restore_connection(None) + Log.get_logger("Autosubmit") # Log needs to be initialised in the new process Log.result(f"{identifier} Sucessfully connected.") - while not self.work_event.wait(60): - try: - while not self.recovery_queue.empty(): - try: - job = self.recovery_queue.get(timeout=1) # Should be non-empty, but added a timeout for other possible errors. - job.children = set() # Children can't be serialized, so we set it to an empty set for this process. - job.platform = self # change original platform to this process platform. - job._log_recovery_retries = 0 # reset the log recovery retries. - Log.debug(f"{identifier} Recovering log files for job {job.name}") - job.retrieve_logfiles(self, raise_error=True) - Log.result(f"{identifier} Sucessfully recovered log files for job {job.name}") - except queue.Empty: - pass - # This second while is to keep retring the failed jobs. - while len(jobs_pending_to_process) > 0: # jobs that had any issue during the log retrieval - job = jobs_pending_to_process.pop() - job.children = set() - job.platform = self - job._log_recovery_retries += 1 - Log.debug(f"{identifier} (Retrial number: {job._log_recovery_retries}) Recovering log files for job {job.name}") - job.retrieve_logfiles(self, raise_error=True) - Log.result(f"{identifier} (Retrial) Successfully recovered log files for job {job.name}") - except Exception as e: - time.sleep(10) - Log.warning(f"{identifier} Error while recovering logs: {str(e)}") - try: - if job and job._log_recovery_retries < 5: # If log retrieval failed, add it to the pending jobs to process. Avoids to keep trying the same job forever. - jobs_pending_to_process.add(job) - self.connected = False - Log.info(f"{identifier} Attempting to restore connection") - self.restore_connection(None) # Always restore the connection on a failure. - Log.result(f"{identifier} Sucessfully reconnected.") - except: - pass - self.work_event.clear() # Clear, if the event is never set again it means that the main process has finished and is waiting for this process to finish. - if self.recovery_queue.empty(): - if not self.work_event.wait(120): # While main process is active, it will set this signal to keep this process alive. - Log.info(f"{identifier} Exiting...") - break - time.sleep(self.config.get("LOG_RECOVERY_TIMEOUT", 1)) - Log.info(f"{identifier} Waiting for new jobs to recover logs...") \ No newline at end of file + default_timeout = 60 + timeout = max(self.config.get("LOG_RECOVERY_TIMEOUT", default_timeout), default_timeout) + while self.wait_for_work(timeout=timeout): + self.work_event.clear() + jobs_pending_to_process = self.recover_job_log(identifier, jobs_pending_to_process) + if self.cleanup_event.is_set(): # Check if main process is waiting for this child to end. 
+ self.recover_job_log(identifier, jobs_pending_to_process) + break + Log.info(f"{identifier} Exiting.") + -- GitLab From 395ca051ebe44165e8ea21fccdc8b5b4ad2bbe75 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 27 Aug 2024 17:00:08 +0200 Subject: [PATCH 07/25] cleanup code --- autosubmit/helpers/utils.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/autosubmit/helpers/utils.py b/autosubmit/helpers/utils.py index f706f6a63..4f05c7676 100644 --- a/autosubmit/helpers/utils.py +++ b/autosubmit/helpers/utils.py @@ -145,15 +145,3 @@ def restore_platforms(platform_to_test, mail_notify=False, as_conf=None, expid=N raise AutosubmitCritical("Issues while checking the connectivity of platforms.", 7010, issues + "\n" + ssh_config_issues) -class AutosubmitUtils: - def __init__(self): - self.active_platforms = None - - def set_work_pending(self): - """ - This method will set the work_pending event to True for all active platforms. To indicate to the child processes that there is work to be done. - """ - for platform in self.active_platforms: - platform.send_work_signal() - - -- GitLab From 92ebf594755f1ac1c9cae12e8b1c678ab8e54b72 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 27 Aug 2024 17:01:25 +0200 Subject: [PATCH 08/25] cleanup code --- autosubmit/job/job_packages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 313f9cfa3..309881657 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -54,7 +54,7 @@ def jobs_in_wrapper_str(as_conf, current_wrapper): else: jobs_in_wrapper = jobs_in_wrapper.split(" ") jobs_in_wrapper = [job.strip(" ,") for job in jobs_in_wrapper] - return "_".join(jobs_in_wrapper) + return "&".join(jobs_in_wrapper) class JobPackageBase(object): """ Class to manage the package of jobs to be submitted by autosubmit -- GitLab From 6e83e97b8acf3ff02490837b9520d1b4d04704d1 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 3 Sep 2024 14:57:18 +0200 Subject: [PATCH 09/25] Added tests for signals --- autosubmit/platforms/platform.py | 19 ++--- test/unit/test_log_recovery.py | 127 +++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 9 deletions(-) create mode 100644 test/unit/test_log_recovery.py diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index 2bf1ee9f7..b8709044e 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -849,7 +849,6 @@ class Platform(object): name=f"{self.name}_log_recovery") self.log_recovery_process.daemon = True self.log_recovery_process.start() - time.sleep(1) os.waitpid(self.log_recovery_process.pid, os.WNOHANG) Log.result(f"Process {self.log_recovery_process.name} started with pid {self.log_recovery_process.pid}") atexit.register(self.send_cleanup_signal) @@ -864,12 +863,14 @@ class Platform(object): """ This function, waits for the work_event to be set by the main process. If it is not set, it returns False and the log recovery process ends. """ - finish = False - for remaining in range(timeout, 0, -2): # Wait the full timeout to not hammer the CPU. - if not finish and not self.work_event.is_set(): - finish = True - time.sleep(2) - return finish + keep_working = False + for remaining in range(timeout, 0, -1): # Wait the full timeout to not hammer the CPU. 
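# Aside (illustrative, not part of the diff): a simplified, self-contained sketch of
# the keep-alive handshake these patches implement between the main process and the
# per-platform log-recovery child, using only multiprocessing primitives. The worker
# body is a stand-in for draining the recovery queue; the real code also polls in
# one-second steps and checks the queue itself.
from multiprocessing import Event, Process

def log_worker(work_event, cleanup_event, timeout=60):
    while True:
        got_work = work_event.wait(timeout)   # main loop sets this while jobs are active
        work_event.clear()
        # ... drain the platform's recovery queue and fetch remote logs here ...
        if cleanup_event.is_set():            # main process asked for a final drain
            break
        if not got_work:                      # no signal for a whole cycle: assume main ended
            break

if __name__ == "__main__":
    work, cleanup = Event(), Event()
    child = Process(target=log_worker, args=(work, cleanup), daemon=True)
    child.start()
    work.set()      # "there is still work coming"
    cleanup.set()   # at exit: "drain once more, then stop"
    child.join()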
+ time.sleep(1) + if not keep_working and self.work_event.is_set(): + Log.info(f"{self.name.lower()}(log_recovery): Work event set.") + keep_working = True + self.work_event.clear() + return keep_working def recover_job_log(self, identifier, jobs_pending_to_process): job = None @@ -908,6 +909,7 @@ class Platform(object): except: pass return jobs_pending_to_process + def recover_platform_job_logs(self): """ This function, recovers the logs of the jobs that have been submitted. @@ -922,9 +924,8 @@ class Platform(object): Log.get_logger("Autosubmit") # Log needs to be initialised in the new process Log.result(f"{identifier} Sucessfully connected.") default_timeout = 60 - timeout = max(self.config.get("LOG_RECOVERY_TIMEOUT", default_timeout), default_timeout) + timeout = max(int(self.config.get("LOG_RECOVERY_TIMEOUT", default_timeout)), default_timeout) while self.wait_for_work(timeout=timeout): - self.work_event.clear() jobs_pending_to_process = self.recover_job_log(identifier, jobs_pending_to_process) if self.cleanup_event.is_set(): # Check if main process is waiting for this child to end. self.recover_job_log(identifier, jobs_pending_to_process) diff --git a/test/unit/test_log_recovery.py b/test/unit/test_log_recovery.py new file mode 100644 index 000000000..11e20ddac --- /dev/null +++ b/test/unit/test_log_recovery.py @@ -0,0 +1,127 @@ +import time +import pytest +from pathlib import Path +import os +import pwd +from autosubmitconfigparser.config.basicconfig import BasicConfig +from autosubmitconfigparser.config.configcommon import AutosubmitConfig + +def _get_script_files_path() -> Path: + return Path(__file__).resolve().parent / 'files' + + +@pytest.fixture +def current_tmpdir(tmpdir_factory): + folder = tmpdir_factory.mktemp(f'tests') + os.mkdir(folder.join('scratch')) + os.mkdir(folder.join('scheduler_tmp_dir')) + file_stat = os.stat(f"{folder.strpath}") + file_owner_id = file_stat.st_uid + file_owner = pwd.getpwuid(file_owner_id).pw_name + folder.owner = file_owner + return folder + + +@pytest.fixture +def prepare_test(current_tmpdir): + # touch as_misc + platforms_path = Path(f"{current_tmpdir.strpath}/platforms_t000.yml") + jobs_path = Path(f"{current_tmpdir.strpath}/jobs_t000.yml") + project = "whatever" + scratch_dir = f"{current_tmpdir.strpath}/scratch" + Path(f"{scratch_dir}/{project}/{current_tmpdir.owner}").mkdir(parents=True, exist_ok=True) + + # Add each platform to test + with platforms_path.open('w') as f: + f.write(f""" +PLATFORMS: + pytest-ps: + type: ps + host: 127.0.0.1 + user: {current_tmpdir.owner} + project: {project} + scratch_dir: {scratch_dir} + """) + # add a job of each platform type + with jobs_path.open('w') as f: + f.write(f""" +JOBS: + base: + SCRIPT: | + echo "Hello World" + echo sleep 5 + QUEUE: hpc + PLATFORM: pytest-ps + RUNNING: once + wallclock: 00:01 +EXPERIMENT: + # List of start dates + DATELIST: '20000101' + # List of members. + MEMBERS: fc0 + # Unit of the chunk size. Can be hour, day, month, or year. + CHUNKSIZEUNIT: month + # Size of each chunk. + CHUNKSIZE: '4' + # Number of chunks of the experiment. + NUMCHUNKS: '2' + CHUNKINI: '' + # Calendar used for the experiment. Can be standard or noleap. 
+ CALENDAR: standard + """) + return current_tmpdir + + +@pytest.fixture +def local(prepare_test): + # Init Local platform + from autosubmit.platforms.locplatform import LocalPlatform + config = { + 'LOCAL_ROOT_DIR': f"{prepare_test}/scratch", + 'LOCAL_TMP_DIR': f"{prepare_test}/scratch", + } + local = LocalPlatform(expid='t000', name='local', config=config) + return local + + +@pytest.fixture +def as_conf(prepare_test): + as_conf = AutosubmitConfig("t000") + as_conf.experiment_data = as_conf.load_config_file(as_conf.experiment_data, Path(prepare_test.join('platforms_t000.yml'))) + as_conf.misc_data = {"AS_COMMAND": "run"} + return as_conf + + +def test_log_recovery_no_keep_alive(prepare_test, local, mocker, as_conf): + mocker.patch('autosubmit.platforms.platform.max', return_value=1) + local.spawn_log_retrieval_process(as_conf) + assert local.log_recovery_process.is_alive() + time.sleep(2) + assert local.log_recovery_process.is_alive() is False + + +def test_log_recovery_keep_alive(prepare_test, local, mocker, as_conf): + mocker.patch('autosubmit.platforms.platform.max', return_value=3) + local.spawn_log_retrieval_process(as_conf) + assert local.log_recovery_process.is_alive() + local.work_event.set() + time.sleep(3) + assert local.log_recovery_process.is_alive() + local.work_event.set() + time.sleep(3) + assert local.log_recovery_process.is_alive() + time.sleep(3) + assert local.log_recovery_process.is_alive() is False + + +def test_log_recovery_keep_alive_cleanup(prepare_test, local, mocker, as_conf): + mocker.patch('autosubmit.platforms.platform.max', return_value=3) + local.spawn_log_retrieval_process(as_conf) + assert local.log_recovery_process.is_alive() + local.work_event.set() + time.sleep(3) + assert local.log_recovery_process.is_alive() + local.work_event.set() + local.cleanup_event.set() + time.sleep(3) + assert local.log_recovery_process.is_alive() is False \ No newline at end of file -- GitLab From e71ac31af7eae46af47c3448edcb85275912e011 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 3 Sep 2024 14:58:06 +0200 Subject: [PATCH 10/25] removed debug message --- autosubmit/platforms/platform.py | 1 - 1 file changed, 1 deletion(-) diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index b8709044e..d4467b1e7 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -867,7 +867,6 @@ class Platform(object): for remaining in range(timeout, 0, -1): # Wait the full timeout to not hammer the CPU. time.sleep(1) if not keep_working and self.work_event.is_set(): - Log.info(f"{self.name.lower()}(log_recovery): Work event set.") keep_working = True self.work_event.clear() return keep_working -- GitLab From e1818b76516598579d79928d79bccbd5ef8f2b70 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 3 Sep 2024 15:00:29 +0200 Subject: [PATCH 11/25] Changed the timeout --- autosubmit/platforms/platform.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index d4467b1e7..f40faa30f 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -865,9 +865,13 @@ class Platform(object): """ keep_working = False for remaining in range(timeout, 0, -1): # Wait the full timeout to not hammer the CPU. 
- time.sleep(1) - if not keep_working and self.work_event.is_set(): - keep_working = True + if keep_working: + time.sleep(remaining) + break + else: + time.sleep(1) + if self.work_event.is_set(): + keep_working = True self.work_event.clear() return keep_working -- GitLab From 7f591d185a6c80d5a4a5289d9cd27b0317eee8f9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 3 Sep 2024 18:42:25 +0200 Subject: [PATCH 12/25] added more tests --- autosubmit/job/job.py | 41 +++++++++++++++++--------------- autosubmit/platforms/platform.py | 13 ++++------ test/unit/test_log_recovery.py | 34 ++++++++++++++++++++++++-- 3 files changed, 59 insertions(+), 29 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 415552324..3462cf865 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -270,7 +270,7 @@ class Job(object): self._memory = '' self._memory_per_task = '' self.log_retrieved = False - self.start_time_placeholder = "" + self.start_time_placeholder = time.time() self.processors_per_node = "" @@ -1143,6 +1143,25 @@ class Job(object): log_retrieved = False self.log_retrieved = log_retrieved + def write_stats(self, stat_file, platform): + # Update the logs with Autosubmit Job ID Brand + try: + for local_log in self.local_logs: + platform.write_jobid(self.id, os.path.join( + self._tmp_path, 'LOG_' + str(self.expid), local_log)) + except BaseException as e: + Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format(str(e), self.name)) + # write stats + if self.wrapper_type == "vertical": # Disable AS retrials for vertical wrappers to use internal ones + for i in range(0, int(self.retrials + 1)): + if self.platform.get_stat_file(self.name, stat_file, count=i): + self.write_vertical_time(i) + self.inc_fail_count() + else: + self.platform.get_stat_file(self.name, stat_file) + self.write_start_time(from_stat_file=True) + self.write_end_time(self.status == Status.COMPLETED) + def retrieve_logfiles(self, platform, raise_error=False): """ Retrieves log files from remote host meant to be used inside a process. @@ -1165,25 +1184,9 @@ class Job(object): raise AutosubmitCritical("Failed to retrieve logs for job {0}".format(self.name), 6000) else: Log.printlog("Failed to retrieve logs for job {0}".format(self.name), 6000) - else: - # Update the logs with Autosubmit Job ID Brand - try: - for local_log in self.local_logs: - platform.write_jobid(self.id, os.path.join( - self._tmp_path, 'LOG_' + str(self.expid), local_log)) - except BaseException as e: - Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format(str(e), self.name)) - # write stats - if self.wrapper_type == "vertical": # Disable AS retrials for vertical wrappers to use internal ones - for i in range(0,int(self.retrials+1)): - if self.platform.get_stat_file(self.name, stat_file, count=i): - self.write_vertical_time(i) - self.inc_fail_count() - else: - self.platform.get_stat_file(self.name, stat_file) - self.write_start_time(from_stat_file=True) - self.write_end_time(self.status == Status.COMPLETED) + self.write_stats(stat_file, platform) + def parse_time(self,wallclock): regex = re.compile(r'(((?P\d+):)((?P\d+)))(:(?P\d+))?') diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index f40faa30f..c15422fa0 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -865,13 +865,11 @@ class Platform(object): """ keep_working = False for remaining in range(timeout, 0, -1): # Wait the full timeout to not hammer the CPU. 
- if keep_working: - time.sleep(remaining) + time.sleep(1) + if self.work_event.is_set() or not self.recovery_queue.empty(): + keep_working = True + if not self.recovery_queue.empty() or self.cleanup_event.is_set(): break - else: - time.sleep(1) - if self.work_event.is_set(): - keep_working = True self.work_event.clear() return keep_working @@ -933,5 +931,4 @@ class Platform(object): if self.cleanup_event.is_set(): # Check if main process is waiting for this child to end. self.recover_job_log(identifier, jobs_pending_to_process) break - Log.info(f"{identifier} Exiting.") - + Log.info(f"{identifier} Exiting.") \ No newline at end of file diff --git a/test/unit/test_log_recovery.py b/test/unit/test_log_recovery.py index 11e20ddac..ef060c845 100644 --- a/test/unit/test_log_recovery.py +++ b/test/unit/test_log_recovery.py @@ -3,8 +3,12 @@ import pytest from pathlib import Path import os import pwd + +from autosubmit.job.job_common import Status from autosubmitconfigparser.config.basicconfig import BasicConfig from autosubmitconfigparser.config.configcommon import AutosubmitConfig +from autosubmit.job.job import Job + def _get_script_files_path() -> Path: return Path(__file__).resolve().parent / 'files' @@ -14,7 +18,6 @@ def _get_script_files_path() -> Path: def current_tmpdir(tmpdir_factory): folder = tmpdir_factory.mktemp(f'tests') os.mkdir(folder.join('scratch')) - os.mkdir(folder.join('scheduler_tmp_dir')) file_stat = os.stat(f"{folder.strpath}") file_owner_id = file_stat.st_uid file_owner = pwd.getpwuid(file_owner_id).pw_name @@ -30,6 +33,9 @@ def prepare_test(current_tmpdir): project = "whatever" scratch_dir = f"{current_tmpdir.strpath}/scratch" Path(f"{scratch_dir}/{project}/{current_tmpdir.owner}").mkdir(parents=True, exist_ok=True) + Path(f"{scratch_dir}/LOG_t000").mkdir(parents=True, exist_ok=True) + Path(f"{scratch_dir}/LOG_t000/t000.cmd.out.0").touch() + Path(f"{scratch_dir}/LOG_t000/t000.cmd.err.0").touch() # Add each platform to test with platforms_path.open('w') as f: @@ -124,4 +130,28 @@ def test_log_recovery_keep_alive_cleanup(prepare_test, local, mocker, as_conf): local.work_event.set() local.cleanup_event.set() time.sleep(3) - assert local.log_recovery_process.is_alive() is False \ No newline at end of file + assert local.log_recovery_process.is_alive() is False + + +def test_log_recovery_recover_log(prepare_test, local, mocker, as_conf): + mocker.patch('autosubmit.platforms.platform.max', return_value=20) + mocker.patch('autosubmit.job.job.Job.write_stats') # not sure how to test this + local.spawn_log_retrieval_process(as_conf) + local.work_event.set() + job = Job('t000', '0000', Status.COMPLETED, 0) + job.name = 'test_job' + job.platform = local + job.platform_name = 'local' + job.local_logs = ("t000.cmd.out.moved","t000.cmd.err.moved") + job._init_runtime_parameters() + local.work_event.set() + local.add_job_to_log_recover(job) + local.cleanup_event.set() + local.log_recovery_process.join(30) # should exit earlier. 
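# Aside (illustrative, not part of the diff): a compact form of the shutdown check the
# tests above repeat — request cleanup, wait a bounded time, then assert the child
# exited. The helper name is made up; the attributes are the ones added by this series.
def assert_recovery_process_exits(platform, timeout=30):
    platform.cleanup_event.set()                  # ask the child for a final drain
    platform.log_recovery_process.join(timeout)   # bounded wait so the test never hangs
    assert not platform.log_recovery_process.is_alive()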
+ assert local.log_recovery_process.is_alive() is False + assert Path(f"{prepare_test.strpath}/scratch/LOG_t000/t000.cmd.out.moved").exists() + assert Path(f"{prepare_test.strpath}/scratch/LOG_t000/t000.cmd.err.moved").exists() + + + + -- GitLab From 405169812b0c30333a3de0eb323e0bf251f2f38a Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 4 Sep 2024 11:09:55 +0200 Subject: [PATCH 13/25] clean import --- test/unit/test_log_recovery.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/test/unit/test_log_recovery.py b/test/unit/test_log_recovery.py index ef060c845..bec63a973 100644 --- a/test/unit/test_log_recovery.py +++ b/test/unit/test_log_recovery.py @@ -3,9 +3,7 @@ import pytest from pathlib import Path import os import pwd - from autosubmit.job.job_common import Status -from autosubmitconfigparser.config.basicconfig import BasicConfig from autosubmitconfigparser.config.configcommon import AutosubmitConfig from autosubmit.job.job import Job @@ -93,7 +91,8 @@ def local(prepare_test): @pytest.fixture def as_conf(prepare_test): as_conf = AutosubmitConfig("t000") - as_conf.experiment_data = as_conf.load_config_file(as_conf.experiment_data, Path(prepare_test.join('platforms_t000.yml'))) + as_conf.experiment_data = as_conf.load_config_file(as_conf.experiment_data, + Path(prepare_test.join('platforms_t000.yml'))) as_conf.misc_data = {"AS_COMMAND": "run"} return as_conf @@ -142,7 +141,7 @@ def test_log_recovery_recover_log(prepare_test, local, mocker, as_conf): job.name = 'test_job' job.platform = local job.platform_name = 'local' - job.local_logs = ("t000.cmd.out.moved","t000.cmd.err.moved") + job.local_logs = ("t000.cmd.out.moved", "t000.cmd.err.moved") job._init_runtime_parameters() local.work_event.set() local.add_job_to_log_recover(job) @@ -151,7 +150,3 @@ def test_log_recovery_recover_log(prepare_test, local, mocker, as_conf): assert local.log_recovery_process.is_alive() is False assert Path(f"{prepare_test.strpath}/scratch/LOG_t000/t000.cmd.out.moved").exists() assert Path(f"{prepare_test.strpath}/scratch/LOG_t000/t000.cmd.err.moved").exists() - - - - -- GitLab From 0ede75adfe026d4777de0c4badc4c7158bcddc2b Mon Sep 17 00:00:00 2001 From: dbeltran Date: Wed, 4 Sep 2024 16:20:12 +0200 Subject: [PATCH 14/25] Fix events queue being reset in a platform failure --- autosubmit/autosubmit.py | 13 +++++++------ autosubmit/platforms/paramiko_platform.py | 1 - 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 1c7a7a09b..853dcd516 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -1907,7 +1907,7 @@ class Autosubmit: return exp_history @staticmethod def prepare_run(expid, notransitive=False, start_time=None, start_after=None, - run_only_members=None, recover = False, check_scripts= False): + run_only_members=None, recover = False, check_scripts= False, submitter=None): """ Prepare the run of the experiment. :param expid: a string with the experiment id. @@ -1916,6 +1916,7 @@ class Autosubmit: :param start_after: a string with the experiment id to start after. :param run_only_members: a string with the members to run. :param recover: a boolean to indicate if the experiment is recovering from a failure. + :param submitter: the actual loaded platforms if any :return: a tuple """ host = platform.node() @@ -1950,8 +1951,9 @@ class Autosubmit: # Loads the communication lib, always paramiko. # Paramiko is the only way to communicate with the remote machines. Previously we had also Saga. 
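# Aside (illustrative, not part of the diff): the change below reuses an already-loaded
# submitter on recovery retrials so the Platform objects, and with them their work and
# cleanup events and recovery queues, are not rebuilt. A reduced sketch of the idea;
# load_or_reuse_submitter and get_submitter are stand-in names.
def load_or_reuse_submitter(as_conf, get_submitter, submitter=None):
    if not submitter:                    # first call: build the platforms once
        submitter = get_submitter(as_conf)
        submitter.load_platforms(as_conf)
    return submitter                     # recovery passes the same object back in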
- submitter = Autosubmit._get_submitter(as_conf) - submitter.load_platforms(as_conf) + if not submitter: + submitter = Autosubmit._get_submitter(as_conf) + submitter.load_platforms(as_conf) # Tries to load the job_list from disk, discarding any changes in running time ( if recovery ). # Could also load a backup from previous iteration. # The submit ready functions will cancel all job submitted if one submitted in that iteration had issues, so it should be safe to recover from a backup without losing job ids @@ -2064,7 +2066,7 @@ class Autosubmit: Autosubmit.restore_platforms(platforms_to_test,as_conf=as_conf) return job_list, submitter , exp_history, host , as_conf, platforms_to_test, packages_persistence, False else: - return job_list, submitter , None, None, as_conf , platforms_to_test, packages_persistence, True + return job_list, submitter, None, None, as_conf, platforms_to_test, packages_persistence, True @staticmethod def get_iteration_info(as_conf,job_list): """ @@ -2259,7 +2261,7 @@ class Autosubmit: start_time, start_after, run_only_members, - recover=True) + recover=True, submitter = submitter) except AutosubmitError as e: recovery = False Log.result("Recover of job_list has fail {0}".format(e.message)) @@ -2312,7 +2314,6 @@ class Autosubmit: raise AutosubmitCritical(message, 7000) except BaseException as e: raise # If this happens, there is a bug in the code or an exception not-well caught - time.sleep(5) Log.result("No more jobs to run.") # search hint - finished run for job in job_list.get_completed_without_logs(): diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index bd4c96cde..a3be9202f 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -111,7 +111,6 @@ class ParamikoPlatform(Platform): if display is None: display = "localhost:0" self.local_x11_display = xlib_connect.get_display(display) - self.log_retrieval_process_active = False def test_connection(self,as_conf): """ Test if the connection is still alive, reconnect if not. 
-- GitLab From b81d570cb5013c310e8781311e2d4bbfd6b85efd Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 5 Sep 2024 09:32:20 +0200 Subject: [PATCH 15/25] Mock exists() so it works in the pipeline --- test/unit/test_log_recovery.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test/unit/test_log_recovery.py b/test/unit/test_log_recovery.py index bec63a973..de36b2807 100644 --- a/test/unit/test_log_recovery.py +++ b/test/unit/test_log_recovery.py @@ -89,8 +89,9 @@ def local(prepare_test): @pytest.fixture -def as_conf(prepare_test): - as_conf = AutosubmitConfig("t000") +def as_conf(prepare_test, mocker): + mocker.patch('pathlib.Path.exists', return_value=True) + as_conf = AutosubmitConfig("test") as_conf.experiment_data = as_conf.load_config_file(as_conf.experiment_data, Path(prepare_test.join('platforms_t000.yml'))) as_conf.misc_data = {"AS_COMMAND": "run"} @@ -103,6 +104,7 @@ def test_log_recovery_no_keep_alive(prepare_test, local, mocker, as_conf): assert local.log_recovery_process.is_alive() time.sleep(2) assert local.log_recovery_process.is_alive() is False + local.cleanup_event.set() def test_log_recovery_keep_alive(prepare_test, local, mocker, as_conf): @@ -117,6 +119,7 @@ def test_log_recovery_keep_alive(prepare_test, local, mocker, as_conf): assert local.log_recovery_process.is_alive() time.sleep(3) assert local.log_recovery_process.is_alive() is False + local.cleanup_event.set() def test_log_recovery_keep_alive_cleanup(prepare_test, local, mocker, as_conf): @@ -130,6 +133,7 @@ def test_log_recovery_keep_alive_cleanup(prepare_test, local, mocker, as_conf): local.cleanup_event.set() time.sleep(3) assert local.log_recovery_process.is_alive() is False + local.cleanup_event.set() def test_log_recovery_recover_log(prepare_test, local, mocker, as_conf): -- GitLab From 1051c8f59a4cf2d62d8b242f7f87a225a2a88e04 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 5 Sep 2024 14:14:17 +0200 Subject: [PATCH 16/25] reduced time.sleeps --- test/unit/test_database_regression.py | 258 ++++++++++++++++++++++++++ test/unit/test_log_recovery.py | 14 +- 2 files changed, 265 insertions(+), 7 deletions(-) create mode 100644 test/unit/test_database_regression.py diff --git a/test/unit/test_database_regression.py b/test/unit/test_database_regression.py new file mode 100644 index 000000000..42d6e1b9a --- /dev/null +++ b/test/unit/test_database_regression.py @@ -0,0 +1,258 @@ +import pytest +from pathlib import Path +from autosubmit.autosubmit import Autosubmit +import os +import pwd + +from test.unit.utils.common import create_database, init_expid + +def _get_script_files_path() -> Path: + return Path(__file__).resolve().parent / 'files' + +# Maybe this should be a regression test + +@pytest.fixture +def scheduler_tmpdir(tmpdir_factory): + folder = tmpdir_factory.mktemp(f'scheduler_tests') + os.mkdir(folder.join('scratch')) + os.mkdir(folder.join('scheduler_tmp_dir')) + file_stat = os.stat(f"{folder.strpath}") + file_owner_id = file_stat.st_uid + file_owner = pwd.getpwuid(file_owner_id).pw_name + folder.owner = file_owner + + # Write an autosubmitrc file in the temporary directory + autosubmitrc = folder.join('autosubmitrc') + autosubmitrc.write(f''' +[database] +path = {folder} +filename = tests.db + +[local] +path = {folder} + +[globallogs] +path = {folder} + +[structures] +path = {folder} + +[historicdb] +path = {folder} + +[historiclog] +path = {folder} + +[defaultstats] +path = {folder} + +''') + os.environ['AUTOSUBMIT_CONFIGURATION'] = str(folder.join('autosubmitrc')) + 
create_database(str(folder.join('autosubmitrc'))) + assert "tests.db" in [Path(f).name for f in folder.listdir()] + init_expid(str(folder.join('autosubmitrc')), platform='local', create=False) + assert "t000" in [Path(f).name for f in folder.listdir()] + return folder + +@pytest.fixture +def prepare_scheduler(scheduler_tmpdir): + # touch as_misc + platforms_path = Path(f"{scheduler_tmpdir.strpath}/t000/conf/platforms_t000.yml") + jobs_path = Path(f"{scheduler_tmpdir.strpath}/t000/conf/jobs_t000.yml") + # Add each platform to test + with platforms_path.open('w') as f: + f.write(f""" +PLATFORMS: + pytest-pjm: + type: pjm + host: 127.0.0.1 + user: {scheduler_tmpdir.owner} + project: whatever + scratch_dir: {scheduler_tmpdir}/scratch + MAX_WALLCLOCK: 48:00 + TEMP_DIR: '' + MAX_PROCESSORS: 99999 + queue: dummy + DISABLE_RECOVERY_THREADS: True + pytest-slurm: + type: slurm + host: 127.0.0.1 + user: {scheduler_tmpdir.owner} + project: whatever + scratch_dir: {scheduler_tmpdir}/scratch + QUEUE: gp_debug + ADD_PROJECT_TO_HOST: false + MAX_WALLCLOCK: 48:00 + TEMP_DIR: '' + MAX_PROCESSORS: 99999 + PROCESSORS_PER_NODE: 123 + DISABLE_RECOVERY_THREADS: True + pytest-ecaccess: + type: ecaccess + version: slurm + host: 127.0.0.1 + QUEUE: nf + EC_QUEUE: hpc + user: {scheduler_tmpdir.owner} + project: whatever + scratch_dir: {scheduler_tmpdir}/scratch + DISABLE_RECOVERY_THREADS: True + pytest-ps: + type: ps + host: 127.0.0.1 + user: {scheduler_tmpdir.owner} + project: whatever + scratch_dir: {scheduler_tmpdir}/scratch + DISABLE_RECOVERY_THREADS: True + """) + # add a job of each platform type + with jobs_path.open('w') as f: + f.write(f""" +JOBS: + job: + SCRIPT: | + echo "Hello World" + For: + PLATFORM: [ pytest-pjm , pytest-slurm, pytest-ecaccess, pytest-ps] + QUEUE: [dummy, gp_debug, nf, hpc] + NAME: [pjm, slurm, ecaccess, ps] + RUNNING: once + wallclock: 00:01 + PROCESSORS: 5 + nodes: 2 + threads: 40 + tasks: 90 + base: + SCRIPT: | + echo "Hello World" + For: + PLATFORM: [ pytest-pjm , pytest-slurm, pytest-ecaccess, pytest-ps] + QUEUE: [dummy, gp_debug, nf, hpc] + NAME: [pjm, slurm, ecaccess, ps] + RUNNING: once + wallclock: 00:01 + wrap: + SCRIPT: | + echo "Hello World, I'm a wrapper" + For: + NAME: [horizontal,vertical,vertical_horizontal,horizontal_vertical] + DEPENDENCIES: [wrap_horizontal-1,wrap_vertical-1,wrap_vertical_horizontal-1,wrap_horizontal_vertical-1] + QUEUE: gp_debug + PLATFORM: pytest-slurm + RUNNING: chunk + wallclock: 00:01 +Wrappers: + wrapper_h: + type: horizontal + jobs_in_wrapper: wrap_horizontal + wrapper_v: + type: vertical + jobs_in_wrapper: wrap_vertical + wrapper_vh: + type: vertical-horizontal + jobs_in_wrapper: wrap_vertical_horizontal + wrapper_hv: + type: horizontal-vertical + jobs_in_wrapper: wrap_horizontal_vertical +EXPERIMENT: + # List of start dates + DATELIST: '20000101' + # List of members. + MEMBERS: fc0 fc1 + # Unit of the chunk size. Can be hour, day, month, or year. + CHUNKSIZEUNIT: month + # Size of each chunk. + CHUNKSIZE: '4' + # Number of chunks of the experiment. + NUMCHUNKS: '2' + CHUNKINI: '' + # Calendar used for the experiment. Can be standard or noleap. 
+ CALENDAR: standard + """) + + + + + + expid_dir = Path(f"{scheduler_tmpdir.strpath}/scratch/whatever/{scheduler_tmpdir.owner}/t000") + dummy_dir = Path(f"{scheduler_tmpdir.strpath}/scratch/whatever/{scheduler_tmpdir.owner}/t000/dummy_dir") + real_data = Path(f"{scheduler_tmpdir.strpath}/scratch/whatever/{scheduler_tmpdir.owner}/t000/real_data") + # write some dummy data inside scratch dir + os.makedirs(expid_dir, exist_ok=True) + os.makedirs(dummy_dir, exist_ok=True) + os.makedirs(real_data, exist_ok=True) + + with open(dummy_dir.joinpath('dummy_file'), 'w') as f: + f.write('dummy data') + # create some dummy absolute symlinks in expid_dir to test migrate function + (real_data / 'dummy_symlink').symlink_to(dummy_dir / 'dummy_file') + return scheduler_tmpdir + +@pytest.fixture +def generate_cmds(prepare_scheduler): + init_expid(os.environ["AUTOSUBMIT_CONFIGURATION"], platform='local', expid='t000', create=True) + Autosubmit.inspect(expid='t000', check_wrapper=True, force=True, lst=None, filter_chunks=None, filter_status=None, filter_section=None) + return prepare_scheduler + +@pytest.mark.parametrize("scheduler, job_type", [ + ('pjm', 'SINGLE'), + ('slurm', 'SINGLE'), + ('ecaccess', 'SINGLE'), + ('ps', 'SINGLE'), + ('slurm', 'horizontal'), + ('slurm', 'vertical'), + ('slurm', 'horizontal_vertical'), + ('slurm', 'vertical_horizontal') +]) +def test_scheduler_job_types(scheduler, job_type, generate_cmds): + # Test code that uses scheduler and job_typedef test_default_parameters(scheduler: str, job_type: str, generate_cmds): + """ + Test that the default parameters are correctly set in the scheduler files. It is a comparasion line to line, so the new templates must match the same line order as the old ones. Additional default parameters must be filled in the files/base_{scheduler}.yml as well as any change in the order + :param generate_cmds: fixture that generates the templates. 
+ :param scheduler: Target scheduler + :param job_type: Wrapped or not + :return: + """ + + # Load the base file for each scheduler + scheduler = scheduler.upper() + job_type = job_type.upper() + expected_data = {} + if job_type == "SINGLE": + for base_f in _get_script_files_path().glob('base_*.cmd'): + if scheduler in base_f.stem.split('_')[1].upper(): + expected_data = Path(base_f).read_text() + break + else: + expected_data = (Path(_get_script_files_path()) / Path(f"base_{job_type.lower()}_{scheduler.lower()}.cmd")).read_text() + if not expected_data: + assert False, f"Could not find the expected data for {scheduler} and {job_type}" + + # Get the actual default parameters for the scheduler + if job_type == "SINGLE": + actual = Path(f"{generate_cmds.strpath}/t000/tmp/t000_BASE_{scheduler}.cmd").read_text() + else: + for asthread in Path(f"{generate_cmds.strpath}/t000/tmp").glob(f"*ASThread_WRAP_{job_type}_[0-9]*.cmd"): + actual = asthread.read_text() + break + else: + assert False, f"Could not find the actual data for {scheduler} and {job_type}" + # Remove all after # Autosubmit header + # ################### + # count number of lines in expected + expected_lines = expected_data.split('\n') + actual = actual.split('\n')[:len(expected_lines)] + actual = '\n'.join(actual) + # Compare line to line + for i, (line1, line2) in enumerate(zip(expected_data.split('\n'), actual.split('\n'))): + if "PJM -o" in line1 or "PJM -e" in line1 or "#SBATCH --output" in line1 or "#SBATCH --error" in line1 or "#SBATCH -J" in line1: # output error will be different + continue + elif "##" in line1 or "##" in line2: # comment line + continue + elif "header" in line1 or "header" in line2: # header line + continue + else: + assert line1 == line2 + + + diff --git a/test/unit/test_log_recovery.py b/test/unit/test_log_recovery.py index de36b2807..036cffb31 100644 --- a/test/unit/test_log_recovery.py +++ b/test/unit/test_log_recovery.py @@ -108,30 +108,30 @@ def test_log_recovery_no_keep_alive(prepare_test, local, mocker, as_conf): def test_log_recovery_keep_alive(prepare_test, local, mocker, as_conf): - mocker.patch('autosubmit.platforms.platform.max', return_value=3) + mocker.patch('autosubmit.platforms.platform.max', return_value=1) local.spawn_log_retrieval_process(as_conf) assert local.log_recovery_process.is_alive() local.work_event.set() - time.sleep(3) + time.sleep(1) assert local.log_recovery_process.is_alive() local.work_event.set() - time.sleep(3) + time.sleep(1) assert local.log_recovery_process.is_alive() - time.sleep(3) + time.sleep(1) assert local.log_recovery_process.is_alive() is False local.cleanup_event.set() def test_log_recovery_keep_alive_cleanup(prepare_test, local, mocker, as_conf): - mocker.patch('autosubmit.platforms.platform.max', return_value=3) + mocker.patch('autosubmit.platforms.platform.max', return_value=1) local.spawn_log_retrieval_process(as_conf) assert local.log_recovery_process.is_alive() local.work_event.set() - time.sleep(3) + time.sleep(1) assert local.log_recovery_process.is_alive() local.work_event.set() local.cleanup_event.set() - time.sleep(3) + time.sleep(1) assert local.log_recovery_process.is_alive() is False local.cleanup_event.set() -- GitLab From 95453aa600fcd3106a0dbb583e0a4a664ee9978d Mon Sep 17 00:00:00 2001 From: dbeltran Date: Tue, 10 Sep 2024 16:07:05 +0200 Subject: [PATCH 17/25] Added database test, fixed some issues --- autosubmit/job/job.py | 2 - autosubmit/job/job_packages.py | 1 - autosubmit/platforms/platform.py | 2 +- 
test/unit/test_database_regression.py | 355 ++++++++++++++------------ 4 files changed, 197 insertions(+), 163 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 3462cf865..8431417cf 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -1268,8 +1268,6 @@ class Job(object): if not failed_file: Log.printlog("Job {0} is FAILED. Checking completed files to confirm the failure...".format( self.name), 3000) - self._platform.get_completed_files( - self.name, wrapper_failed=self.packed) self.check_completion() if self.status == Status.COMPLETED: Log.result("Job {0} is COMPLETED", self.name) diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 309881657..1d2c567aa 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -263,7 +263,6 @@ class JobPackageSimple(JobPackageBase): continue Log.info("{0} submitted", job.name) job.status = Status.SUBMITTED - job.write_submit_time(hold=self.hold) class JobPackageSimpleWrapped(JobPackageSimple): diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index c15422fa0..515ad6b60 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -842,8 +842,8 @@ class Platform(object): if not self.log_retrieval_process_active and ( as_conf is None or str(as_conf.platforms_data.get(self.name, {}).get('DISABLE_RECOVERY_THREADS', "false")).lower() == "false"): - self.log_retrieval_process_active = True if as_conf and as_conf.misc_data.get("AS_COMMAND", "").lower() == "run": + self.log_retrieval_process_active = True Platform.update_workers(self.work_event) self.log_recovery_process = Process(target=self.recover_platform_job_logs, args=(), name=f"{self.name}_log_recovery") diff --git a/test/unit/test_database_regression.py b/test/unit/test_database_regression.py index 42d6e1b9a..0b550f8a4 100644 --- a/test/unit/test_database_regression.py +++ b/test/unit/test_database_regression.py @@ -1,21 +1,27 @@ +import shutil + import pytest from pathlib import Path from autosubmit.autosubmit import Autosubmit +from log.log import Log import os import pwd from test.unit.utils.common import create_database, init_expid +import sqlite3 + def _get_script_files_path() -> Path: return Path(__file__).resolve().parent / 'files' + # Maybe this should be a regression test @pytest.fixture -def scheduler_tmpdir(tmpdir_factory): - folder = tmpdir_factory.mktemp(f'scheduler_tests') +def db_tmpdir(tmpdir_factory): + folder = tmpdir_factory.mktemp(f'db_tests') os.mkdir(folder.join('scratch')) - os.mkdir(folder.join('scheduler_tmp_dir')) + os.mkdir(folder.join('db_tmp_dir')) file_stat = os.stat(f"{folder.strpath}") file_owner_id = file_stat.st_uid file_owner = pwd.getpwuid(file_owner_id).pw_name @@ -54,129 +60,74 @@ path = {folder} assert "t000" in [Path(f).name for f in folder.listdir()] return folder + @pytest.fixture -def prepare_scheduler(scheduler_tmpdir): +def prepare_db(db_tmpdir): # touch as_misc - platforms_path = Path(f"{scheduler_tmpdir.strpath}/t000/conf/platforms_t000.yml") - jobs_path = Path(f"{scheduler_tmpdir.strpath}/t000/conf/jobs_t000.yml") + # remove files under t000/conf + conf_folder = Path(f"{db_tmpdir.strpath}/t000/conf") + shutil.rmtree(conf_folder) + os.makedirs(conf_folder) + platforms_path = Path(f"{db_tmpdir.strpath}/t000/conf/platforms.yml") + main_path = Path(f"{db_tmpdir.strpath}/t000/conf/main.yml") # Add each platform to test with platforms_path.open('w') as f: f.write(f""" PLATFORMS: - pytest-pjm: + dummy: type: pjm 
host: 127.0.0.1 - user: {scheduler_tmpdir.owner} + user: {db_tmpdir.owner} project: whatever - scratch_dir: {scheduler_tmpdir}/scratch + scratch_dir: {db_tmpdir}/scratch MAX_WALLCLOCK: 48:00 TEMP_DIR: '' MAX_PROCESSORS: 99999 queue: dummy DISABLE_RECOVERY_THREADS: True - pytest-slurm: - type: slurm - host: 127.0.0.1 - user: {scheduler_tmpdir.owner} - project: whatever - scratch_dir: {scheduler_tmpdir}/scratch - QUEUE: gp_debug - ADD_PROJECT_TO_HOST: false - MAX_WALLCLOCK: 48:00 - TEMP_DIR: '' - MAX_PROCESSORS: 99999 - PROCESSORS_PER_NODE: 123 - DISABLE_RECOVERY_THREADS: True - pytest-ecaccess: - type: ecaccess - version: slurm - host: 127.0.0.1 - QUEUE: nf - EC_QUEUE: hpc - user: {scheduler_tmpdir.owner} - project: whatever - scratch_dir: {scheduler_tmpdir}/scratch - DISABLE_RECOVERY_THREADS: True - pytest-ps: - type: ps - host: 127.0.0.1 - user: {scheduler_tmpdir.owner} - project: whatever - scratch_dir: {scheduler_tmpdir}/scratch - DISABLE_RECOVERY_THREADS: True """) - # add a job of each platform type - with jobs_path.open('w') as f: + + with main_path.open('w') as f: f.write(f""" -JOBS: - job: - SCRIPT: | - echo "Hello World" - For: - PLATFORM: [ pytest-pjm , pytest-slurm, pytest-ecaccess, pytest-ps] - QUEUE: [dummy, gp_debug, nf, hpc] - NAME: [pjm, slurm, ecaccess, ps] - RUNNING: once - wallclock: 00:01 - PROCESSORS: 5 - nodes: 2 - threads: 40 - tasks: 90 - base: - SCRIPT: | - echo "Hello World" - For: - PLATFORM: [ pytest-pjm , pytest-slurm, pytest-ecaccess, pytest-ps] - QUEUE: [dummy, gp_debug, nf, hpc] - NAME: [pjm, slurm, ecaccess, ps] - RUNNING: once - wallclock: 00:01 - wrap: - SCRIPT: | - echo "Hello World, I'm a wrapper" - For: - NAME: [horizontal,vertical,vertical_horizontal,horizontal_vertical] - DEPENDENCIES: [wrap_horizontal-1,wrap_vertical-1,wrap_vertical_horizontal-1,wrap_horizontal_vertical-1] - QUEUE: gp_debug - PLATFORM: pytest-slurm - RUNNING: chunk - wallclock: 00:01 -Wrappers: - wrapper_h: - type: horizontal - jobs_in_wrapper: wrap_horizontal - wrapper_v: - type: vertical - jobs_in_wrapper: wrap_vertical - wrapper_vh: - type: vertical-horizontal - jobs_in_wrapper: wrap_vertical_horizontal - wrapper_hv: - type: horizontal-vertical - jobs_in_wrapper: wrap_horizontal_vertical EXPERIMENT: # List of start dates DATELIST: '20000101' # List of members. - MEMBERS: fc0 fc1 + MEMBERS: fc0 # Unit of the chunk size. Can be hour, day, month, or year. CHUNKSIZEUNIT: month # Size of each chunk. CHUNKSIZE: '4' # Number of chunks of the experiment. - NUMCHUNKS: '2' + NUMCHUNKS: '3' CHUNKINI: '' # Calendar used for the experiment. Can be standard or noleap. CALENDAR: standard - """) - - - - - expid_dir = Path(f"{scheduler_tmpdir.strpath}/scratch/whatever/{scheduler_tmpdir.owner}/t000") - dummy_dir = Path(f"{scheduler_tmpdir.strpath}/scratch/whatever/{scheduler_tmpdir.owner}/t000/dummy_dir") - real_data = Path(f"{scheduler_tmpdir.strpath}/scratch/whatever/{scheduler_tmpdir.owner}/t000/real_data") +CONFIG: + # Current version of Autosubmit. + AUTOSUBMIT_VERSION: "" + # Total number of jobs in the workflow. + TOTALJOBS: 20 + # Maximum number of jobs permitted in the waiting status. + MAXWAITINGJOBS: 20 + SAFETYSLEEPTIME: 1 +DEFAULT: + # Job experiment ID. + EXPID: "t000" + # Default HPC platform name. + HPCARCH: "local" + #hint: use %PROJDIR% to point to the project folder (where the project is cloned) + # Custom configuration location. +project: + # Type of the project. + PROJECT_TYPE: None + # Folder to hold the project sources. 
+ PROJECT_DESTINATION: local_project +""") + expid_dir = Path(f"{db_tmpdir.strpath}/scratch/whatever/{db_tmpdir.owner}/t000") + dummy_dir = Path(f"{db_tmpdir.strpath}/scratch/whatever/{db_tmpdir.owner}/t000/dummy_dir") + real_data = Path(f"{db_tmpdir.strpath}/scratch/whatever/{db_tmpdir.owner}/t000/real_data") # write some dummy data inside scratch dir os.makedirs(expid_dir, exist_ok=True) os.makedirs(dummy_dir, exist_ok=True) @@ -186,73 +137,159 @@ EXPERIMENT: f.write('dummy data') # create some dummy absolute symlinks in expid_dir to test migrate function (real_data / 'dummy_symlink').symlink_to(dummy_dir / 'dummy_file') - return scheduler_tmpdir + return db_tmpdir + @pytest.fixture -def generate_cmds(prepare_scheduler): - init_expid(os.environ["AUTOSUBMIT_CONFIGURATION"], platform='local', expid='t000', create=True) - Autosubmit.inspect(expid='t000', check_wrapper=True, force=True, lst=None, filter_chunks=None, filter_status=None, filter_section=None) - return prepare_scheduler +def success_jobs_file(db_tmpdir): + jobs_path = Path(f"{db_tmpdir.strpath}/t000/conf/jobs.yml") + with jobs_path.open('w') as f: + f.write(f""" + JOBS: + job: + SCRIPT: | + echo "Hello World" + PLATFORM: local + RUNNING: chunk + wallclock: 00:01 + """) + + +@pytest.fixture +def failure_jobs_file(db_tmpdir): + jobs_path = Path(f"{db_tmpdir.strpath}/t000/conf/jobs.yml") + with jobs_path.open('w') as f: + f.write(f""" + JOBS: + job: + SCRIPT: | + echo "Hello World" + exit 1 + PLATFORM: local + RUNNING: chunk + wallclock: 00:01 + retrials: 1 + """) + -@pytest.mark.parametrize("scheduler, job_type", [ - ('pjm', 'SINGLE'), - ('slurm', 'SINGLE'), - ('ecaccess', 'SINGLE'), - ('ps', 'SINGLE'), - ('slurm', 'horizontal'), - ('slurm', 'vertical'), - ('slurm', 'horizontal_vertical'), - ('slurm', 'vertical_horizontal') -]) -def test_scheduler_job_types(scheduler, job_type, generate_cmds): - # Test code that uses scheduler and job_typedef test_default_parameters(scheduler: str, job_type: str, generate_cmds): - """ - Test that the default parameters are correctly set in the scheduler files. It is a comparasion line to line, so the new templates must match the same line order as the old ones. Additional default parameters must be filled in the files/base_{scheduler}.yml as well as any change in the order - :param generate_cmds: fixture that generates the templates. 
- :param scheduler: Target scheduler - :param job_type: Wrapped or not - :return: - """ +@pytest.fixture +def run_experiment_success(prepare_db, db_tmpdir): + init_expid(os.environ["AUTOSUBMIT_CONFIGURATION"], platform='local', expid='t000', create=True) + # job_list, submitter, exp_history, host, as_conf, platforms_to_test, packages_persistence, _ = Autosubmit.prepare_run("t000") + as_misc = Path(f"{db_tmpdir.strpath}/t000/conf/as_misc.yml") + with as_misc.open('w') as f: + f.write(f""" +AS_MISC: True +ASMISC: + COMMAND: run +AS_COMMAND: run + """) + # Completed + Autosubmit.run_experiment(expid='t000') + # Access to the job_historical.db + job_data = Path(f"{db_tmpdir.strpath}/job_data_t000.db") + autosubmit_db = Path(f"{db_tmpdir.strpath}/tests.db") + assert job_data.exists() + assert autosubmit_db.exists() + return prepare_db - # Load the base file for each scheduler - scheduler = scheduler.upper() - job_type = job_type.upper() - expected_data = {} - if job_type == "SINGLE": - for base_f in _get_script_files_path().glob('base_*.cmd'): - if scheduler in base_f.stem.split('_')[1].upper(): - expected_data = Path(base_f).read_text() - break - else: - expected_data = (Path(_get_script_files_path()) / Path(f"base_{job_type.lower()}_{scheduler.lower()}.cmd")).read_text() - if not expected_data: - assert False, f"Could not find the expected data for {scheduler} and {job_type}" - # Get the actual default parameters for the scheduler - if job_type == "SINGLE": - actual = Path(f"{generate_cmds.strpath}/t000/tmp/t000_BASE_{scheduler}.cmd").read_text() - else: - for asthread in Path(f"{generate_cmds.strpath}/t000/tmp").glob(f"*ASThread_WRAP_{job_type}_[0-9]*.cmd"): - actual = asthread.read_text() - break - else: - assert False, f"Could not find the actual data for {scheduler} and {job_type}" - # Remove all after # Autosubmit header - # ################### - # count number of lines in expected - expected_lines = expected_data.split('\n') - actual = actual.split('\n')[:len(expected_lines)] - actual = '\n'.join(actual) - # Compare line to line - for i, (line1, line2) in enumerate(zip(expected_data.split('\n'), actual.split('\n'))): - if "PJM -o" in line1 or "PJM -e" in line1 or "#SBATCH --output" in line1 or "#SBATCH --error" in line1 or "#SBATCH -J" in line1: # output error will be different - continue - elif "##" in line1 or "##" in line2: # comment line - continue - elif "header" in line1 or "header" in line2: # header line - continue - else: - assert line1 == line2 +@pytest.fixture +def run_experiment_failure(prepare_db, db_tmpdir, failure_jobs_file, mocker): + init_expid(os.environ["AUTOSUBMIT_CONFIGURATION"], platform='local', expid='t000', create=True) + # job_list, submitter, exp_history, host, as_conf, platforms_to_test, packages_persistence, _ = Autosubmit.prepare_run("t000") + as_misc = Path(f"{db_tmpdir.strpath}/t000/conf/as_misc.yml") + with as_misc.open('w') as f: + f.write(f""" +AS_MISC: True +ASMISC: + COMMAND: run +AS_COMMAND: run + """) + # Completed + # mock platform.localplatform.check_exists + with mocker.patch('platforms.localplatform.LocalPlatform.check_exists') as check_exists: + check_exists.return_value = False + Autosubmit.run_experiment(expid='t000') + # Access to the job_historical.db + job_data = Path(f"{db_tmpdir.strpath}/job_data_t000.db") + autosubmit_db = Path(f"{db_tmpdir.strpath}/tests.db") + assert job_data.exists() + assert autosubmit_db.exists() + return prepare_db +def test_db_success(run_experiment_success, db_tmpdir): + job_data = 
Path(f"{db_tmpdir.strpath}/job_data_t000.db") + conn = sqlite3.connect(job_data) + conn.row_factory = sqlite3.Row + c = conn.cursor() + c.execute("SELECT * FROM job_data") + rows = c.fetchall() + # Convert rows to a list of dictionaries + rows_as_dicts = [dict(row) for row in rows] + # Tune the print so it is more readable, so it is easier to debug in case of failure + column_names = rows_as_dicts[0].keys() if rows_as_dicts else [] + column_widths = [max(len(str(row[col])) for row in rows_as_dicts + [dict(zip(column_names, column_names))]) for col + in column_names] + header = " | ".join(f"{name:<{width}}" for name, width in zip(column_names, column_widths)) + print(f"\n{header}") + print("-" * len(header)) + # Print the rows + for row_dict in rows_as_dicts: + print(" | ".join(f"{str(row_dict[col]):<{width}}" for col, width in zip(column_names, column_widths))) + # Check that all fields contain data, except extra_data, children, and platform_output + # Check that submit, start and finish are > 0 + assert row_dict["submit"] > 0 and row_dict["finish"] != 1970010101 + assert row_dict["start"] > 0 and row_dict["finish"] != 1970010101 + assert row_dict["finish"] > 0 and row_dict["finish"] != 1970010101 + assert row_dict["status"] == "COMPLETED" + for key in [key for key in row_dict.keys() if + key not in ["status", "finish", "submit", "start", "extra_data", "children", "platform_output"]]: + assert str(row_dict[key]) != "" + # Check that the job_data table has the expected number of entries + c.execute("SELECT job_name, COUNT(*) as count FROM job_data GROUP BY job_name") + count_rows = c.fetchall() + for row in count_rows: + assert row["count"] == 1 + # Close the cursor and connection + c.close() + conn.close() +# Need to improve the performance of this test +# def test_db_failure(run_experiment_failure, db_tmpdir): +# job_data = Path(f"{db_tmpdir.strpath}/job_data_t000.db") +# conn = sqlite3.connect(job_data) +# conn.row_factory = sqlite3.Row +# c = conn.cursor() +# c.execute("SELECT * FROM job_data") +# rows = c.fetchall() +# # Convert rows to a list of dictionaries +# rows_as_dicts = [dict(row) for row in rows] +# # Tune the print so it is more readable, so it is easier to debug in case of failure +# column_names = rows_as_dicts[0].keys() if rows_as_dicts else [] +# column_widths = [max(len(str(row[col])) for row in rows_as_dicts + [dict(zip(column_names, column_names))]) for col +# in column_names] +# header = " | ".join(f"{name:<{width}}" for name, width in zip(column_names, column_widths)) +# print(f"\n{header}") +# print("-" * len(header)) +# # Print the rows +# for row_dict in rows_as_dicts: +# print(" | ".join(f"{str(row_dict[col]):<{width}}" for col, width in zip(column_names, column_widths))) +# # Check that all fields contain data, except extra_data, children, and platform_output +# # Check that submit, start and finish are > 0 +# # assert row_dict["submit"] > 0 and row_dict["finish"] != 1970010101 +# # assert row_dict["start"] > 0 and row_dict["finish"] != 1970010101 +# # assert row_dict["finish"] > 0 and row_dict["finish"] != 1970010101 +# # assert row_dict["status"] == "FAILED" +# # for key in [key for key in row_dict.keys() if +# # key not in ["status", "finish", "submit", "start", "extra_data", "children", "platform_output"]]: +# # assert str(row_dict[key]) != "" +# # Check that the job_data table has the expected number of entries +# c.execute("SELECT job_name, COUNT(*) as count FROM job_data GROUP BY job_name") +# count_rows = c.fetchall() +# for row in count_rows: +# assert 
row["count"] == 1 +# # Close the cursor and connection +# c.close() +# conn.close() -- GitLab From bf382aa4300ffa72a5c62d10e23dc41ef00d5bcf Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 12 Sep 2024 17:27:54 +0200 Subject: [PATCH 18/25] Changes to stat, db and retrials (WIP) --- autosubmit/job/job.py | 17 +++--- autosubmit/job/job_common.py | 4 +- autosubmit/job/job_list.py | 2 + autosubmit/job/job_packages.py | 8 +-- autosubmit/platforms/locplatform.py | 42 +++++--------- autosubmit/platforms/platform.py | 42 ++++++-------- test/unit/test_database_regression.py | 80 +++++++++++++-------------- 7 files changed, 87 insertions(+), 108 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 8431417cf..594bf28a6 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -208,6 +208,7 @@ class Job(object): self._local_logs = ('', '') self._remote_logs = ('', '') self.script_name = self.name + ".cmd" + self.stat_file = self.script_name[:-4] + "_STAT" self.status = status self.prev_status = status self.old_status = self.status @@ -272,6 +273,7 @@ class Job(object): self.log_retrieved = False self.start_time_placeholder = time.time() self.processors_per_node = "" + self.stat_file = self.script_name[:-4] + "_STAT" @property @@ -1143,7 +1145,7 @@ class Job(object): log_retrieved = False self.log_retrieved = log_retrieved - def write_stats(self, stat_file, platform): + def write_stats(self, platform): # Update the logs with Autosubmit Job ID Brand try: for local_log in self.local_logs: @@ -1154,11 +1156,11 @@ class Job(object): # write stats if self.wrapper_type == "vertical": # Disable AS retrials for vertical wrappers to use internal ones for i in range(0, int(self.retrials + 1)): - if self.platform.get_stat_file(self.name, stat_file, count=i): + if self.platform.get_stat_file(self, count=i): self.write_vertical_time(i) self.inc_fail_count() else: - self.platform.get_stat_file(self.name, stat_file) + self.platform.get_stat_file(self) self.write_start_time(from_stat_file=True) self.write_end_time(self.status == Status.COMPLETED) @@ -1172,10 +1174,8 @@ class Job(object): backup_logname = copy.copy(self.local_logs) if self.wrapper_type == "vertical": - stat_file = self.script_name[:-4] + "_STAT_" self.retrieve_internal_retrials_logfiles(platform) else: - stat_file = self.script_name[:-4] + "_STAT" self.retrieve_external_retrials_logfiles(platform) if not self.log_retrieved: @@ -1185,7 +1185,7 @@ class Job(object): else: Log.printlog("Failed to retrieve logs for job {0}".format(self.name), 6000) else: - self.write_stats(stat_file, platform) + self.write_stats(platform) def parse_time(self,wallclock): @@ -1266,9 +1266,6 @@ class Job(object): Log.result("Job {0} is COMPLETED", self.name) elif self.status == Status.FAILED: if not failed_file: - Log.printlog("Job {0} is FAILED. 
Checking completed files to confirm the failure...".format( - self.name), 3000) - self.check_completion() if self.status == Status.COMPLETED: Log.result("Job {0} is COMPLETED", self.name) else: @@ -1886,6 +1883,7 @@ class Job(object): self.shape = as_conf.jobs_data[self.section].get("SHAPE", "") self.script = as_conf.jobs_data[self.section].get("SCRIPT", "") self.x11 = False if str(as_conf.jobs_data[self.section].get("X11", False)).lower() == "false" else True + self.stat_file = f"{self.script_name[:-4]}_STAT_{self.fail_count}" if self.checkpoint: # To activate placeholder sustitution per in the template parameters["AS_CHECKPOINT"] = self.checkpoint parameters['JOBNAME'] = self.name @@ -2507,6 +2505,7 @@ class WrapperJob(Job): self.hold = hold self.inner_jobs_running = list() + def _queuing_reason_cancel(self, reason): try: if len(reason.split('(', 1)) > 1: diff --git a/autosubmit/job/job_common.py b/autosubmit/job/job_common.py index 92dc1b1bd..5897ec0fa 100644 --- a/autosubmit/job/job_common.py +++ b/autosubmit/job/job_common.py @@ -127,7 +127,7 @@ class StatisticsSnippetBash: set -xuve job_name_ptrn='%CURRENT_LOGDIR%/%JOBNAME%' - echo $(date +%s) > ${job_name_ptrn}_STAT + echo $(date +%s) > ${job_name_ptrn}_STAT_%FAIL_COUNT% ################### # AS CHECKPOINT FUNCTION @@ -154,7 +154,7 @@ class StatisticsSnippetBash: # Autosubmit tailer ################### set -xuve - echo $(date +%s) >> ${job_name_ptrn}_STAT + echo $(date +%s) >> ${job_name_ptrn}_STAT_%FAIL_COUNT% touch ${job_name_ptrn}_COMPLETED exit 0 diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 5cc9383d4..fa92242af 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -2594,6 +2594,8 @@ class JobList(object): if hasattr(job, "x11") and job.x11: # X11 has it log writted in the run.out file. 
No need to check for log files as there are none job.updated_log = True return + Log.result(f"Sending job {job.stat_file} to the log recovery thread") + log_recovered = self.check_if_log_is_recovered(job) if log_recovered: job.local_logs = (log_recovered.name, log_recovered.name[:-4] + ".err") # we only want the last one diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 1d2c567aa..f99654fe8 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -256,7 +256,7 @@ class JobPackageSimple(JobPackageBase): os.remove(log_completed) if os.path.exists(log_stat): os.remove(log_stat) - self.platform.remove_stat_file(job.name) + self.platform.remove_stat_file(job) self.platform.remove_completed_file(job.name) job.id = self.platform.submit_job(job, job_scripts[job.name], hold=hold, export = self.export) if job.id is None or not job.id: @@ -342,7 +342,7 @@ class JobPackageArray(JobPackageBase): def _do_submission(self, job_scripts=None, hold=False): for job in self.jobs: - self.platform.remove_stat_file(job.name) + self.platform.remove_stat_file(job) self.platform.remove_completed_file(job.name) package_id = self.platform.submit_job(None, self._common_script, hold=hold, export = self.export) @@ -578,7 +578,7 @@ class JobPackageThread(JobPackageBase): else: for job in self.jobs: - self.platform.remove_stat_file(job.name) + self.platform.remove_stat_file(job) self.platform.remove_completed_file(job.name) if hold: job.hold = hold @@ -658,7 +658,7 @@ class JobPackageThreadWrapped(JobPackageThread): def _do_submission(self, job_scripts=None, hold=False): for job in self.jobs: - self.platform.remove_stat_file(job.name) + self.platform.remove_stat_file(job) self.platform.remove_completed_file(job.name) if hold: job.hold = hold diff --git a/autosubmit/platforms/locplatform.py b/autosubmit/platforms/locplatform.py index 8e884bde9..f6356ab26 100644 --- a/autosubmit/platforms/locplatform.py +++ b/autosubmit/platforms/locplatform.py @@ -180,39 +180,25 @@ class LocalPlatform(ParamikoPlatform): return True # Moves .err .out - def check_file_exists(self, src, wrapper_failed=False, sleeptime=5, max_retries=3, first=True): + def check_file_exists(self, src, wrapper_failed=False, sleeptime=None, max_retries=None, first=None): """ - Moves a file on the platform + Checks if a file exists in the platform :param src: source name :type src: str - :param: wrapper_failed: if True, the wrapper failed. + :param wrapper_failed: if the wrapper failed :type wrapper_failed: bool - + :param sleeptime: time to sleep + :type sleeptime: int + :param max_retries: maximum number of retries + :type max_retries: int + :param first: if it is the first time + :type first: bool + :return: True if the file exists, False otherwise + :rtype: bool """ - file_exist = False - remote_path = os.path.join(self.get_files_path(), src) - retries = 0 - # Not first is meant for vertical_wrappers. There you have to download STAT_{MAX_LOGS} then STAT_{MAX_LOGS-1} and so on - if not first: - max_retries = 1 - sleeptime = 0 - while not file_exist and retries < max_retries: - try: - file_exist = os.path.isfile(os.path.join(self.get_files_path(),src)) - if not file_exist: # File doesn't exist, retry in sleep-time - if first: - Log.debug("{2} File does not exist.. 
waiting {0}s for a new retry (retries left: {1})", sleeptime, - max_retries - retries, remote_path) - if not wrapper_failed: - sleep(sleeptime) - sleeptime = sleeptime + 5 - retries = retries + 1 - else: - retries = 9999 - except BaseException as e: # Unrecoverable error - Log.printlog("File does not exist, logs {0} {1}".format(self.get_files_path(),src),6001) - file_exist = False # won't exist - retries = 999 # no more retries + file_exist = os.path.isfile(os.path.join(self.get_files_path(), src)) + if not file_exist: + Log.debug(f"File {self.get_files_path()}/{src} does not exist") return file_exist def delete_file(self, filename,del_cmd = False): diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index 515ad6b60..d67ac6af0 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -25,8 +25,9 @@ class UniqueQueue( super().__init__(maxsize, ctx=multiprocessing.get_context()) def put(self, job, block=True, timeout=None): - if job.name not in self.all_items: - self.all_items.add(job.name) + name_with_retrial = job.name+str(job.fail_count) + if name_with_retrial not in self.all_items: + self.all_items.add(name_with_retrial) Queue.put(self, job, block, timeout) @@ -611,18 +612,14 @@ class Platform(object): else: return False - def remove_stat_file(self, job_name): + def remove_stat_file(self, job): """ Removes *STAT* files from remote - - :param job_name: name of job to check - :type job_name: str - :return: True if successful, False otherwise - :rtype: bool + param job: job to check + type job: Job """ - filename = job_name + '_STAT' - if self.delete_file(filename): - Log.debug('{0}_STAT have been removed', job_name) + if self.delete_file(job.stat_file): + Log.debug(f"{job.stat_file} have been removed") return True return False @@ -668,30 +665,21 @@ class Platform(object): def check_file_exists(self, src, wrapper_failed=False, sleeptime=5, max_retries=3, first=True): return True - def get_stat_file(self, job_name, retries=0, count=-1): - """ - Copies *STAT* files from remote to local + def get_stat_file(self, job, count=-1): - :param retries: number of intents to get the completed files - :type retries: int - :param job_name: name of job to check - :type job_name: str - :return: True if successful, False otherwise - :rtype: bool - """ if count == -1: # No internal retrials - filename = job_name + '_STAT' + filename = job.stat_file else: - filename = job_name + '_STAT_{0}'.format(str(count)) + filename = job.name + '_STAT_{0}'.format(str(count)) stat_local_path = os.path.join( self.config.get("LOCAL_ROOT_DIR"), self.expid, self.config.get("LOCAL_TMP_DIR"), filename) if os.path.exists(stat_local_path): os.remove(stat_local_path) if self.check_file_exists(filename): if self.get_file(filename, True): - Log.debug('{0}_STAT file have been transferred', job_name) + Log.debug('{0}_STAT file have been transferred', job.name) return True - Log.debug('{0}_STAT file not found', job_name) + Log.debug('{0}_STAT file not found', job.name) return False @autosubmit_parameter(name='current_logdir') @@ -870,6 +858,10 @@ class Platform(object): keep_working = True if not self.recovery_queue.empty() or self.cleanup_event.is_set(): break + interruptible_additional_timeout = 60 + while not self.work_event.is_set() and not self.cleanup_event.is_set() and interruptible_additional_timeout > 0: + time.sleep(1) + interruptible_additional_timeout -= 1 self.work_event.clear() return keep_working diff --git a/test/unit/test_database_regression.py 
b/test/unit/test_database_regression.py index 0b550f8a4..8d17f0e12 100644 --- a/test/unit/test_database_regression.py +++ b/test/unit/test_database_regression.py @@ -6,6 +6,7 @@ from autosubmit.autosubmit import Autosubmit from log.log import Log import os import pwd +from autosubmit.platforms.locplatform import LocalPlatform from test.unit.utils.common import create_database, init_expid import sqlite3 @@ -173,7 +174,7 @@ def failure_jobs_file(db_tmpdir): @pytest.fixture -def run_experiment_success(prepare_db, db_tmpdir): +def run_experiment_success(prepare_db, db_tmpdir, success_jobs_file): init_expid(os.environ["AUTOSUBMIT_CONFIGURATION"], platform='local', expid='t000', create=True) # job_list, submitter, exp_history, host, as_conf, platforms_to_test, packages_persistence, _ = Autosubmit.prepare_run("t000") as_misc = Path(f"{db_tmpdir.strpath}/t000/conf/as_misc.yml") @@ -208,8 +209,7 @@ AS_COMMAND: run """) # Completed # mock platform.localplatform.check_exists - with mocker.patch('platforms.localplatform.LocalPlatform.check_exists') as check_exists: - check_exists.return_value = False + with mocker.patch('autosubmit.platforms.platform.Platform.get_completed_files', return_value=False): Autosubmit.run_experiment(expid='t000') # Access to the job_historical.db job_data = Path(f"{db_tmpdir.strpath}/job_data_t000.db") @@ -256,40 +256,40 @@ def test_db_success(run_experiment_success, db_tmpdir): c.close() conn.close() -# Need to improve the performance of this test -# def test_db_failure(run_experiment_failure, db_tmpdir): -# job_data = Path(f"{db_tmpdir.strpath}/job_data_t000.db") -# conn = sqlite3.connect(job_data) -# conn.row_factory = sqlite3.Row -# c = conn.cursor() -# c.execute("SELECT * FROM job_data") -# rows = c.fetchall() -# # Convert rows to a list of dictionaries -# rows_as_dicts = [dict(row) for row in rows] -# # Tune the print so it is more readable, so it is easier to debug in case of failure -# column_names = rows_as_dicts[0].keys() if rows_as_dicts else [] -# column_widths = [max(len(str(row[col])) for row in rows_as_dicts + [dict(zip(column_names, column_names))]) for col -# in column_names] -# header = " | ".join(f"{name:<{width}}" for name, width in zip(column_names, column_widths)) -# print(f"\n{header}") -# print("-" * len(header)) -# # Print the rows -# for row_dict in rows_as_dicts: -# print(" | ".join(f"{str(row_dict[col]):<{width}}" for col, width in zip(column_names, column_widths))) -# # Check that all fields contain data, except extra_data, children, and platform_output -# # Check that submit, start and finish are > 0 -# # assert row_dict["submit"] > 0 and row_dict["finish"] != 1970010101 -# # assert row_dict["start"] > 0 and row_dict["finish"] != 1970010101 -# # assert row_dict["finish"] > 0 and row_dict["finish"] != 1970010101 -# # assert row_dict["status"] == "FAILED" -# # for key in [key for key in row_dict.keys() if -# # key not in ["status", "finish", "submit", "start", "extra_data", "children", "platform_output"]]: -# # assert str(row_dict[key]) != "" -# # Check that the job_data table has the expected number of entries -# c.execute("SELECT job_name, COUNT(*) as count FROM job_data GROUP BY job_name") -# count_rows = c.fetchall() -# for row in count_rows: -# assert row["count"] == 1 -# # Close the cursor and connection -# c.close() -# conn.close() +#Need to improve the performance of this test +def test_db_failure(run_experiment_failure, db_tmpdir): + job_data = Path(f"{db_tmpdir.strpath}/job_data_t000.db") + conn = sqlite3.connect(job_data) + 
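The assertions that follow open job_data with sqlite3.Row so columns can be read by name, then use a GROUP BY to check that one row was written per retrial. A condensed, standalone version of that query pattern against an in-memory table (the real job_data schema has many more columns):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.row_factory = sqlite3.Row  # rows behave like dicts keyed by column name
    conn.execute("CREATE TABLE job_data (job_name TEXT, submit INT, start INT, finish INT, status TEXT)")
    retrials = [
        ("t000_20000101_fc0_1_JOB", 100, 110, 120, "FAILED"),
        ("t000_20000101_fc0_1_JOB", 130, 140, 150, "FAILED"),
        ("t000_20000101_fc0_1_JOB", 160, 170, 180, "FAILED"),
    ]
    conn.executemany("INSERT INTO job_data VALUES (?, ?, ?, ?, ?)", retrials)

    for row in conn.execute("SELECT * FROM job_data"):
        # Every retrial must carry real timestamps and the final status.
        assert row["submit"] > 0 and row["start"] > 0 and row["finish"] > 0
        assert row["status"] == "FAILED"

    for row in conn.execute("SELECT job_name, COUNT(*) AS count FROM job_data GROUP BY job_name"):
        assert row["count"] == 3  # retrials: 2 means three attempts in total
    conn.close()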
conn.row_factory = sqlite3.Row + c = conn.cursor() + c.execute("SELECT * FROM job_data") + rows = c.fetchall() + # Convert rows to a list of dictionaries + rows_as_dicts = [dict(row) for row in rows] + # Tune the print so it is more readable, so it is easier to debug in case of failure + column_names = rows_as_dicts[0].keys() if rows_as_dicts else [] + column_widths = [max(len(str(row[col])) for row in rows_as_dicts + [dict(zip(column_names, column_names))]) for col + in column_names] + header = " | ".join(f"{name:<{width}}" for name, width in zip(column_names, column_widths)) + print(f"\n{header}") + print("-" * len(header)) + # Print the rows + for row_dict in rows_as_dicts: + print(" | ".join(f"{str(row_dict[col]):<{width}}" for col, width in zip(column_names, column_widths))) + # Check that all fields contain data, except extra_data, children, and platform_output + # Check that submit, start and finish are > 0 + # assert row_dict["submit"] > 0 and row_dict["finish"] != 1970010101 + # assert row_dict["start"] > 0 and row_dict["finish"] != 1970010101 + # assert row_dict["finish"] > 0 and row_dict["finish"] != 1970010101 + # assert row_dict["status"] == "FAILED" + # for key in [key for key in row_dict.keys() if + # key not in ["status", "finish", "submit", "start", "extra_data", "children", "platform_output"]]: + # assert str(row_dict[key]) != "" + # Check that the job_data table has the expected number of entries + c.execute("SELECT job_name, COUNT(*) as count FROM job_data GROUP BY job_name") + count_rows = c.fetchall() + for row in count_rows: + assert row["count"] == 2 # two retrials + # Close the cursor and connection + c.close() + conn.close() -- GitLab From 5f666b1c5abd0bdee5da71f05324e5bd68240952 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Mon, 16 Sep 2024 12:12:02 +0200 Subject: [PATCH 19/25] [Single job] Test working, retrials working --- autosubmit/autosubmit.py | 14 ++-- .../experiment_history_db_manager.py | 30 ++++++- autosubmit/history/experiment_history.py | 42 ++-------- autosubmit/job/job.py | 19 ++--- autosubmit/job/job_list.py | 30 ++----- autosubmit/platforms/platform.py | 83 ++++++++++--------- test/unit/test_database_regression.py | 29 +++---- 7 files changed, 117 insertions(+), 130 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 853dcd516..21f001961 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -191,7 +191,7 @@ class Autosubmit: default='DEBUG', type=str, help="sets file's log level.") parser.add_argument('-lc', '--logconsole', choices=('NO_LOG', 'INFO', 'WARNING', 'DEBUG'), - default='INFO', type=str, + default='DEBUG', type=str, help="sets console's log level") subparsers = parser.add_subparsers(dest='command') @@ -2316,10 +2316,10 @@ class Autosubmit: raise # If this happens, there is a bug in the code or an exception not-well caught Log.result("No more jobs to run.") # search hint - finished run - for job in job_list.get_completed_without_logs(): + for job in job_list.get_completed_failed_without_logs(): job_list.update_log_status(job, as_conf) job_list.save() - if not did_run and len(job_list.get_completed_without_logs()) > 0: + if not did_run and len(job_list.get_completed_failed_without_logs()) > 0: Log.info(f"Connecting to the platforms, to recover missing logs") submitter = Autosubmit._get_submitter(as_conf) submitter.load_platforms(as_conf) @@ -2328,19 +2328,19 @@ class Autosubmit: platforms_to_test = [value for value in submitter.platforms.values()] 
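This epilogue signals every platform's log-recovery process before joining it, so slow platforms flush their last transfers in parallel instead of one after another. A reduced sketch of that signal-then-join shutdown order, with a trivial worker standing in for the real recover_platform_job_logs:

    import multiprocessing as mp
    import time

    def log_recovery_worker(cleanup_event):
        # Stand-in worker: poll until asked to clean up, then do a final sweep.
        while not cleanup_event.is_set():
            time.sleep(0.1)
        # ... a final pass over the recovery queue would go here ...

    if __name__ == "__main__":
        platforms = []
        for name in ("local", "remote"):
            cleanup_event = mp.Event()
            process = mp.Process(target=log_recovery_worker, args=(cleanup_event,),
                                 name=f"{name}_log_recovery")
            process.start()
            platforms.append((name, cleanup_event, process))

        # Signal every platform first, then join, so the final sweeps overlap.
        for _, cleanup_event, _ in platforms:
            cleanup_event.set()
        for name, _, process in platforms:
            process.join(timeout=30)
            if process.is_alive():
                print(f"log recovery for {name} did not finish in time")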
Autosubmit.restore_platforms(platforms_to_test, as_conf=as_conf, expid=expid) Log.info("Waiting for all logs to be updated") - if len(job_list.get_completed_without_logs()) > 0: + if len(job_list.get_completed_failed_without_logs()) > 0: for p in platforms_to_test: if p.log_recovery_process: p.cleanup_event.set() for p in platforms_to_test: p.log_recovery_process.join() - for job in job_list.get_completed_without_logs(): + for job in job_list.get_completed_failed_without_logs(): job_list.update_log_status(job, as_conf) job_list.save() - if len(job_list.get_completed_without_logs()) == 0: + if len(job_list.get_completed_failed_without_logs()) == 0: Log.result(f"Autosubmit recovered all job logs.") else: - Log.warning(f"Autosubmit couldn't recover the following job logs: {[job.name for job in job_list.get_completed_without_logs()]}") + Log.warning(f"Autosubmit couldn't recover the following job logs: {[job.name for job in job_list.get_completed_failed_without_logs()]}") try: exp_history = ExperimentHistory(expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) diff --git a/autosubmit/history/database_managers/experiment_history_db_manager.py b/autosubmit/history/database_managers/experiment_history_db_manager.py index 8df415c94..b974313c5 100644 --- a/autosubmit/history/database_managers/experiment_history_db_manager.py +++ b/autosubmit/history/database_managers/experiment_history_db_manager.py @@ -215,6 +215,11 @@ class ExperimentHistoryDbManager(DatabaseManager): """ Update JobData data class. Returns latest last=1 row from job_data by job_name. """ self._update_job_data_by_id(job_data_dc) return self.get_job_data_dc_unique_latest_by_job_name(job_data_dc.job_name) + + def update_job_data_dc_by_job_id(self, job_data_dc): + """ Update JobData data class. Returns latest last=1 row from job_data by job_name. """ + self._update_job_data_by_job_id(job_data_dc) + return self.get_job_data_by_job_id(job_data_dc.job_id) def update_list_job_data_dc_by_each_id(self, job_data_dcs): """ Return length of updated list. """ @@ -326,6 +331,21 @@ class ExperimentHistoryDbManager(DatabaseManager): job_data_dc.children, job_data_dc.platform_output, job_data_dc._id) self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) + def _update_job_data_by_job_id(self, job_data_dc): + """ + Update job_data table with data class JobData. + Update last, submit, start, finish, modified, job_id, status, energy, extra_data, nnodes, ncpus, rowstatus, out, err by id. + """ + statement = ''' UPDATE job_data SET last=?, submit=?, start=?, finish=?, modified=?, + job_id=?, status=?, energy=?, extra_data=?, + nnodes=?, ncpus=?, rowstatus=?, out=?, err=?, + children=?, platform_output=? WHERE job_id=? ''' + arguments = (job_data_dc.last, job_data_dc.submit, job_data_dc.start, job_data_dc.finish, HUtils.get_current_datetime(), + job_data_dc.job_id, job_data_dc.status, job_data_dc.energy, job_data_dc.extra_data, + job_data_dc.nnodes, job_data_dc.ncpus, job_data_dc.rowstatus, job_data_dc.out, job_data_dc.err, + job_data_dc.children, job_data_dc.platform_output, job_data_dc.job_id) + self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) + def _update_experiment_run(self, experiment_run_dc): """ Update experiment_run table with data class ExperimentRun. 
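The new *_by_job_id manager methods key their UPDATE on the scheduler job id instead of the unique-latest row for a job name, so start and finish times land on the row that carries that id. A minimal sketch of a parameterised update keyed on job_id, trimmed to a handful of columns for illustration:

    import sqlite3

    def update_finish_by_job_id(conn: sqlite3.Connection, job_id: int, finish: int, status: str) -> int:
        # Parameterised UPDATE keyed on job_id; returns how many rows were touched.
        with conn:  # commits on success, rolls back on error
            cursor = conn.execute(
                "UPDATE job_data SET finish = ?, status = ? WHERE job_id = ?",
                (finish, status, job_id),
            )
        return cursor.rowcount  # 0 means no row carries that job_id

    if __name__ == "__main__":
        conn = sqlite3.connect(":memory:")  # throwaway table for the sketch
        conn.execute("CREATE TABLE job_data (job_id INT, job_name TEXT, finish INT, status TEXT)")
        conn.execute("INSERT INTO job_data VALUES (12345, 't000_20000101_fc0_1_JOB', 0, 'RUNNING')")
        assert update_finish_by_job_id(conn, 12345, 1726000000, "COMPLETED") == 1
        assert conn.execute("SELECT status FROM job_data WHERE job_id = 12345").fetchone()[0] == "COMPLETED"
        conn.close()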
@@ -352,7 +372,15 @@ class ExperimentHistoryDbManager(DatabaseManager): arguments = (job_name,) job_data_rows = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) return [Models.JobDataRow(*row) for row in job_data_rows] - + + def get_job_data_by_job_id(self, job_id): + """ Get List of Models.JobDataRow for job_id """ + statement = self.get_built_select_statement("job_data", "job_id=?") + arguments = (int(job_id),) + job_data_rows = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) + models = [Models.JobDataRow(*row) for row in job_data_rows][0] + return JobData.from_model(models) + def get_job_data_max_counter(self): """ The max counter is the maximum count value for the count column in job_data. """ statement = "SELECT MAX(counter) as maxcounter FROM job_data" diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py index 5fd081600..1423139cd 100644 --- a/autosubmit/history/experiment_history.py +++ b/autosubmit/history/experiment_history.py @@ -97,23 +97,7 @@ class ExperimentHistory: member="", section="", chunk=0, platform="NA", job_id=0, wrapper_queue=None, wrapper_code=None, children=""): try: - job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) - if not job_data_dc_last: - job_data_dc_last = self.write_submit_time(job_name=job_name, - status=status, - ncpus=ncpus, - wallclock=wallclock, - qos=qos, - date=date, - member=member, - section=section, - chunk=chunk, - platform=platform, - job_id=job_id, - wrapper_queue=wrapper_queue, - wrapper_code=wrapper_code) - self._log.log("write_start_time {0} start not found.".format(job_name)) - job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) + job_data_dc_last = self.manager.get_job_data_by_job_id(job_id) if not job_data_dc_last: raise Exception("Job {0} has not been found in the database.".format(job_name)) job_data_dc_last.start = start @@ -122,7 +106,7 @@ class ExperimentHistory: job_data_dc_last.rowtype = self._get_defined_rowtype(wrapper_code) job_data_dc_last.job_id = job_id job_data_dc_last.children = children - return self.manager.update_job_data_dc_by_id(job_data_dc_last) + return self.manager.update_job_data_dc_by_job_id(job_data_dc_last) except Exception as exp: self._log.log(str(exp), traceback.format_exc()) Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') @@ -131,24 +115,7 @@ class ExperimentHistory: member="", section="", chunk=0, platform="NA", job_id=0, out_file=None, err_file=None, wrapper_queue=None, wrapper_code=None, children=""): try: - job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) - if not job_data_dc_last: - job_data_dc_last = self.write_submit_time(job_name=job_name, - status=status, - ncpus=ncpus, - wallclock=wallclock, - qos=qos, - date=date, - member=member, - section=section, - chunk=chunk, - platform=platform, - job_id=job_id, - wrapper_queue=wrapper_queue, - wrapper_code=wrapper_code, - children=children) - self._log.log("write_finish_time {0} submit not found.".format(job_name)) - job_data_dc_last = self.manager.get_job_data_dc_unique_latest_by_job_name(job_name) + job_data_dc_last = self.manager.get_job_data_by_job_id(job_id) if not job_data_dc_last: raise Exception("Job {0} has not been found in the database.".format(job_name)) job_data_dc_last.finish = finish if finish > 0 else int(time()) @@ -157,7 +124,8 @@ class ExperimentHistory: job_data_dc_last.rowstatus = 
Models.RowStatus.PENDING_PROCESS job_data_dc_last.out = out_file if out_file else "" job_data_dc_last.err = err_file if err_file else "" - return self.manager.update_job_data_dc_by_id(job_data_dc_last) + return self.manager.update_job_data_dc_by_job_id(job_data_dc_last) + except Exception as exp: self._log.log(str(exp), traceback.format_exc()) Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 594bf28a6..5e94b6ac0 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -208,7 +208,7 @@ class Job(object): self._local_logs = ('', '') self._remote_logs = ('', '') self.script_name = self.name + ".cmd" - self.stat_file = self.script_name[:-4] + "_STAT" + self.stat_file = self.script_name[:-4] + "_STAT_0" self.status = status self.prev_status = status self.old_status = self.status @@ -986,7 +986,7 @@ class Job(object): :rtype: int """ if fail_count == -1: - logname = os.path.join(self._tmp_path, self.name + '_STAT') + logname = os.path.join(self._tmp_path, self.stat_file) else: fail_count = str(fail_count) logname = os.path.join(self._tmp_path, self.name + '_STAT_' + fail_count) @@ -1145,11 +1145,12 @@ class Job(object): log_retrieved = False self.log_retrieved = log_retrieved - def write_stats(self, platform): + def write_stats(self): # Update the logs with Autosubmit Job ID Brand + try: for local_log in self.local_logs: - platform.write_jobid(self.id, os.path.join( + self.platform.write_jobid(self.id, os.path.join( self._tmp_path, 'LOG_' + str(self.expid), local_log)) except BaseException as e: Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format(str(e), self.name)) @@ -1163,6 +1164,7 @@ class Job(object): self.platform.get_stat_file(self) self.write_start_time(from_stat_file=True) self.write_end_time(self.status == Status.COMPLETED) + Log.result(f"{self.fail_count} retrials of job:{self.name} and {self.id} has been inserted in the db") def retrieve_logfiles(self, platform, raise_error=False): """ @@ -1171,7 +1173,6 @@ class Job(object): :param raise_error: boolean to raise an error if the logs are not retrieved :return: """ - backup_logname = copy.copy(self.local_logs) if self.wrapper_type == "vertical": self.retrieve_internal_retrials_logfiles(platform) @@ -1185,8 +1186,7 @@ class Job(object): else: Log.printlog("Failed to retrieve logs for job {0}".format(self.name), 6000) else: - self.write_stats(platform) - + self.write_stats() def parse_time(self,wallclock): regex = re.compile(r'(((?P\d+):)((?P\d+)))(:(?P\d+))?') @@ -1286,11 +1286,6 @@ class Job(object): Log.printlog("Job {0} in SUBMITTED status. 
This should never happen on this step..".format( self.name), 6008) - - # # Write start_time() if not already written and job is running, completed or failed - # if self.status in [Status.RUNNING, Status.COMPLETED, Status.FAILED] and not self.start_time_written: - # self.write_start_time() - # Updating logs if self.status in [Status.COMPLETED, Status.FAILED, Status.UNKNOWN]: if str(as_conf.platforms_data.get(self.platform.name, {}).get('DISABLE_RECOVERY_THREADS', "false")).lower() == "true": diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index fa92242af..6aad371df 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -1725,35 +1725,20 @@ class JobList(object): else: return completed_jobs - def get_completed_without_logs(self, platform=None): + def get_completed_failed_without_logs(self, platform=None): """ Returns a list of completed jobs without updated logs :param platform: job platform :type platform: HPCPlatform - :return: completed jobs - :rtype: list - """ - - completed_jobs = [job for job in self._job_list if (platform is None or job.platform.name == platform.name) and - job.status == Status.COMPLETED and job.updated_log is False ] - - return completed_jobs - - def get_completed_without_logs(self, platform=None): - """ - Returns a list of completed jobs without updated logs - - :param platform: job platform - :type platform: HPCPlatform - :return: completed jobs + :return: completed and failed jobs without logs :rtype: list """ - completed_jobs = [job for job in self._job_list if (platform is None or job.platform.name == platform.name) and - job.status == Status.COMPLETED and job.updated_log is False ] + completed_failed_jobs = [job for job in self._job_list if (platform is None or job.platform.name == platform.name) and + (job.status == Status.COMPLETED or job.status == Status.FAILED) and job.updated_log is False ] - return completed_jobs + return completed_failed_jobs def get_uncompleted(self, platform=None, wrapper=False): """ @@ -2602,6 +2587,7 @@ class JobList(object): job.updated_log = True elif not job.updated_log and str(as_conf.platforms_data.get(job.platform.name, {}).get('DISABLE_RECOVERY_THREADS', "false")).lower() == "false": job.platform.add_job_to_log_recover(job) + return log_recovered def check_if_log_is_recovered(self, job): """ @@ -2639,6 +2625,8 @@ class JobList(object): save = store_change Log.debug('Updating FAILED jobs') write_log_status = False + for job in self.get_completed_failed_without_logs(): + save = self.update_log_status(job, as_conf) if not save else save if not first_time: for job in self.get_failed(): job.packed = False @@ -2714,8 +2702,6 @@ class JobList(object): # Log name has this format: # a02o_20000101_fc0_2_SIM.20240212115021.err # $jobname.$(YYYYMMDDHHMMSS).err or .out - if not first_time: - self.update_log_status(job, as_conf) if job.synchronize is not None and len(str(job.synchronize)) > 0: tmp = [parent for parent in job.parents if parent.status == Status.COMPLETED] if len(tmp) != len(job.parents): diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index d67ac6af0..9bca11920 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -36,6 +36,7 @@ class Platform(object): Class to manage the connections to the different platforms. 
""" worker_events = list() + lock = multiprocessing.Lock() def __init__(self, expid, name, config, auth_password=None): """ @@ -852,55 +853,63 @@ class Platform(object): This function, waits for the work_event to be set by the main process. If it is not set, it returns False and the log recovery process ends. """ keep_working = False - for remaining in range(timeout, 0, -1): # Wait the full timeout to not hammer the CPU. + # The whole point of this if is to make the regression tests faster to run. + if timeout >= 60: + unstoppable_timeout = 60 + timeout = timeout - 60 + else: + unstoppable_timeout = timeout + timeout = 0 + for remaining in range(unstoppable_timeout, 0, -1): time.sleep(1) if self.work_event.is_set() or not self.recovery_queue.empty(): keep_working = True if not self.recovery_queue.empty() or self.cleanup_event.is_set(): break - interruptible_additional_timeout = 60 - while not self.work_event.is_set() and not self.cleanup_event.is_set() and interruptible_additional_timeout > 0: - time.sleep(1) - interruptible_additional_timeout -= 1 + if not keep_working: + while timeout > 0 and self.recovery_queue.empty() and not self.cleanup_event.is_set() and not self.work_event.is_set(): + time.sleep(1) + timeout -= 1 + if not self.recovery_queue.empty() or self.cleanup_event.is_set() or self.work_event.is_set(): + keep_working = True self.work_event.clear() return keep_working def recover_job_log(self, identifier, jobs_pending_to_process): job = None - try: - while not self.recovery_queue.empty(): - try: - job = self.recovery_queue.get( - timeout=1) # Should be non-empty, but added a timeout for other possible errors. - job.children = set() # Children can't be serialized, so we set it to an empty set for this process. - job.platform = self # change the original platform to this process platform. - job._log_recovery_retries = 0 # reset the log recovery retries. - Log.debug(f"{identifier} Recovering log files for job {job.name}") - job.retrieve_logfiles(self, raise_error=True) - Log.result(f"{identifier} Sucessfully recovered log files for job {job.name}") - except queue.Empty: - pass - # This second while is to keep retring the failed jobs. - while len(jobs_pending_to_process) > 0: # jobs that had any issue during the log retrieval - job = jobs_pending_to_process.pop() - job.children = set() - job.platform = self - job._log_recovery_retries += 1 - Log.debug( - f"{identifier} (Retrial number: {job._log_recovery_retries}) Recovering log files for job {job.name}") - job.retrieve_logfiles(self, raise_error=True) - Log.result(f"{identifier} (Retrial) Successfully recovered log files for job {job.name}") - except Exception as e: - Log.warning(f"{identifier} Error while recovering logs: {str(e)}") + # try: + while not self.recovery_queue.empty(): try: - if job and job._log_recovery_retries < 5: # If log retrieval failed, add it to the pending jobs to process. Avoids to keep trying the same job forever. - jobs_pending_to_process.add(job) - self.connected = False - Log.info(f"{identifier} Attempting to restore connection") - self.restore_connection(None) # Always restore the connection on a failure. - Log.result(f"{identifier} Sucessfully reconnected.") - except: + job = self.recovery_queue.get( + timeout=1) # Should be non-empty, but added a timeout for other possible errors. + job.children = set() # Children can't be serialized, so we set it to an empty set for this process. + job.platform = self # change the original platform to this process platform. 
+ job._log_recovery_retries = 0 # reset the log recovery retries. + Log.debug(f"{identifier} Recovering log files for job {job.name}") + job.retrieve_logfiles(self, raise_error=True) + if job.status == Status.FAILED: + Log.result(f"{identifier} Sucessfully recovered log files for job {job.name} and retrial:{job.fail_count}") + except queue.Empty: pass + # This second while is to keep retring the failed jobs. + while len(jobs_pending_to_process) > 0: # jobs that had any issue during the log retrieval + job = jobs_pending_to_process.pop() + job._log_recovery_retries += 1 + Log.debug( + f"{identifier} (Retrial number: {job._log_recovery_retries}) Recovering log files for job {job.name}") + job.retrieve_logfiles(self, raise_error=True) + Log.result(f"{identifier} (Retrial) Successfully recovered log files for job {job.name}") + # except Exception as e: + # Log.info(f"{identifier} Error while recovering logs: {str(e)}") + # try: + # if job and job._log_recovery_retries < 5: # If log retrieval failed, add it to the pending jobs to process. Avoids to keep trying the same job forever. + # jobs_pending_to_process.add(job) + # self.connected = False + # Log.info(f"{identifier} Attempting to restore connection") + # self.restore_connection(None) # Always restore the connection on a failure. + # Log.result(f"{identifier} Sucessfully reconnected.") + # except: + # pass return jobs_pending_to_process def recover_platform_job_logs(self): diff --git a/test/unit/test_database_regression.py b/test/unit/test_database_regression.py index 8d17f0e12..3e5032a7a 100644 --- a/test/unit/test_database_regression.py +++ b/test/unit/test_database_regression.py @@ -169,12 +169,12 @@ def failure_jobs_file(db_tmpdir): PLATFORM: local RUNNING: chunk wallclock: 00:01 - retrials: 1 + retrials: 2 """) @pytest.fixture -def run_experiment_success(prepare_db, db_tmpdir, success_jobs_file): +def run_experiment_success(prepare_db, db_tmpdir, success_jobs_file, mocker): init_expid(os.environ["AUTOSUBMIT_CONFIGURATION"], platform='local', expid='t000', create=True) # job_list, submitter, exp_history, host, as_conf, platforms_to_test, packages_persistence, _ = Autosubmit.prepare_run("t000") as_misc = Path(f"{db_tmpdir.strpath}/t000/conf/as_misc.yml") @@ -186,7 +186,8 @@ ASMISC: AS_COMMAND: run """) # Completed - Autosubmit.run_experiment(expid='t000') + with mocker.patch('autosubmit.platforms.platform.max', return_value=50): + Autosubmit.run_experiment(expid='t000') # Access to the job_historical.db job_data = Path(f"{db_tmpdir.strpath}/job_data_t000.db") autosubmit_db = Path(f"{db_tmpdir.strpath}/tests.db") @@ -209,8 +210,9 @@ AS_COMMAND: run """) # Completed # mock platform.localplatform.check_exists - with mocker.patch('autosubmit.platforms.platform.Platform.get_completed_files', return_value=False): - Autosubmit.run_experiment(expid='t000') + with mocker.patch('autosubmit.platforms.platform.max', return_value=10): + with mocker.patch('autosubmit.platforms.platform.Platform.get_completed_files', return_value=False): + Autosubmit.run_experiment(expid='t000') # Access to the job_historical.db job_data = Path(f"{db_tmpdir.strpath}/job_data_t000.db") autosubmit_db = Path(f"{db_tmpdir.strpath}/tests.db") @@ -256,7 +258,6 @@ def test_db_success(run_experiment_success, db_tmpdir): c.close() conn.close() -#Need to improve the performance of this test def test_db_failure(run_experiment_failure, db_tmpdir): job_data = Path(f"{db_tmpdir.strpath}/job_data_t000.db") conn = sqlite3.connect(job_data) @@ -278,18 +279,18 @@ def 
test_db_failure(run_experiment_failure, db_tmpdir): print(" | ".join(f"{str(row_dict[col]):<{width}}" for col, width in zip(column_names, column_widths))) # Check that all fields contain data, except extra_data, children, and platform_output # Check that submit, start and finish are > 0 - # assert row_dict["submit"] > 0 and row_dict["finish"] != 1970010101 - # assert row_dict["start"] > 0 and row_dict["finish"] != 1970010101 - # assert row_dict["finish"] > 0 and row_dict["finish"] != 1970010101 - # assert row_dict["status"] == "FAILED" - # for key in [key for key in row_dict.keys() if - # key not in ["status", "finish", "submit", "start", "extra_data", "children", "platform_output"]]: - # assert str(row_dict[key]) != "" + assert row_dict["submit"] > 0 and row_dict["finish"] != 1970010101 + assert row_dict["start"] > 0 and row_dict["finish"] != 1970010101 + assert row_dict["finish"] > 0 and row_dict["finish"] != 1970010101 + assert row_dict["status"] == "FAILED" + for key in [key for key in row_dict.keys() if + key not in ["status", "finish", "submit", "start", "extra_data", "children", "platform_output"]]: + assert str(row_dict[key]) != "" # Check that the job_data table has the expected number of entries c.execute("SELECT job_name, COUNT(*) as count FROM job_data GROUP BY job_name") count_rows = c.fetchall() for row in count_rows: - assert row["count"] == 2 # two retrials + assert row["count"] == 3 # three retrials # Close the cursor and connection c.close() conn.close() -- GitLab From aa14b011a92ce4cb8cb8a2828381068cab5b79d3 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 26 Sep 2024 09:28:28 +0200 Subject: [PATCH 20/25] !475 vertical regression test --- autosubmit/autosubmit.py | 9 +- .../experiment_history_db_manager.py | 40 +-- autosubmit/history/experiment_history.py | 8 +- autosubmit/job/job.py | 284 ++++++++++-------- autosubmit/job/job_list.py | 5 +- autosubmit/job/job_packager.py | 32 +- autosubmit/job/job_packages.py | 27 +- autosubmit/platforms/locplatform.py | 67 ++++- autosubmit/platforms/paramiko_platform.py | 56 +++- autosubmit/platforms/pjmplatform.py | 13 +- autosubmit/platforms/platform.py | 119 ++++---- autosubmit/platforms/slurmplatform.py | 27 +- .../platforms/wrappers/wrapper_builder.py | 51 +++- .../platforms/wrappers/wrapper_factory.py | 52 ++++ test/unit/test_database_regression.py | 192 +++++------- test/unit/test_log_recovery.py | 8 +- 16 files changed, 539 insertions(+), 451 deletions(-) diff --git a/autosubmit/autosubmit.py b/autosubmit/autosubmit.py index 21f001961..e27d4dfd4 100644 --- a/autosubmit/autosubmit.py +++ b/autosubmit/autosubmit.py @@ -2333,7 +2333,8 @@ class Autosubmit: if p.log_recovery_process: p.cleanup_event.set() for p in platforms_to_test: - p.log_recovery_process.join() + if p.log_recovery_process: + p.log_recovery_process.join() for job in job_list.get_completed_failed_without_logs(): job_list.update_log_status(job, as_conf) job_list.save() @@ -2506,12 +2507,6 @@ class Autosubmit: if error_message != "": raise AutosubmitCritical("Submission Failed due wrong configuration:{0}".format(error_message), 7014) - if not inspect: - for package in valid_packages_to_submit: - wrapper_time = None - for job in package.jobs: # if jobs > 1 == wrapped == same submission time - job.write_submit_time(wrapper_submit_time=wrapper_time) - wrapper_time = job.submit_time_timestamp if wrapper_errors and not any_job_submitted and len(job_list.get_in_queue()) == 0: # Deadlock situation diff --git 
a/autosubmit/history/database_managers/experiment_history_db_manager.py b/autosubmit/history/database_managers/experiment_history_db_manager.py index b974313c5..1e8127bef 100644 --- a/autosubmit/history/database_managers/experiment_history_db_manager.py +++ b/autosubmit/history/database_managers/experiment_history_db_manager.py @@ -216,10 +216,10 @@ class ExperimentHistoryDbManager(DatabaseManager): self._update_job_data_by_id(job_data_dc) return self.get_job_data_dc_unique_latest_by_job_name(job_data_dc.job_name) - def update_job_data_dc_by_job_id(self, job_data_dc): + def update_job_data_dc_by_job_id_name(self, job_data_dc): """ Update JobData data class. Returns latest last=1 row from job_data by job_name. """ - self._update_job_data_by_job_id(job_data_dc) - return self.get_job_data_by_job_id(job_data_dc.job_id) + self._update_job_data_by_id(job_data_dc) + return self.get_job_data_by_job_id_name(job_data_dc.job_id, job_data_dc.job_name) def update_list_job_data_dc_by_each_id(self, job_data_dcs): """ Return length of updated list. """ @@ -324,26 +324,11 @@ class ExperimentHistoryDbManager(DatabaseManager): statement = ''' UPDATE job_data SET last=?, submit=?, start=?, finish=?, modified=?, job_id=?, status=?, energy=?, extra_data=?, nnodes=?, ncpus=?, rowstatus=?, out=?, err=?, - children=?, platform_output=? WHERE id=? ''' - arguments = (job_data_dc.last, job_data_dc.submit, job_data_dc.start, job_data_dc.finish, HUtils.get_current_datetime(), - job_data_dc.job_id, job_data_dc.status, job_data_dc.energy, job_data_dc.extra_data, - job_data_dc.nnodes, job_data_dc.ncpus, job_data_dc.rowstatus, job_data_dc.out, job_data_dc.err, - job_data_dc.children, job_data_dc.platform_output, job_data_dc._id) - self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) - - def _update_job_data_by_job_id(self, job_data_dc): - """ - Update job_data table with data class JobData. - Update last, submit, start, finish, modified, job_id, status, energy, extra_data, nnodes, ncpus, rowstatus, out, err by id. - """ - statement = ''' UPDATE job_data SET last=?, submit=?, start=?, finish=?, modified=?, - job_id=?, status=?, energy=?, extra_data=?, - nnodes=?, ncpus=?, rowstatus=?, out=?, err=?, - children=?, platform_output=? WHERE job_id=? ''' + children=?, platform_output=?, id=? WHERE id=?''' arguments = (job_data_dc.last, job_data_dc.submit, job_data_dc.start, job_data_dc.finish, HUtils.get_current_datetime(), job_data_dc.job_id, job_data_dc.status, job_data_dc.energy, job_data_dc.extra_data, job_data_dc.nnodes, job_data_dc.ncpus, job_data_dc.rowstatus, job_data_dc.out, job_data_dc.err, - job_data_dc.children, job_data_dc.platform_output, job_data_dc.job_id) + job_data_dc.children, job_data_dc.platform_output, job_data_dc._id, job_data_dc._id) self.execute_statement_with_arguments_on_dbfile(self.historicaldb_file_path, statement, arguments) def _update_experiment_run(self, experiment_run_dc): @@ -366,19 +351,12 @@ class ExperimentHistoryDbManager(DatabaseManager): job_data_rows = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) return [Models.JobDataRow(*row) for row in job_data_rows] - def get_job_data_by_name(self, job_name): - """ Get List of Models.JobDataRow for job_name """ - statement = self.get_built_select_statement("job_data", "job_name=? 
ORDER BY counter DESC") - arguments = (job_name,) - job_data_rows = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) - return [Models.JobDataRow(*row) for row in job_data_rows] - - def get_job_data_by_job_id(self, job_id): + def get_job_data_by_job_id_name(self, job_id, job_name): """ Get List of Models.JobDataRow for job_id """ - statement = self.get_built_select_statement("job_data", "job_id=?") - arguments = (int(job_id),) + statement = self.get_built_select_statement("job_data", "job_id=? AND job_name=? ORDER BY counter") + arguments = (int(job_id), str(job_name),) job_data_rows = self.get_from_statement_with_arguments(self.historicaldb_file_path, statement, arguments) - models = [Models.JobDataRow(*row) for row in job_data_rows][0] + models = [Models.JobDataRow(*row) for row in job_data_rows][-1] return JobData.from_model(models) def get_job_data_max_counter(self): diff --git a/autosubmit/history/experiment_history.py b/autosubmit/history/experiment_history.py index 1423139cd..8dcf5d068 100644 --- a/autosubmit/history/experiment_history.py +++ b/autosubmit/history/experiment_history.py @@ -97,7 +97,7 @@ class ExperimentHistory: member="", section="", chunk=0, platform="NA", job_id=0, wrapper_queue=None, wrapper_code=None, children=""): try: - job_data_dc_last = self.manager.get_job_data_by_job_id(job_id) + job_data_dc_last = self.manager.get_job_data_by_job_id_name(job_id, job_name) if not job_data_dc_last: raise Exception("Job {0} has not been found in the database.".format(job_name)) job_data_dc_last.start = start @@ -106,7 +106,7 @@ class ExperimentHistory: job_data_dc_last.rowtype = self._get_defined_rowtype(wrapper_code) job_data_dc_last.job_id = job_id job_data_dc_last.children = children - return self.manager.update_job_data_dc_by_job_id(job_data_dc_last) + return self.manager.update_job_data_dc_by_job_id_name(job_data_dc_last) except Exception as exp: self._log.log(str(exp), traceback.format_exc()) Log.debug(f'Historical Database error: {str(exp)} {traceback.format_exc()}') @@ -115,7 +115,7 @@ class ExperimentHistory: member="", section="", chunk=0, platform="NA", job_id=0, out_file=None, err_file=None, wrapper_queue=None, wrapper_code=None, children=""): try: - job_data_dc_last = self.manager.get_job_data_by_job_id(job_id) + job_data_dc_last = self.manager.get_job_data_by_job_id_name(job_id, job_name) if not job_data_dc_last: raise Exception("Job {0} has not been found in the database.".format(job_name)) job_data_dc_last.finish = finish if finish > 0 else int(time()) @@ -124,7 +124,7 @@ class ExperimentHistory: job_data_dc_last.rowstatus = Models.RowStatus.PENDING_PROCESS job_data_dc_last.out = out_file if out_file else "" job_data_dc_last.err = err_file if err_file else "" - return self.manager.update_job_data_dc_by_job_id(job_data_dc_last) + return self.manager.update_job_data_dc_by_job_id_name(job_data_dc_last) except Exception as exp: self._log.log(str(exp), traceback.format_exc()) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 5e94b6ac0..23491b6db 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -251,12 +251,14 @@ class Job(object): self.het = None self.updated_log = False self.log_retrieved = False - self.start_time_written = False self.submit_time_timestamp = None # for wrappers, all jobs inside a wrapper are submitted at the same time + self.start_time_timestamp = None self.finish_time_timestamp = None # for wrappers, with inner_retrials, the submission time should be the last finish_time of 
the previous retrial self._script = None # Inline code to be executed self._log_recovery_retries = None self.ready_date = None + self.wrapper_name = None + self.is_wrapper = False def _init_runtime_parameters(self): # hetjobs @@ -271,9 +273,10 @@ class Job(object): self._memory = '' self._memory_per_task = '' self.log_retrieved = False - self.start_time_placeholder = time.time() + self.start_time_timestamp = time.time() + self.end_time_placeholder = time.time() self.processors_per_node = "" - self.stat_file = self.script_name[:-4] + "_STAT" + self.stat_file = self.script_name[:-4] + "_STAT_0" @property @@ -1127,66 +1130,77 @@ class Job(object): def retrieve_internal_retrials_logfiles(self, platform): log_retrieved = False - original = copy.deepcopy(self.local_logs) - for i in range(0, int(self.retrials + 1)): - if i > 0: - self.local_logs = (original[0][:-4] + "_{0}".format(i) + ".out", original[1][:-4] + "_{0}".format(i) + ".err") - self.remote_logs = self.get_new_remotelog_name(i) - if not self.remote_logs: - self.log_retrieved = False - else: + last_retrial = 0 + try: + for i in range(0, int(self.retrials + 1)): + self.update_local_logs(count=i, update_submit_time=False) + backup_log = copy.copy(self.remote_logs) + self.remote_logs = self.get_new_remotelog_name(i) if self.check_remote_log_exists(platform): - try: - self.synchronize_logs(platform, self.remote_logs, self.local_logs) - remote_logs = copy.deepcopy(self.local_logs) - platform.get_logs_files(self.expid, remote_logs) - log_retrieved = True - except BaseException: - log_retrieved = False - self.log_retrieved = log_retrieved - - def write_stats(self): - # Update the logs with Autosubmit Job ID Brand + self.synchronize_logs(platform, self.remote_logs, self.local_logs) + remote_logs = copy.deepcopy(self.local_logs) + platform.get_logs_files(self.expid, remote_logs) + log_retrieved = True + last_retrial = i + else: + self.remote_logs = backup_log + break + except: + pass + self.log_retrieved = log_retrieved + if self.log_retrieved: + self.platform.processed_wrapper_logs.add(self.wrapper_name) - try: - for local_log in self.local_logs: - self.platform.write_jobid(self.id, os.path.join( - self._tmp_path, 'LOG_' + str(self.expid), local_log)) - except BaseException as e: - Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format(str(e), self.name)) + return last_retrial + + def write_stats(self, last_retrial): # write stats if self.wrapper_type == "vertical": # Disable AS retrials for vertical wrappers to use internal ones - for i in range(0, int(self.retrials + 1)): - if self.platform.get_stat_file(self, count=i): - self.write_vertical_time(i) - self.inc_fail_count() + for i in range(0, int(last_retrial + 1)): + self.platform.get_stat_file(self, count=i) + self.write_vertical_time(i) + self.inc_fail_count() + # Update the logs with Autosubmit Job ID Brand + try: + for local_log in self.local_logs: + self.platform.write_jobid(self.id, os.path.join( + self._tmp_path, 'LOG_' + str(self.expid), local_log)) + except BaseException as e: + Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format(str(e), self.name)) else: + self.update_local_logs(update_submit_time=False) self.platform.get_stat_file(self) - self.write_start_time(from_stat_file=True) + self.write_submit_time() + self.write_start_time() self.write_end_time(self.status == Status.COMPLETED) Log.result(f"{self.fail_count} retrials of job:{self.name} and {self.id} has been inserted in the db") + # Update the logs with Autosubmit Job ID Brand + try: + for 
local_log in self.local_logs: + self.platform.write_jobid(self.id, os.path.join( + self._tmp_path, 'LOG_' + str(self.expid), local_log)) + except BaseException as e: + Log.printlog("Trace {0} \n Failed to write the {1} e=6001".format(str(e), self.name)) def retrieve_logfiles(self, platform, raise_error=False): """ - Retrieves log files from remote host meant to be used inside a process. - :param platform: platform that is calling the function, already connected. - :param raise_error: boolean to raise an error if the logs are not retrieved - :return: + Retrieves log files from remote host + :param platform: HPCPlatform object + :param raise_error: boolean, if True, raises an error if the log files are not retrieved + :return: dict with finish timestamps per job """ backup_logname = copy.copy(self.local_logs) if self.wrapper_type == "vertical": - self.retrieve_internal_retrials_logfiles(platform) + last_retrial = self.retrieve_internal_retrials_logfiles(platform) else: self.retrieve_external_retrials_logfiles(platform) - + last_retrial = 0 if not self.log_retrieved: self.local_logs = backup_logname - if raise_error: + if raise_error and self.wrapper_name not in self.platform.processed_wrapper_logs: raise AutosubmitCritical("Failed to retrieve logs for job {0}".format(self.name), 6000) - else: - Log.printlog("Failed to retrieve logs for job {0}".format(self.name), 6000) else: - self.write_stats() + self.write_stats(last_retrial) def parse_time(self,wallclock): regex = re.compile(r'(((?P\d+):)((?P\d+)))(:(?P\d+))?') @@ -1292,10 +1306,6 @@ class Job(object): self.retrieve_logfiles(self.platform) else: self.platform.add_job_to_log_recover(self) - - - - return self.status @staticmethod @@ -1328,16 +1338,16 @@ class Job(object): :param default_status: status to set if job is not completed. By default, is FAILED :type default_status: Status """ - log_name = os.path.join(self._tmp_path, self.name + '_COMPLETED') + completed_file = os.path.join(self._tmp_path, self.name + '_COMPLETED') + completed_file_location = os.path.join(self._tmp_path, f"LOG_{self.expid}", self.name + '_COMPLETED') - if os.path.exists(log_name): + if os.path.exists(completed_file) or os.path.exists(completed_file_location): if not over_wallclock: self.status = Status.COMPLETED else: return Status.COMPLETED else: - Log.printlog("Job {0} completion check failed. There is no COMPLETED file".format( - self.name), 6009) + Log.warning(f"Couldn't find {self.name} COMPLETED file") if not over_wallclock: self.status = default_status else: @@ -1878,7 +1888,11 @@ class Job(object): self.shape = as_conf.jobs_data[self.section].get("SHAPE", "") self.script = as_conf.jobs_data[self.section].get("SCRIPT", "") self.x11 = False if str(as_conf.jobs_data[self.section].get("X11", False)).lower() == "false" else True - self.stat_file = f"{self.script_name[:-4]}_STAT_{self.fail_count}" + if self.wrapper_type != "vertical" and self.packed: + self.stat_file = f"{self.script_name[:-4]}_STAT_{self.fail_count}" + else: + self.stat_file = f"{self.script_name[:-4]}_STAT_0" + if self.checkpoint: # To activate placeholder sustitution per in the template parameters["AS_CHECKPOINT"] = self.checkpoint parameters['JOBNAME'] = self.name @@ -1960,9 +1974,6 @@ class Job(object): self._init_runtime_parameters() if hasattr(self, "start_time"): self.start_time = time.time() - for event in self.platform.worker_events: # keep alive log retrieval workers. 
- if not event.is_set(): - event.set() # Parameters that affect to all the rest of parameters self.update_dict_parameters(as_conf) parameters = parameters.copy() @@ -1986,6 +1997,9 @@ class Job(object): # For some reason, there is return but the assignee is also necessary self.parameters = parameters # This return is only being used by the mock , to change the mock + for event in self.platform.worker_events: # keep alive log retrieval workers. + if not event.is_set(): + event.set() return parameters def update_content_extra(self,as_conf,files): @@ -2229,29 +2243,20 @@ class Job(object): str(set(parameters) - set(variables))), 5013) return out - def write_submit_time(self, hold=False, enable_vertical_write=False, wrapper_submit_time=None): + def update_local_logs(self, count=-1, update_submit_time=True): + if update_submit_time: + self.submit_time_timestamp = date2str(datetime.datetime.now(), 'S') + if count > 0: + self.local_logs = (f"{self.name}.{self.submit_time_timestamp}.out_retrial_{count}", + f"{self.name}.{self.submit_time_timestamp}.err_retrial_{count}") + else: + self.local_logs = (f"{self.name}.{self.submit_time_timestamp}.out", + f"{self.name}.{self.submit_time_timestamp}.err") + + def write_submit_time(self): """ Writes submit date and time to TOTAL_STATS file. It doesn't write if hold is True. """ - - self.start_time_written = False - if not enable_vertical_write: - if wrapper_submit_time: - self.submit_time_timestamp = wrapper_submit_time - else: - self.submit_time_timestamp = date2str(datetime.datetime.now(), 'S') - if self.wrapper_type != "vertical": - self.local_logs = (f"{self.name}.{self.submit_time_timestamp}.out", f"{self.name}.{self.submit_time_timestamp}.err") # for wrappers with inner retrials - else: - self.local_logs = (f"{self.name}.{self.submit_time_timestamp}.out", - f"{self.name}.{self.submit_time_timestamp}.err") # for wrappers with inner retrials - return - if self.wrapper_type == "vertical" and self.fail_count > 0: - self.submit_time_timestamp = self.finish_time_timestamp - print(("Call from {} with status {}".format(self.name, self.status_str))) - if hold is True: - return # Do not write for HELD jobs. 
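A minimal, illustrative sketch of the retrial-aware local log naming that update_local_logs introduces in this hunk; the helper name and the %Y%m%d%H%M%S timestamp format are assumptions made for the example, not part of the patch.

from datetime import datetime

def local_log_names(job_name, submit_timestamp, count=-1):
    """Build the (.out, .err) pair; retrials > 0 get an explicit _retrial_<n> suffix."""
    if count > 0:
        return (f"{job_name}.{submit_timestamp}.out_retrial_{count}",
                f"{job_name}.{submit_timestamp}.err_retrial_{count}")
    return (f"{job_name}.{submit_timestamp}.out",
            f"{job_name}.{submit_timestamp}.err")

if __name__ == "__main__":
    stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    print(local_log_names("a000_20000101_fc0_1_SIM", stamp))           # first attempt
    print(local_log_names("a000_20000101_fc0_1_SIM", stamp, count=2))  # second retrial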
- data_time = ["",int(datetime.datetime.strptime(self.submit_time_timestamp, "%Y%m%d%H%M%S").timestamp())] path = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS') if os.path.exists(path): @@ -2268,57 +2273,56 @@ class Job(object): platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), children=self.children_names_str) - def write_start_time(self, enable_vertical_write=False, from_stat_file=False, count=-1): + def update_start_time(self, count=-1): + start_time_ = self.check_start_time(count) # last known start time from the .cmd file + if start_time_: + self.start_time_timestamp = start_time_ + else: + Log.warning(f"Start time for job {self.name} not found in the .cmd file, using last known time.") + self.start_time_timestamp = self.start_time_timestamp if self.start_time_timestamp else time.time() + if count > 0 or self.wrapper_name in self.platform.processed_wrapper_logs: + self.submit_time_timestamp = date2str(datetime.datetime.fromtimestamp(self.start_time_timestamp),'S') + + def write_start_time(self, count=-1, vertical_wrapper=False): """ Writes start date and time to TOTAL_STATS file :return: True if successful, False otherwise :rtype: bool """ - - if not enable_vertical_write and self.wrapper_type == "vertical": - return - - self.start_time_written = True - if not from_stat_file: # last known start time from AS - self.start_time_placeholder = time.time() - elif from_stat_file: - start_time_ = self.check_start_time(count) # last known start time from the .cmd file - if start_time_: - start_time = start_time_ - else: - start_time = self.start_time_placeholder if self.start_time_placeholder else time.time() - path = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS') - f = open(path, 'a') - f.write(' ') - # noinspection PyTypeChecker - f.write(date2str(datetime.datetime.fromtimestamp(start_time), 'S')) - # Writing database - exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) - exp_history.write_start_time(self.name, start=start_time, status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, - wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, - platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), - children=self.children_names_str) + if not vertical_wrapper: + self.update_start_time(count) + path = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS') + f = open(path, 'a') + f.write(' ') + # noinspection PyTypeChecker + f.write(date2str(datetime.datetime.fromtimestamp(self.start_time_timestamp), 'S')) + # Writing database + exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) + exp_history.write_start_time(self.name, start=self.start_time_timestamp, status=Status.VALUE_TO_KEY.get(self.status, "UNKNOWN"), ncpus=self.processors, + wallclock=self.wallclock, qos=self.queue, date=self.date, member=self.member, section=self.section, chunk=self.chunk, + platform=self.platform_name, job_id=self.id, wrapper_queue=self._wrapper_queue, wrapper_code=get_job_package_code(self.expid, self.name), + children=self.children_names_str) return True def write_vertical_time(self, count=-1): - self.write_submit_time(enable_vertical_write=True) - 
self.write_start_time(enable_vertical_write=True, from_stat_file=True, count=count) - self.write_end_time(self.status == Status.COMPLETED, enable_vertical_write=True, count=count) + self.update_start_time(count=count) + self.update_local_logs(update_submit_time=False) + self.write_submit_time() + self.write_start_time(count=count, vertical_wrapper=True) + self.write_end_time(self.status == Status.COMPLETED, count=count) - def write_end_time(self, completed, enable_vertical_write=False, count = -1): + def write_end_time(self, completed, count=-1): """ - Writes ends date and time to TOTAL_STATS file - :param completed: True if job was completed successfully, False otherwise + Writes end timestamp to TOTAL_STATS file and jobs_data.db + :param completed: True if the job has been completed, False otherwise :type completed: bool + :param count: number of retrials + :type count: int """ - if not enable_vertical_write and self.wrapper_type == "vertical": - return end_time = self.check_end_time(count) path = os.path.join(self._tmp_path, self.name + '_TOTAL_STATS') f = open(path, 'a') f.write(' ') - finish_time = None - final_status = None if end_time > 0: # noinspection PyTypeChecker f.write(date2str(datetime.datetime.fromtimestamp(float(end_time)), 'S')) @@ -2328,7 +2332,7 @@ class Job(object): else: f.write(date2str(datetime.datetime.now(), 'S')) self.finish_time_timestamp = date2str(datetime.datetime.now(), 'S') - finish_time = time.time() # date2str(datetime.datetime.now(), 'S') + finish_time = time.time() f.write(' ') if completed: final_status = "COMPLETED" @@ -2337,7 +2341,6 @@ class Job(object): final_status = "FAILED" f.write('FAILED') out, err = self.local_logs - path_out = os.path.join(self._tmp_path, 'LOG_' + str(self.expid), out) # Launch first as simple non-threaded function exp_history = ExperimentHistory(self.expid, jobdata_dir_path=BasicConfig.JOBDATA_DIR, historiclog_dir_path=BasicConfig.HISTORICAL_LOG_DIR) job_data_dc = exp_history.write_finish_time(self.name, finish=finish_time, status=final_status, ncpus=self.processors, @@ -2351,6 +2354,7 @@ class Job(object): thread_write_finish.name = "JOB_data_{}".format(self.name) thread_write_finish.start() + def write_total_stat_by_retries(self, total_stats, first_retrial = False): """ Writes all data to TOTAL_STATS file @@ -2499,6 +2503,7 @@ class WrapperJob(Job): self.checked_time = datetime.datetime.now() self.hold = hold self.inner_jobs_running = list() + self.is_wrapper = True def _queuing_reason_cancel(self, reason): @@ -2535,6 +2540,7 @@ class WrapperJob(Job): self._check_running_jobs() # Check and update inner_jobs status that are eligible # Completed wrapper will always come from check function. elif self.status == Status.COMPLETED: + self._check_running_jobs() # Check and update inner_jobs status that are eligible self.check_inner_jobs_completed(self.job_list) # Fail can come from check function or running/completed checkers. 
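A hedged sketch of the append-only TOTAL_STATS record that write_end_time produces in the hunk above: a space-separated finish timestamp followed by the final status. The helper name and directory layout are chosen for the example only.

from datetime import datetime
from pathlib import Path

def append_end_record(tmp_path, job_name, completed, end_time=None):
    """Append ' <finish_timestamp> <FINAL_STATUS>' to <job>_TOTAL_STATS, as write_end_time does."""
    stamp_source = datetime.fromtimestamp(end_time) if end_time else datetime.now()
    stamp = stamp_source.strftime("%Y%m%d%H%M%S")
    status = "COMPLETED" if completed else "FAILED"
    with open(Path(tmp_path) / f"{job_name}_TOTAL_STATS", "a") as stats:
        stats.write(f" {stamp} {status}")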
@@ -2580,8 +2586,16 @@ class WrapperJob(Job): job.new_status = Status.COMPLETED job.updated_log = False job.update_status(self.as_config) + # for job in self.job_list: + # if job not in completed_jobs and job in self.inner_jobs_running: + # job.new_status = Status.FAILED + # job.packed = False + # else: + # job.new_status = Status.WAITING + # job.packed = False for job in completed_jobs: self.running_jobs_start.pop(job, None) + not_completed_jobs = list( set(not_completed_jobs) - set(completed_jobs)) @@ -2657,6 +2671,7 @@ class WrapperJob(Job): not_finished_jobs_names = ' '.join(list(not_finished_jobs_dict.keys())) remote_log_dir = self._platform.get_remote_log_dir() # PREPARE SCRIPT TO SEND + # When a inner_job is running? When the job has an _STAT file command = textwrap.dedent(""" cd {1} for job in {0} @@ -2679,9 +2694,13 @@ class WrapperJob(Job): os.chmod(log_dir, 0o770) open(multiple_checker_inner_jobs, 'w+').write(command) os.chmod(multiple_checker_inner_jobs, 0o770) - self._platform.send_file(multiple_checker_inner_jobs, False) - command = os.path.join( - self._platform.get_files_path(), "inner_jobs_checker.sh") + if self.platform.name != "local": # already "sent"... + self._platform.send_file(multiple_checker_inner_jobs, False) + command = f"cd {self._platform.get_files_path()}; {os.path.join( + self._platform.get_files_path(), 'inner_jobs_checker.sh')}" + else: + command = os.path.join( + self._platform.get_files_path(), "inner_jobs_checker.sh") # wait = 2 retries = 5 @@ -2722,9 +2741,9 @@ class WrapperJob(Job): if content == '': sleep(wait) retries = retries - 1 - temp_list = self.inner_jobs_running - self.inner_jobs_running = [ - job for job in temp_list if job.status == Status.RUNNING] + # temp_list = self.inner_jobs_running + # self.inner_jobs_running = [ + # job for job in temp_list if job.status == Status.RUNNING] if retries == 0 or over_wallclock: self.status = Status.FAILED @@ -2753,7 +2772,7 @@ class WrapperJob(Job): running_jobs += [job for job in self.job_list if job.status == Status.READY or job.status == Status.SUBMITTED or job.status == Status.QUEUING] self.inner_jobs_running = list() for job in running_jobs: - if job.platform.check_file_exists('{0}_FAILED'.format(job.name), wrapper_failed=True): + if job.platform.check_file_exists('{0}_FAILED'.format(job.name), wrapper_failed=True, max_retries=2): if job.platform.get_file('{0}_FAILED'.format(job.name), False, wrapper_failed=True): self._check_finished_job(job, True) else: @@ -2761,21 +2780,28 @@ class WrapperJob(Job): self.inner_jobs_running.append(job) def cancel_failed_wrapper_job(self): - Log.printlog("Cancelling job with id {0}".format(self.id), 6009) try: - self._platform.send_command( - self._platform.cancel_cmd + " " + str(self.id)) + if self.platform_name == "local": + # Check if the job is still running to avoid a misleading message in the logs + if self.platform.get_pscall(self.id): + self._platform.send_command( + self._platform.cancel_cmd + " " + str(self.id)) + else: + Log.warning(f"Wrapper {self.name} failed, cancelling it") + self._platform.send_command( + self._platform.cancel_cmd + " " + str(self.id)) except: - Log.info(f'Job with {self.id} was finished before canceling it') - + Log.debug(f'Job with {self.id} was finished before canceling it') + self._check_running_jobs() + for job in self.inner_jobs_running: + job.status = Status.FAILED for job in self.job_list: - #if job.status == Status.RUNNING: - #job.inc_fail_count() - # job.packed = False - # job.status = Status.FAILED + job.packed = False if 
job.status not in [Status.COMPLETED, Status.FAILED]: - job.packed = False job.status = Status.WAITING + else: + if job.wrapper_type == "vertical": # job is being retrieved internally by the wrapper + job.fail_count = job.retrials def _update_completed_jobs(self): diff --git a/autosubmit/job/job_list.py b/autosubmit/job/job_list.py index 6aad371df..98744ed09 100644 --- a/autosubmit/job/job_list.py +++ b/autosubmit/job/job_list.py @@ -2579,7 +2579,6 @@ class JobList(object): if hasattr(job, "x11") and job.x11: # X11 has it log writted in the run.out file. No need to check for log files as there are none job.updated_log = True return - Log.result(f"Sending job {job.stat_file} to the log recovery thread") log_recovered = self.check_if_log_is_recovered(job) if log_recovered: @@ -2600,8 +2599,8 @@ class JobList(object): if not hasattr(job, "updated_log") or not job.updated_log: for log_recovered in self.path_to_logs.glob(f"{job.name}.*.out"): - date = int(datetime.datetime.fromtimestamp(log_recovered.stat().st_mtime).strftime("%Y%m%d%H%M%S")) - if job.ready_date and int(datetime.datetime.fromtimestamp(log_recovered.stat().st_mtime).strftime("%Y%m%d%H%M%S")) > int(job.ready_date): + file_timestamp = int(datetime.datetime.fromtimestamp(log_recovered.stat().st_mtime).strftime("%Y%m%d%H%M%S")) + if job.ready_date and file_timestamp >= int(job.ready_date): return log_recovered return None diff --git a/autosubmit/job/job_packager.py b/autosubmit/job/job_packager.py index 8472501c4..2604f656f 100644 --- a/autosubmit/job/job_packager.py +++ b/autosubmit/job/job_packager.py @@ -491,14 +491,12 @@ class JobPackager(object): current_info.append(param[self.current_wrapper_section]) current_info.append(self._as_config) - if self.wrapper_type[self.current_wrapper_section] == 'vertical': - built_packages_tmp = self._build_vertical_packages(jobs, wrapper_limits,wrapper_info=current_info) - elif self.wrapper_type[self.current_wrapper_section] == 'horizontal': - built_packages_tmp = self._build_horizontal_packages(jobs, wrapper_limits, section,wrapper_info=current_info) + if self.wrapper_type[self.current_wrapper_section] == 'horizontal': + built_packages_tmp = self._build_horizontal_packages(jobs, wrapper_limits, section, wrapper_info=current_info) elif self.wrapper_type[self.current_wrapper_section] in ['vertical-horizontal', 'horizontal-vertical']: - built_packages_tmp.append(self._build_hybrid_package(jobs, wrapper_limits, section,wrapper_info=current_info)) + built_packages_tmp.append(self._build_hybrid_package(jobs, wrapper_limits, section, wrapper_info=current_info)) else: - built_packages_tmp = self._build_vertical_packages(jobs, wrapper_limits) + built_packages_tmp = self._build_vertical_packages(jobs, wrapper_limits, wrapper_info=current_info) Log.result(f"Built {len(built_packages_tmp)} wrappers for {wrapper_name}") packages_to_submit,max_jobs_to_submit = self.check_packages_respect_wrapper_policy(built_packages_tmp,packages_to_submit,max_jobs_to_submit,wrapper_limits,any_simple_packages) @@ -738,7 +736,7 @@ class JobPackagerVertical(object): if child is not None and len(str(child)) > 0: child.update_parameters(wrapper_info[-1], {}) self.total_wallclock = sum_str_hours(self.total_wallclock, child.wallclock) - if self.total_wallclock <= self.max_wallclock: + if self.total_wallclock <= self.max_wallclock or not self.max_wallclock: child.packed = True child.level = level self.jobs_list.append(child) @@ -850,22 +848,14 @@ class JobPackagerVerticalMixed(JobPackagerVertical): :rtype: Job Object """ 
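The check_if_log_is_recovered change earlier in this hunk compares each recovered .out file's mtime (rendered as YYYYMMDDHHMMSS) against the job's ready_date. Below is a self-contained sketch of that check; the function name and Path-based layout are assumptions for the example.

import datetime
from pathlib import Path
from typing import Optional

def find_recovered_log(log_dir: Path, job_name: str, ready_date: Optional[str]) -> Optional[Path]:
    """Return a <job>.<timestamp>.out file whose mtime is not older than the job's ready date."""
    for candidate in log_dir.glob(f"{job_name}.*.out"):
        mtime = candidate.stat().st_mtime
        file_timestamp = int(datetime.datetime.fromtimestamp(mtime).strftime("%Y%m%d%H%M%S"))
        if ready_date and file_timestamp >= int(ready_date):
            return candidate
    return None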
sorted_jobs = self.sorted_jobs - + child = None for index in range(self.index, len(sorted_jobs)): - child = sorted_jobs[index] - if self._is_wrappable(child): + child_ = sorted_jobs[index] + if child_.name != job.name and self._is_wrappable(child_): + child = child_ self.index = index + 1 - return child - continue - return None - # Not passing tests but better wrappers result to check - # for child in job.children: - # if child.name != job.name: - # if self._is_wrappable(child): - # self.index = self.index + 1 - # return child - # continue - # return None + break + return child def _is_wrappable(self, job): """ diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index f99654fe8..17bffecc5 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -249,6 +249,7 @@ class JobPackageSimple(JobPackageBase): if len(job_scripts) == 0: job_scripts = self._job_scripts for job in self.jobs: + job.update_local_logs() #CLEANS PREVIOUS RUN ON LOCAL log_completed = os.path.join(self._tmp_path, job.name + '_COMPLETED') log_stat = os.path.join(self._tmp_path, job.name + '_STAT') @@ -262,7 +263,9 @@ class JobPackageSimple(JobPackageBase): if job.id is None or not job.id: continue Log.info("{0} submitted", job.name) - job.status = Status.SUBMITTED + job.status = Status.SUBMITTED + job.wrapper_name = job.name + class JobPackageSimpleWrapped(JobPackageSimple): @@ -342,6 +345,7 @@ class JobPackageArray(JobPackageBase): def _do_submission(self, job_scripts=None, hold=False): for job in self.jobs: + job.update_local_logs() self.platform.remove_stat_file(job) self.platform.remove_completed_file(job.name) @@ -353,9 +357,8 @@ class JobPackageArray(JobPackageBase): for i in range(0, len(self.jobs)): # platforms without a submit.cmd Log.info("{0} submitted", self.jobs[i].name) self.jobs[i].id = str(package_id) + '[{0}]'.format(i) - self.jobs[i].status = Status.SUBMITTED - self.jobs[i].write_submit_time(hold=hold,wrapper_submit_time=wrapper_time) - wrapper_time = self.jobs[i].write_submit_time + self.jobs[i].status = Status.SUBMITTED + self.jobs[i].wrapper_name = self.name class JobPackageThread(JobPackageBase): @@ -572,12 +575,14 @@ class JobPackageThread(JobPackageBase): if callable(getattr(self.platform, 'remove_multiple_files')): filenames = str() for job in self.jobs: + job.update_local_logs() filenames += " " + self.platform.remote_log_dir + "/" + job.name + "_STAT " + \ self.platform.remote_log_dir + "/" + job.name + "_COMPLETED" self.platform.remove_multiple_files(filenames) else: for job in self.jobs: + job.update_local_logs() self.platform.remove_stat_file(job) self.platform.remove_completed_file(job.name) if hold: @@ -588,13 +593,12 @@ class JobPackageThread(JobPackageBase): if package_id is None or not package_id: return - wrapper_time = None for i in range(0, len(self.jobs)): Log.info("{0} submitted", self.jobs[i].name) self.jobs[i].id = str(package_id) self.jobs[i].status = Status.SUBMITTED - self.jobs[i].write_submit_time(hold=hold,wrapper_submit_time=wrapper_time) - wrapper_time = self.jobs[i].write_submit_time + self.jobs[i].wrapper_name = self.name + def _common_script_content(self): pass @@ -658,6 +662,7 @@ class JobPackageThreadWrapped(JobPackageThread): def _do_submission(self, job_scripts=None, hold=False): for job in self.jobs: + job.update_local_logs() self.platform.remove_stat_file(job) self.platform.remove_completed_file(job.name) if hold: @@ -668,13 +673,13 @@ class JobPackageThreadWrapped(JobPackageThread): if package_id is None or not 
package_id: raise Exception('Submission failed') - wrapper_time = None for i in range(0, len(self.jobs)): Log.info("{0} submitted", self.jobs[i].name) self.jobs[i].id = str(package_id) self.jobs[i].status = Status.SUBMITTED - self.jobs[i].write_submit_time(hold=hold,wrapper_submit_time=wrapper_time) - wrapper_time = self.jobs[i].write_submit_time + self.jobs[i].wrapper_name = self.name + + class JobPackageVertical(JobPackageThread): """ @@ -752,7 +757,7 @@ class JobPackageVertical(JobPackageThread): self._wallclock = "{0}:{1}".format(hh_str,mm_str) Log.info("Submitting {2} with wallclock {0}:{1}".format(hh_str,mm_str,self._name)) else: - wallclock_by_level = None + wallclock_by_level = 0 # command: "timeout 0 sleep 2" == command: "sleep 2" return self._wrapper_factory.get_wrapper(self._wrapper_factory.vertical_wrapper, name=self._name, queue=self._queue, project=self._project, wallclock=self._wallclock, diff --git a/autosubmit/platforms/locplatform.py b/autosubmit/platforms/locplatform.py index f6356ab26..61f96832a 100644 --- a/autosubmit/platforms/locplatform.py +++ b/autosubmit/platforms/locplatform.py @@ -18,12 +18,15 @@ # along with Autosubmit. If not, see . import locale import os +from pathlib import Path from xml.dom.minidom import parseString import subprocess +from matplotlib.patches import PathPatch from autosubmit.platforms.paramiko_platform import ParamikoPlatform from autosubmit.platforms.headers.local_header import LocalHeader +from autosubmit.platforms.wrappers.wrapper_factory import LocalWrapperFactory from autosubmitconfigparser.config.basicconfig import BasicConfig from time import sleep @@ -64,6 +67,9 @@ class LocalPlatform(ParamikoPlatform): self.job_status['RUNNING'] = ['0'] self.job_status['QUEUING'] = [] self.job_status['FAILED'] = [] + self._allow_wrappers = True + self._wrapper = LocalWrapperFactory(self) + self.update_cmds() def update_cmds(self): @@ -100,20 +106,24 @@ class LocalPlatform(ParamikoPlatform): return [int(element.firstChild.nodeValue) for element in jobs_xml] def get_submit_cmd(self, job_script, job, hold=False, export=""): - wallclock = self.parse_time(job.wallclock) - seconds = int(wallclock.days * 86400 + wallclock.seconds * 60) + if job: + wallclock = self.parse_time(job.wallclock) + seconds = int(wallclock.days * 86400 + wallclock.seconds * 60) + else: + seconds = 24 * 3600 if export == "none" or export == "None" or export is None or export == "": export = "" else: export += " ; " - return self.get_call(job_script, job, export=export,timeout=seconds) - + command = self.get_call(job_script, job, export=export,timeout=seconds) + return f"cd {self.remote_log_dir} ; {command}" def get_checkjob_cmd(self, job_id): return self.get_pscall(job_id) def connect(self, as_conf={}, reconnect=False): self.connected = True self.spawn_log_retrieval_process(as_conf) + def test_connection(self,as_conf): if not self.connected: self.connect(as_conf) @@ -144,10 +154,7 @@ class LocalPlatform(ParamikoPlatform): return True def send_file(self, filename, check=True): - self.check_remote_log_dir() - self.delete_file(filename,del_cmd=True) - command = '{0} {1} {2}'.format(self.put_cmd, os.path.join(self.tmp_path, filename), - os.path.join(self.tmp_path, 'LOG_' + self.expid, filename)) + command = f'{self.put_cmd} {os.path.join(self.tmp_path, Path(filename).name)} {os.path.join(self.tmp_path, "LOG_" + self.expid, Path(filename).name)}' try: subprocess.check_call(command, shell=True) except subprocess.CalledProcessError: @@ -157,6 +164,17 @@ class 
LocalPlatform(ParamikoPlatform): raise return True + def remove_multiple_files(self, filenames): + log_dir = os.path.join(self.tmp_path, 'LOG_{0}'.format(self.expid)) + multiple_delete_previous_run = os.path.join( + log_dir, "multiple_delete_previous_run.sh") + if os.path.exists(log_dir): + lang = locale.getlocale()[1] + if lang is None: + lang = 'UTF-8' + open(multiple_delete_previous_run, 'wb+').write(("rm -f" + filenames).encode(lang)) + os.chmod(multiple_delete_previous_run, 0o770) + return "" def get_file(self, filename, must_exist=True, relative_path='',ignore_log = False,wrapper_failed=False): local_path = os.path.join(self.tmp_path, relative_path) @@ -180,26 +198,28 @@ class LocalPlatform(ParamikoPlatform): return True # Moves .err .out - def check_file_exists(self, src, wrapper_failed=False, sleeptime=None, max_retries=None, first=None): + def check_file_exists(self, src, wrapper_failed=False, sleeptime=1, max_retries=1): """ Checks if a file exists in the platform :param src: source name :type src: str - :param wrapper_failed: if the wrapper failed + :param wrapper_failed: checks inner jobs files :type wrapper_failed: bool :param sleeptime: time to sleep :type sleeptime: int :param max_retries: maximum number of retries :type max_retries: int - :param first: if it is the first time - :type first: bool :return: True if the file exists, False otherwise :rtype: bool """ - file_exist = os.path.isfile(os.path.join(self.get_files_path(), src)) - if not file_exist: - Log.debug(f"File {self.get_files_path()}/{src} does not exist") - return file_exist + sleeptime = 1 + for i in range(max_retries): + if os.path.isfile(os.path.join(self.get_files_path(), src)): + return True + sleep(sleeptime) + Log.warning("File {0} does not exist".format(src)) + return False + def delete_file(self, filename,del_cmd = False): if del_cmd: @@ -260,3 +280,18 @@ class LocalPlatform(ParamikoPlatform): :type remote_logs: (str, str) """ return + + def check_completed_files(self, sections=None): + command = "find %s " % self.remote_log_dir + if sections: + for i, section in enumerate(sections.split()): + command += " -name *%s_COMPLETED" % section + if i < len(sections.split()) - 1: + command += " -o " + else: + command += " -name *_COMPLETED" + + if self.send_command(command, True): + return self._ssh_output + else: + return None \ No newline at end of file diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index a3be9202f..8b19730a8 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -1,5 +1,6 @@ import locale from contextlib import suppress +from pathlib import Path from time import sleep import sys import socket @@ -20,6 +21,7 @@ from threading import Thread import threading import getpass from paramiko.agent import Agent +import time def threaded(fn): def wrapper(*args, **kwargs): @@ -526,7 +528,8 @@ class ParamikoPlatform(Platform): if cmd is None: return None if self.send_command(cmd,x11=x11): - job_id = self.get_submitted_job_id(self.get_ssh_output(),x11=job.x11) + x11 = False if job is None else job.x11 + job_id = self.get_submitted_job_id(self.get_ssh_output(),x11=x11) Log.debug("Job ID: {0}", job_id) return int(job_id) else: @@ -618,7 +621,19 @@ class ParamikoPlatform(Platform): self.get_ssh_output()).strip("\n") # URi: define status list in HPC Queue Class if job_status in self.job_status['COMPLETED'] or retries == 0: - job_status = Status.COMPLETED + # The Local platform has only 0 or 1, so it 
neccesary to look for the completed file. + # Not sure why it is called over_wallclock but is the only way to return a value + if self.type == "local": # wrapper has a different check completion + if not job.is_wrapper: + job_status = job.check_completion(over_wallclock=True) + else: + if Path(f"{self.remote_log_dir}/WRAPPER_FAILED").exists(): + job_status = Status.FAILED + else: + job_status = Status.COMPLETED + else: + job_status = Status.COMPLETED + elif job_status in self.job_status['RUNNING']: job_status = Status.RUNNING if not is_wrapper: @@ -650,8 +665,14 @@ class ParamikoPlatform(Platform): Log.error( 'check_job() The job id ({0}) status is {1}.', job_id, job_status) - if job_status in [Status.FAILED, Status.COMPLETED]: + if job_status in [Status.FAILED, Status.COMPLETED, Status.UNKNOWN]: job.updated_log = False + # backup for end time in case that the stat file is not found + job.end_time_placeholder = int(time.time()) + if job_status in [Status.RUNNING, Status.COMPLETED] and job.new_status in [Status.QUEUING, Status.SUBMITTED]: + # backup for start time in case that the stat file is not found + job.start_time_timestamp = int(time.time()) + if submit_hold_check: return job_status else: @@ -1216,18 +1237,23 @@ class ParamikoPlatform(Platform): :return: command to execute script :rtype: str """ - executable = '' - if job.type == Type.BASH: - executable = 'bash' - elif job.type == Type.PYTHON: - executable = 'python3' - elif job.type == Type.PYTHON2: - executable = 'python2' - elif job.type == Type.R: - executable = 'Rscript' - if job.executable != '': - executable = '' # Alternative: use job.executable with substituted placeholders - remote_logs = (job.script_name + ".out."+str(job.fail_count), job.script_name + ".err."+str(job.fail_count)) + if job: # If job is None, it is a wrapper + executable = '' + if job.type == Type.BASH: + executable = 'bash' + elif job.type == Type.PYTHON: + executable = 'python3' + elif job.type == Type.PYTHON2: + executable = 'python2' + elif job.type == Type.R: + executable = 'Rscript' + if job.executable != '': + executable = '' # Alternative: use job.executable with substituted placeholders + remote_logs = (job.script_name + ".out."+str(job.fail_count), job.script_name + ".err."+str(job.fail_count)) + else: + executable = 'python3' # wrappers are always python3 + remote_logs = (f"{job_script}.out", f"{job_script}.err") + if timeout < 1: command = export + ' nohup ' + executable + ' {0} > {1} 2> {2} & echo $!'.format( os.path.join(self.remote_log_dir, job_script), diff --git a/autosubmit/platforms/pjmplatform.py b/autosubmit/platforms/pjmplatform.py index 4f198d819..7291377b7 100644 --- a/autosubmit/platforms/pjmplatform.py +++ b/autosubmit/platforms/pjmplatform.py @@ -151,7 +151,8 @@ class PJMPlatform(ParamikoPlatform): job.hold = hold job.id = str(jobs_id[i]) job.status = Status.SUBMITTED - job.write_submit_time(hold=hold) + job.wrapper_name = package.name + i += 1 save = True except AutosubmitError as e: @@ -483,13 +484,10 @@ class PJMPlatform(ParamikoPlatform): def allocated_nodes(): return """os.system("scontrol show hostnames $SLURM_JOB_NODELIST > node_list_{0}".format(node_id))""" - def check_file_exists(self, filename, wrapper_failed=False, sleeptime=5, max_retries=3, first=True): + def check_file_exists(self, filename, wrapper_failed=False, sleeptime=5, max_retries=3): file_exist = False retries = 0 - # Not first is meant for vertical_wrappers. 
There you have to download STAT_{MAX_LOGS} then STAT_{MAX_LOGS-1} and so on - if not first: - max_retries = 1 - sleeptime = 0 + while not file_exist and retries < max_retries: try: # This return IOError if path doesn't exist @@ -497,9 +495,6 @@ class PJMPlatform(ParamikoPlatform): self.get_files_path(), filename)) file_exist = True except IOError as e: # File doesn't exist, retry in sleeptime - if first: - Log.debug("{2} File does not exist.. waiting {0}s for a new retry (retries left: {1})", sleeptime, - max_retries - retries, os.path.join(self.get_files_path(), filename)) if not wrapper_failed: sleep(sleeptime) sleeptime = sleeptime + 5 diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index 9bca11920..d504d5d08 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -25,10 +25,13 @@ class UniqueQueue( super().__init__(maxsize, ctx=multiprocessing.get_context()) def put(self, job, block=True, timeout=None): - name_with_retrial = job.name+str(job.fail_count) - if name_with_retrial not in self.all_items: - self.all_items.add(name_with_retrial) - Queue.put(self, job, block, timeout) + if job.wrapper_type == "vertical": + unique_name = job.name + else: + unique_name = job.name+str(job.fail_count) + if unique_name not in self.all_items: + self.all_items.add(unique_name) + super().put(job, block, timeout) class Platform(object): @@ -109,7 +112,8 @@ class Platform(object): self.work_event = Event() self.cleanup_event = Event() self.log_recovery_process = None - + self.keep_alive_timeout = 60 * 5 # Useful in case of kill -9 + self.processed_wrapper_logs = set() @classmethod def update_workers(cls, event_worker): cls.worker_events.append(event_worker) @@ -352,7 +356,7 @@ class Platform(object): raise except AutosubmitCritical as e: - raise AutosubmitCritical(e.message, e.code, e.trace) + raise except AutosubmitError as e: raise except Exception as e: @@ -663,7 +667,7 @@ class Platform(object): if self.check_file_exists(filename): self.delete_file(filename) - def check_file_exists(self, src, wrapper_failed=False, sleeptime=5, max_retries=3, first=True): + def check_file_exists(self, src, wrapper_failed=False, sleeptime=5, max_retries=3): return True def get_stat_file(self, job, count=-1): @@ -680,7 +684,7 @@ class Platform(object): if self.get_file(filename, True): Log.debug('{0}_STAT file have been transferred', job.name) return True - Log.debug('{0}_STAT file not found', job.name) + Log.warning('{0}_STAT file not found', job.name) return False @autosubmit_parameter(name='current_logdir') @@ -848,68 +852,63 @@ class Platform(object): self.cleanup_event.set() self.log_recovery_process.join() - def wait_for_work(self, timeout=60): + def wait_for_work(self, sleep_time=60): """ - This function, waits for the work_event to be set by the main process. If it is not set, it returns False and the log recovery process ends. + This function waits for the work_event to be set or the cleanup_event to be set. """ - keep_working = False - # The whole point of this if is to make the regression tests faster to run. 
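UniqueQueue.put above de-duplicates queued jobs by name (vertical wrappers) or by name plus fail_count (all other jobs). A simplified, single-process sketch of that idea using the standard-library queue; the class and method names are illustrative, and the real implementation subclasses a multiprocessing queue.

import queue

class UniqueQueueSketch(queue.Queue):
    """Queue that silently drops items whose key has already been enqueued."""
    def __init__(self):
        super().__init__()
        self.seen_keys = set()

    def put_unique(self, key, item, block=True, timeout=None):
        if key not in self.seen_keys:
            self.seen_keys.add(key)
            super().put(item, block, timeout)

if __name__ == "__main__":
    q = UniqueQueueSketch()
    q.put_unique("job_a0", {"name": "job_a", "fail_count": 0})
    q.put_unique("job_a0", {"name": "job_a", "fail_count": 0})  # duplicate key, dropped
    print(q.qsize())  # 1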
- if timeout >= 60: - unstoppable_timeout = 60 - timeout = timeout - 60 - else: - unstoppable_timeout = timeout - timeout = 0 - for remaining in range(unstoppable_timeout, 0, -1): + process_log = False + for remaining in range(sleep_time, 0, -1): # Min time to wait unless clean-up signal is set time.sleep(1) if self.work_event.is_set() or not self.recovery_queue.empty(): - keep_working = True - if not self.recovery_queue.empty() or self.cleanup_event.is_set(): + process_log = True + if self.cleanup_event.is_set(): + process_log = True break - if not keep_working: - while timeout > 0 and self.recovery_queue.empty() and not self.cleanup_event.is_set() and not self.work_event.is_set(): + if not process_log: # If no work, wait until the keep_alive_timeout is reached or any signal is set to end the process. + timeout = self.keep_alive_timeout - sleep_time + while timeout > 0 or not self.recovery_queue.empty() or self.cleanup_event.is_set() or self.work_event.is_set(): time.sleep(1) timeout -= 1 if not self.recovery_queue.empty() or self.cleanup_event.is_set() or self.work_event.is_set(): - keep_working = True + process_log = True self.work_event.clear() - return keep_working + return process_log def recover_job_log(self, identifier, jobs_pending_to_process): job = None - # try: - while not self.recovery_queue.empty(): - try: - job = self.recovery_queue.get( - timeout=1) # Should be non-empty, but added a timeout for other possible errors. - job.children = set() # Children can't be serialized, so we set it to an empty set for this process. - job.platform = self # change the original platform to this process platform. - job._log_recovery_retries = 0 # reset the log recovery retries. - Log.debug(f"{identifier} Recovering log files for job {job.name}") + try: + while not self.recovery_queue.empty(): + try: + job = self.recovery_queue.get( + timeout=1) # Should be non-empty, but added a timeout for other possible errors. + job.children = set() # Children can't be serialized, so we set it to an empty set for this process. + job.platform = self # change the original platform to this process platform. + job._log_recovery_retries = 0 # reset the log recovery retries. + Log.debug(f"{identifier} Recovering log files for job {job.name} and retrial:{job.fail_count}") + job.retrieve_logfiles(self, raise_error=True) + if job.status == Status.FAILED: + Log.result(f"{identifier} Sucessfully recovered log files for job {job.name} and retrial:{job.fail_count}") + except queue.Empty: + pass + # This second while is to keep retring the failed jobs. + while len(jobs_pending_to_process) > 0: # jobs that had any issue during the log retrieval + job = jobs_pending_to_process.pop() + job._log_recovery_retries += 1 + Log.debug( + f"{identifier} (Retrial number: {job._log_recovery_retries}) Recovering log files for job {job.name}") job.retrieve_logfiles(self, raise_error=True) - if job.status == Status.FAILED: - Log.result(f"{identifier} Sucessfully recovered log files for job {job.name} and retrial:{job.fail_count}") - except queue.Empty: + Log.result(f"{identifier} (Retrial) Successfully recovered log files for job {job.name}") + except Exception as e: + Log.info(f"{identifier} Error while recovering logs: {str(e)}") + try: + if job and job._log_recovery_retries < 5: # If log retrieval failed, add it to the pending jobs to process. Avoids to keep trying the same job forever. 
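A compact sketch of the intended wait_for_work behaviour introduced above: poll once per second during a minimum window, then keep waiting idle until a keep-alive budget runs out, returning True whenever the work event, the cleanup event, or a non-empty recovery queue signals work. Threading and queue stand-ins are used so the sketch runs in one process; the patch itself uses multiprocessing primitives and a configurable keep-alive timeout.

import queue
import threading
import time

def wait_for_work(work_event, cleanup_event, recovery_queue, sleep_time=5, keep_alive=15):
    """Return True when there is work to process, False once the keep-alive window expires idle."""
    process_log = False
    for _ in range(sleep_time):  # minimum wait, re-checked once per second
        time.sleep(1)
        if work_event.is_set() or not recovery_queue.empty():
            process_log = True
        if cleanup_event.is_set():
            process_log = True
            break
    if not process_log:  # idle: keep polling until the keep-alive budget runs out
        remaining = keep_alive - sleep_time
        while remaining > 0 and recovery_queue.empty() and not cleanup_event.is_set() and not work_event.is_set():
            time.sleep(1)
            remaining -= 1
        if not recovery_queue.empty() or cleanup_event.is_set() or work_event.is_set():
            process_log = True
    work_event.clear()
    return process_log

if __name__ == "__main__":
    work, cleanup, q = threading.Event(), threading.Event(), queue.Queue()
    threading.Timer(2, work.set).start()    # simulate the main process signalling new work
    print(wait_for_work(work, cleanup, q))  # True once the event fires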
+ jobs_pending_to_process.add(job) + self.connected = False + Log.info(f"{identifier} Attempting to restore connection") + self.restore_connection(None) # Always restore the connection on a failure. + Log.result(f"{identifier} Sucessfully reconnected.") + except: pass - # This second while is to keep retring the failed jobs. - while len(jobs_pending_to_process) > 0: # jobs that had any issue during the log retrieval - job = jobs_pending_to_process.pop() - job._log_recovery_retries += 1 - Log.debug( - f"{identifier} (Retrial number: {job._log_recovery_retries}) Recovering log files for job {job.name}") - job.retrieve_logfiles(self, raise_error=True) - Log.result(f"{identifier} (Retrial) Successfully recovered log files for job {job.name}") - # except Exception as e: - # Log.info(f"{identifier} Error while recovering logs: {str(e)}") - # try: - # if job and job._log_recovery_retries < 5: # If log retrieval failed, add it to the pending jobs to process. Avoids to keep trying the same job forever. - # jobs_pending_to_process.add(job) - # self.connected = False - # Log.info(f"{identifier} Attempting to restore connection") - # self.restore_connection(None) # Always restore the connection on a failure. - # Log.result(f"{identifier} Sucessfully reconnected.") - # except: - # pass return jobs_pending_to_process def recover_platform_job_logs(self): @@ -925,11 +924,11 @@ class Platform(object): self.restore_connection(None) Log.get_logger("Autosubmit") # Log needs to be initialised in the new process Log.result(f"{identifier} Sucessfully connected.") - default_timeout = 60 - timeout = max(int(self.config.get("LOG_RECOVERY_TIMEOUT", default_timeout)), default_timeout) - while self.wait_for_work(timeout=timeout): + log_recovery_timeout = self.config.get("LOG_RECOVERY_TIMEOUT", 60) + self.keep_alive_timeout = max(log_recovery_timeout*5, 60*5) + while self.wait_for_work(sleep_time=max(log_recovery_timeout, 60)): jobs_pending_to_process = self.recover_job_log(identifier, jobs_pending_to_process) - if self.cleanup_event.is_set(): # Check if main process is waiting for this child to end. + if self.cleanup_event.is_set(): # Check if main process is waiting for this child to end. self.recover_job_log(identifier, jobs_pending_to_process) break Log.info(f"{identifier} Exiting.") \ No newline at end of file diff --git a/autosubmit/platforms/slurmplatform.py b/autosubmit/platforms/slurmplatform.py index c48b98e9c..1884a6739 100644 --- a/autosubmit/platforms/slurmplatform.py +++ b/autosubmit/platforms/slurmplatform.py @@ -189,6 +189,8 @@ class SlurmPlatform(ParamikoPlatform): job.hold = hold job.id = str(jobs_id[i]) job.status = Status.SUBMITTED + job.wrapper_name = package.name + # Check if there are duplicated jobnames if not duplicated_jobs_already_checked: job_name = package.name if hasattr(package, "name") else package.jobs[0].name @@ -651,37 +653,30 @@ class SlurmPlatform(ParamikoPlatform): def allocated_nodes(): return """os.system("scontrol show hostnames $SLURM_JOB_NODELIST > node_list_{0}".format(node_id))""" - def check_file_exists(self, filename, wrapper_failed=False, sleeptime=5, max_retries=3, first=True): + def check_file_exists(self, filename, wrapper_failed=False, sleeptime=5, max_retries=3): file_exist = False retries = 0 - # Not first is meant for vertical_wrappers. 
There you have to download STAT_{MAX_LOGS} then STAT_{MAX_LOGS-1} and so on - if not first: - max_retries = 1 - sleeptime = 0 while not file_exist and retries < max_retries: try: - # This return IOError if path doesn't exist + # This return IOError if a path doesn't exist self._ftpChannel.stat(os.path.join( self.get_files_path(), filename)) file_exist = True except IOError as e: # File doesn't exist, retry in sleeptime - if first: - Log.debug("{2} File does not exist.. waiting {0}s for a new retry (retries left: {1})", sleeptime, - max_retries - retries, os.path.join(self.get_files_path(), filename)) if not wrapper_failed: sleep(sleeptime) - sleeptime = sleeptime + 5 retries = retries + 1 else: - retries = 9999 + sleep(2) + retries = retries + 1 except BaseException as e: # Unrecoverable error if str(e).lower().find("garbage") != -1: - if not wrapper_failed: - sleep(sleeptime) - sleeptime = sleeptime + 5 - retries = retries + 1 + sleep(sleeptime) + sleeptime = sleeptime + 5 + retries = retries + 1 else: - Log.printlog("remote logs {0} couldn't be recovered".format(filename), 6001) file_exist = False # won't exist retries = 999 # no more retries + if not file_exist: + Log.warning("File {0} couldn't be found".format(filename)) return file_exist diff --git a/autosubmit/platforms/wrappers/wrapper_builder.py b/autosubmit/platforms/wrappers/wrapper_builder.py index 7ff5552db..0135e2976 100644 --- a/autosubmit/platforms/wrappers/wrapper_builder.py +++ b/autosubmit/platforms/wrappers/wrapper_builder.py @@ -60,8 +60,10 @@ class WrapperBuilder(object): self.exit_thread = '' if "wallclock_by_level" in list(kwargs.keys()): self.wallclock_by_level = kwargs['wallclock_by_level'] + def build_header(self): return textwrap.dedent(self.header_directive) + self.build_imports() + def build_imports(self): pass def build_job_thread(self): @@ -153,7 +155,6 @@ class PythonWrapperBuilder(WrapperBuilder): def run(self): jobname = self.template.replace('.cmd', '') - #os.system("echo $(date +%s) > "+jobname+"_STAT") out = str(self.template) + ".out." + str(0) err = str(self.template) + ".err." 
+ str(0) print(out+"\\n") @@ -446,8 +447,11 @@ class PythonVerticalWrapperBuilder(PythonWrapperBuilder): sequential_threads_launcher = textwrap.dedent(""" failed_wrapper = os.path.join(os.getcwd(),wrapper_id) retrials = {2} - total_steps = 0 - print("JOB.ID:"+ os.getenv('SLURM_JOBID')) + total_steps = 0 + try: + print("JOB.ID:"+ os.getenv('SLURM_JOBID')) + except: + print("JOB.ID") for i in range(len({0})): job_retrials = retrials completed = False @@ -455,8 +459,7 @@ class PythonVerticalWrapperBuilder(PythonWrapperBuilder): while fail_count <= job_retrials and not completed: current = {1} current.start() - timer = int(time.time()) - os.system("echo "+str(timer)+" >> "+scripts[i][:-4]+"_STAT_"+str(fail_count)) #Completed + start = int(time.time()) current.join({3}) total_steps = total_steps + 1 """).format(jobs_list, thread,self.retrials,str(self.wallclock_by_level),'\n'.ljust(13)) @@ -468,26 +471,42 @@ class PythonVerticalWrapperBuilder(PythonWrapperBuilder): failed_filename = {0}[i].replace('.cmd', '_FAILED') failed_path = os.path.join(os.getcwd(), failed_filename) failed_wrapper = os.path.join(os.getcwd(), wrapper_id) - timer = int(time.time()) - os.system("echo "+str(timer)+" >> "+scripts[i][:-4]+"_STAT_"+str(fail_count)) #Completed + finish = int(time.time()) + stat_filename = {0}[i].replace(".cmd", f"_STAT_{{fail_count}}") + stat_path_tmp = os.path.join(os.getcwd(),f"{{stat_filename}}.tmp") + print(f"Completed_file:{{completed_path}}") + print(f"Writting:{{stat_path_tmp}}") + print(f"[Start:{{start}}, Finish:{{finish}}, Fail_count:{{fail_count}}]") + with open(f"{{stat_path_tmp}}", "w") as file: + file.write(f"{{start}}\\n") + file.write(f"{{finish}}\\n") if os.path.exists(completed_path): completed = True print(datetime.now(), "The job ", current.template," has been COMPLETED") - os.system("echo COMPLETED >> " + scripts[i][:-4]+"_STAT_"+str(fail_count)) else: print(datetime.now(), "The job ", current.template," has FAILED") - os.system("echo FAILED >> " + scripts[i][:-4]+"_STAT_"+str(fail_count)) #{1} fail_count = fail_count + 1 """).format(jobs_list, self.exit_thread, '\n'.ljust(13)), 8) sequential_threads_launcher += self._indent(textwrap.dedent(""" + from pathlib import Path + fail_count = 0 + while fail_count <= job_retrials: + try: + stat_filename = {0}[i].replace(".cmd", f"_STAT_{{fail_count}}") + stat_path_tmp = os.path.join(os.getcwd(),f"{{stat_filename}}.tmp") + Path(stat_path_tmp).replace(stat_path_tmp.replace(".tmp","")) + except: + print(f"Couldn't write the stat file:{{stat_path_tmp}}") + fail_count = fail_count + 1 if not os.path.exists(completed_path): open(failed_wrapper,'wb').close() open(failed_path, 'wb').close() if os.path.exists(failed_wrapper): os.remove(os.path.join(os.getcwd(),wrapper_id)) + print("WRAPPER_FAILED") wrapper_failed = os.path.join(os.getcwd(),"WRAPPER_FAILED") open(wrapper_failed, 'wb').close() os._exit(1) @@ -505,15 +524,17 @@ class PythonVerticalWrapperBuilder(PythonWrapperBuilder): self.fail_count = fail_count def run(self): + print("\\n") jobname = self.template.replace('.cmd', '') out = str(self.template) + ".out." + str(self.fail_count) err = str(self.template) + ".err." 
+ str(self.fail_count) - print((out+"\\n")) - command = "./" + str(self.template) + " " + str(self.id_run) + " " + os.getcwd() - print((command+"\\n")) - (self.status) = getstatusoutput("timeout {0} " + command + " > " + out + " 2> " + err) - for i in self.status: - print((str(i)+"\\n")) + out_path = os.path.join(os.getcwd(), out) + err_path = os.path.join(os.getcwd(), err) + template_path = os.path.join(os.getcwd(), self.template) + command = f"timeout {0} {{template_path}} > {{out_path}} 2> {{err_path}}" + print(command) + getstatusoutput(command) + """).format(str(self.wallclock_by_level),'\n'.ljust(13)) diff --git a/autosubmit/platforms/wrappers/wrapper_factory.py b/autosubmit/platforms/wrappers/wrapper_factory.py index 1f47996a9..1bd97c831 100644 --- a/autosubmit/platforms/wrappers/wrapper_factory.py +++ b/autosubmit/platforms/wrappers/wrapper_factory.py @@ -140,6 +140,58 @@ class WrapperFactory(object): def threads_directive(self, threads): raise NotImplemented(self.exception) + +class LocalWrapperFactory(WrapperFactory): + + def vertical_wrapper(self, **kwargs): + return PythonVerticalWrapperBuilder(**kwargs) + + def horizontal_wrapper(self, **kwargs): + + if kwargs["method"] == 'srun': + return SrunHorizontalWrapperBuilder(**kwargs) + else: + return PythonHorizontalWrapperBuilder(**kwargs) + + def hybrid_wrapper_horizontal_vertical(self, **kwargs): + return PythonHorizontalVerticalWrapperBuilder(**kwargs) + + def hybrid_wrapper_vertical_horizontal(self, **kwargs): + if kwargs["method"] == 'srun': + return SrunVerticalHorizontalWrapperBuilder(**kwargs) + else: + return PythonVerticalHorizontalWrapperBuilder(**kwargs) + + def reservation_directive(self, reservation): + return '#' + + def dependency_directive(self, dependency): + return '#' + + def queue_directive(self, queue): + return '#' + + def processors_directive(self, processors): + return '#' + + def nodes_directive(self, nodes): + return '#' + + def tasks_directive(self, tasks): + return '#' + + def partition_directive(self, partition): + return '#' + + def exclusive_directive(self, exclusive): + return '#' + + def threads_directive(self, threads): + return '#' + + def header_directives(self, **kwargs): + return "" + class SlurmWrapperFactory(WrapperFactory): def vertical_wrapper(self, **kwargs): diff --git a/test/unit/test_database_regression.py b/test/unit/test_database_regression.py index 3e5032a7a..7d12280fd 100644 --- a/test/unit/test_database_regression.py +++ b/test/unit/test_database_regression.py @@ -76,16 +76,7 @@ def prepare_db(db_tmpdir): f.write(f""" PLATFORMS: dummy: - type: pjm - host: 127.0.0.1 - user: {db_tmpdir.owner} - project: whatever - scratch_dir: {db_tmpdir}/scratch - MAX_WALLCLOCK: 48:00 - TEMP_DIR: '' - MAX_PROCESSORS: 99999 - queue: dummy - DISABLE_RECOVERY_THREADS: True + type: dummy """) with main_path.open('w') as f: @@ -129,7 +120,7 @@ project: expid_dir = Path(f"{db_tmpdir.strpath}/scratch/whatever/{db_tmpdir.owner}/t000") dummy_dir = Path(f"{db_tmpdir.strpath}/scratch/whatever/{db_tmpdir.owner}/t000/dummy_dir") real_data = Path(f"{db_tmpdir.strpath}/scratch/whatever/{db_tmpdir.owner}/t000/real_data") - # write some dummy data inside scratch dir + # We write some dummy data inside the scratch_dir os.makedirs(expid_dir, exist_ok=True) os.makedirs(dummy_dir, exist_ok=True) os.makedirs(real_data, exist_ok=True) @@ -141,88 +132,100 @@ project: return db_tmpdir -@pytest.fixture -def success_jobs_file(db_tmpdir): - jobs_path = Path(f"{db_tmpdir.strpath}/t000/conf/jobs.yml") - with jobs_path.open('w') 
as f: - f.write(f""" - JOBS: - job: - SCRIPT: | - echo "Hello World" - PLATFORM: local - RUNNING: chunk - wallclock: 00:01 - """) - - -@pytest.fixture -def failure_jobs_file(db_tmpdir): - jobs_path = Path(f"{db_tmpdir.strpath}/t000/conf/jobs.yml") - with jobs_path.open('w') as f: - f.write(f""" +@pytest.mark.parametrize("jobs_data, expected_count, final_status", [ + # Success + (""" + JOBS: + job: + SCRIPT: | + echo "Hello World" + sleep 1 + PLATFORM: local + DEPENDENCIES: job-1 + RUNNING: chunk + wallclock: 00:01 + retrials: 2 + """, 1, "COMPLETED"), + # Success wrapper + (""" + JOBS: + job: + SCRIPT: | + echo "Hello World" + sleep 1 + DEPENDENCIES: job-1 + PLATFORM: local + RUNNING: chunk + wallclock: 00:01 + retrials: 2 + wrappers: + wrapper: + JOBS_IN_WRAPPER: job + TYPE: vertical + """, 1, "COMPLETED"), + # Failure + (""" JOBS: job: SCRIPT: | echo "Hello World" + sleep 1 exit 1 + DEPENDENCIES: job-1 PLATFORM: local RUNNING: chunk wallclock: 00:01 retrials: 2 - """) - + """, 3, "FAILED"), + # Failure wrappers + (""" + JOBS: + job: + SCRIPT: | + echo "Hello World" + sleep 1 + exit 1 + PLATFORM: local + DEPENDENCIES: job-1 + RUNNING: chunk + wallclock: 00:10 + retrials: 2 + wrappers: + wrapper: + JOBS_IN_WRAPPER: job + TYPE: vertical + """, 3, "FAILED"), +], ids=["Success", "Success with wrapper", "Failure", "Failure with wrapper"]) +def test_db(db_tmpdir, prepare_db, jobs_data, expected_count, final_status, mocker): + # write jobs_data + jobs_path = Path(f"{db_tmpdir.strpath}/t000/conf/jobs.yml") + with jobs_path.open('w') as f: + f.write(jobs_data) -@pytest.fixture -def run_experiment_success(prepare_db, db_tmpdir, success_jobs_file, mocker): + # Create init_expid(os.environ["AUTOSUBMIT_CONFIGURATION"], platform='local', expid='t000', create=True) - # job_list, submitter, exp_history, host, as_conf, platforms_to_test, packages_persistence, _ = Autosubmit.prepare_run("t000") + + # This is set in _init_log which is not called as_misc = Path(f"{db_tmpdir.strpath}/t000/conf/as_misc.yml") with as_misc.open('w') as f: f.write(f""" -AS_MISC: True -ASMISC: - COMMAND: run -AS_COMMAND: run - """) - # Completed - with mocker.patch('autosubmit.platforms.platform.max', return_value=50): - Autosubmit.run_experiment(expid='t000') - # Access to the job_historical.db - job_data = Path(f"{db_tmpdir.strpath}/job_data_t000.db") - autosubmit_db = Path(f"{db_tmpdir.strpath}/tests.db") - assert job_data.exists() - assert autosubmit_db.exists() - return prepare_db + AS_MISC: True + ASMISC: + COMMAND: run + AS_COMMAND: run + """) + # Run the experiment + with mocker.patch('autosubmit.platforms.platform.max', return_value=20): + Autosubmit.run_experiment(expid='t000') -@pytest.fixture -def run_experiment_failure(prepare_db, db_tmpdir, failure_jobs_file, mocker): - init_expid(os.environ["AUTOSUBMIT_CONFIGURATION"], platform='local', expid='t000', create=True) - # job_list, submitter, exp_history, host, as_conf, platforms_to_test, packages_persistence, _ = Autosubmit.prepare_run("t000") - as_misc = Path(f"{db_tmpdir.strpath}/t000/conf/as_misc.yml") - with as_misc.open('w') as f: - f.write(f""" -AS_MISC: True -ASMISC: - COMMAND: run -AS_COMMAND: run - """) - # Completed - # mock platform.localplatform.check_exists - with mocker.patch('autosubmit.platforms.platform.max', return_value=10): - with mocker.patch('autosubmit.platforms.platform.Platform.get_completed_files', return_value=False): - Autosubmit.run_experiment(expid='t000') - # Access to the job_historical.db + # Test database exists. 
job_data = Path(f"{db_tmpdir.strpath}/job_data_t000.db") autosubmit_db = Path(f"{db_tmpdir.strpath}/tests.db") assert job_data.exists() assert autosubmit_db.exists() - return prepare_db - -def test_db_success(run_experiment_success, db_tmpdir): - job_data = Path(f"{db_tmpdir.strpath}/job_data_t000.db") + # Check job_data info conn = sqlite3.connect(job_data) conn.row_factory = sqlite3.Row c = conn.cursor() @@ -234,63 +237,28 @@ def test_db_success(run_experiment_success, db_tmpdir): column_names = rows_as_dicts[0].keys() if rows_as_dicts else [] column_widths = [max(len(str(row[col])) for row in rows_as_dicts + [dict(zip(column_names, column_names))]) for col in column_names] + print(f"Experiment folder: {db_tmpdir.strpath}") header = " | ".join(f"{name:<{width}}" for name, width in zip(column_names, column_widths)) print(f"\n{header}") print("-" * len(header)) # Print the rows - for row_dict in rows_as_dicts: + for row_dict in rows_as_dicts: # always print, for debug proposes print(" | ".join(f"{str(row_dict[col]):<{width}}" for col, width in zip(column_names, column_widths))) - # Check that all fields contain data, except extra_data, children, and platform_output - # Check that submit, start and finish are > 0 - assert row_dict["submit"] > 0 and row_dict["finish"] != 1970010101 - assert row_dict["start"] > 0 and row_dict["finish"] != 1970010101 - assert row_dict["finish"] > 0 and row_dict["finish"] != 1970010101 - assert row_dict["status"] == "COMPLETED" - for key in [key for key in row_dict.keys() if - key not in ["status", "finish", "submit", "start", "extra_data", "children", "platform_output"]]: - assert str(row_dict[key]) != "" - # Check that the job_data table has the expected number of entries - c.execute("SELECT job_name, COUNT(*) as count FROM job_data GROUP BY job_name") - count_rows = c.fetchall() - for row in count_rows: - assert row["count"] == 1 - # Close the cursor and connection - c.close() - conn.close() - -def test_db_failure(run_experiment_failure, db_tmpdir): - job_data = Path(f"{db_tmpdir.strpath}/job_data_t000.db") - conn = sqlite3.connect(job_data) - conn.row_factory = sqlite3.Row - c = conn.cursor() - c.execute("SELECT * FROM job_data") - rows = c.fetchall() - # Convert rows to a list of dictionaries - rows_as_dicts = [dict(row) for row in rows] - # Tune the print so it is more readable, so it is easier to debug in case of failure - column_names = rows_as_dicts[0].keys() if rows_as_dicts else [] - column_widths = [max(len(str(row[col])) for row in rows_as_dicts + [dict(zip(column_names, column_names))]) for col - in column_names] - header = " | ".join(f"{name:<{width}}" for name, width in zip(column_names, column_widths)) - print(f"\n{header}") - print("-" * len(header)) - # Print the rows for row_dict in rows_as_dicts: - print(" | ".join(f"{str(row_dict[col]):<{width}}" for col, width in zip(column_names, column_widths))) # Check that all fields contain data, except extra_data, children, and platform_output # Check that submit, start and finish are > 0 assert row_dict["submit"] > 0 and row_dict["finish"] != 1970010101 assert row_dict["start"] > 0 and row_dict["finish"] != 1970010101 assert row_dict["finish"] > 0 and row_dict["finish"] != 1970010101 - assert row_dict["status"] == "FAILED" + assert row_dict["status"] == final_status for key in [key for key in row_dict.keys() if key not in ["status", "finish", "submit", "start", "extra_data", "children", "platform_output"]]: - assert str(row_dict[key]) != "" + assert str(row_dict[key]) != str("") # Check that the 
job_data table has the expected number of entries c.execute("SELECT job_name, COUNT(*) as count FROM job_data GROUP BY job_name") count_rows = c.fetchall() for row in count_rows: - assert row["count"] == 3 # three retrials + assert row["count"] == expected_count # Close the cursor and connection c.close() conn.close() diff --git a/test/unit/test_log_recovery.py b/test/unit/test_log_recovery.py index 036cffb31..83bc74a30 100644 --- a/test/unit/test_log_recovery.py +++ b/test/unit/test_log_recovery.py @@ -100,6 +100,7 @@ def as_conf(prepare_test, mocker): def test_log_recovery_no_keep_alive(prepare_test, local, mocker, as_conf): mocker.patch('autosubmit.platforms.platform.max', return_value=1) + local.keep_alive_timeout = 0 local.spawn_log_retrieval_process(as_conf) assert local.log_recovery_process.is_alive() time.sleep(2) @@ -109,6 +110,7 @@ def test_log_recovery_no_keep_alive(prepare_test, local, mocker, as_conf): def test_log_recovery_keep_alive(prepare_test, local, mocker, as_conf): mocker.patch('autosubmit.platforms.platform.max', return_value=1) + local.keep_alive_timeout = 0 local.spawn_log_retrieval_process(as_conf) assert local.log_recovery_process.is_alive() local.work_event.set() @@ -124,6 +126,7 @@ def test_log_recovery_keep_alive(prepare_test, local, mocker, as_conf): def test_log_recovery_keep_alive_cleanup(prepare_test, local, mocker, as_conf): mocker.patch('autosubmit.platforms.platform.max', return_value=1) + local.keep_alive_timeout = 0 local.spawn_log_retrieval_process(as_conf) assert local.log_recovery_process.is_alive() local.work_event.set() @@ -137,8 +140,9 @@ def test_log_recovery_keep_alive_cleanup(prepare_test, local, mocker, as_conf): def test_log_recovery_recover_log(prepare_test, local, mocker, as_conf): - mocker.patch('autosubmit.platforms.platform.max', return_value=20) - mocker.patch('autosubmit.job.job.Job.write_stats') # not sure how to test this + mocker.patch('autosubmit.platforms.platform.max', return_value=0) + local.keep_alive_timeout = 20 + mocker.patch('autosubmit.job.job.Job.write_stats') # Tested in test_database_regression.py local.spawn_log_retrieval_process(as_conf) local.work_event.set() job = Job('t000', '0000', Status.COMPLETED, 0) -- GitLab From 5e8c9e6273a79f167dfcd596883c63e6d9028fa3 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 26 Sep 2024 12:23:57 +0200 Subject: [PATCH 21/25] Fix tests, added pytest-xdist --- autosubmit/job/job.py | 3 +-- autosubmit/job/job_packages.py | 2 +- autosubmit/platforms/platform.py | 4 +++- pytest.ini | 2 +- setup.py | 1 + test/unit/test_database_regression.py | 6 ------ test/unit/test_job_package.py | 3 ++- test/unit/test_log_recovery.py | 2 +- 8 files changed, 10 insertions(+), 13 deletions(-) diff --git a/autosubmit/job/job.py b/autosubmit/job/job.py index 23491b6db..f9ce861d7 100644 --- a/autosubmit/job/job.py +++ b/autosubmit/job/job.py @@ -2696,8 +2696,7 @@ class WrapperJob(Job): os.chmod(multiple_checker_inner_jobs, 0o770) if self.platform.name != "local": # already "sent"... 
self._platform.send_file(multiple_checker_inner_jobs, False) - command = f"cd {self._platform.get_files_path()}; {os.path.join( - self._platform.get_files_path(), 'inner_jobs_checker.sh')}" + command = f"cd {self._platform.get_files_path()}; {os.path.join(self._platform.get_files_path(), 'inner_jobs_checker.sh')}" else: command = os.path.join( self._platform.get_files_path(), "inner_jobs_checker.sh") diff --git a/autosubmit/job/job_packages.py b/autosubmit/job/job_packages.py index 17bffecc5..9532e6583 100644 --- a/autosubmit/job/job_packages.py +++ b/autosubmit/job/job_packages.py @@ -54,7 +54,7 @@ def jobs_in_wrapper_str(as_conf, current_wrapper): else: jobs_in_wrapper = jobs_in_wrapper.split(" ") jobs_in_wrapper = [job.strip(" ,") for job in jobs_in_wrapper] - return "&".join(jobs_in_wrapper) + return "_".join(jobs_in_wrapper) class JobPackageBase(object): """ Class to manage the package of jobs to be submitted by autosubmit diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index d504d5d08..9ec8828cc 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -866,7 +866,9 @@ class Platform(object): break if not process_log: # If no work, wait until the keep_alive_timeout is reached or any signal is set to end the process. timeout = self.keep_alive_timeout - sleep_time - while timeout > 0 or not self.recovery_queue.empty() or self.cleanup_event.is_set() or self.work_event.is_set(): + while timeout > 0: + if self.recovery_queue.empty() or self.cleanup_event.is_set() or self.work_event.is_set(): + break time.sleep(1) timeout -= 1 if not self.recovery_queue.empty() or self.cleanup_event.is_set() or self.work_event.is_set(): diff --git a/pytest.ini b/pytest.ini index 5ab9b5fe7..b511be9ac 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,7 +3,7 @@ addopts = --verbose --ignore=regression --cov=autosubmit --cov-config=.coveragerc --cov-report=html:test/htmlcov --cov-report=xml:test/coverage.xml - --strict-markers + --strict-markers -n auto testpaths = test/unit doctest_optionflags = diff --git a/setup.py b/setup.py index 8d1f5b100..9f6eeae37 100644 --- a/setup.py +++ b/setup.py @@ -71,6 +71,7 @@ tests_require = [ 'pytest==8.2.*', 'pytest-cov', 'pytest-mock', + 'pytest-xdist', # 'testcontainers' # TODO: pending Postgres MR ] diff --git a/test/unit/test_database_regression.py b/test/unit/test_database_regression.py index 7d12280fd..eb57d0534 100644 --- a/test/unit/test_database_regression.py +++ b/test/unit/test_database_regression.py @@ -1,13 +1,9 @@ import shutil - import pytest from pathlib import Path from autosubmit.autosubmit import Autosubmit -from log.log import Log import os import pwd -from autosubmit.platforms.locplatform import LocalPlatform - from test.unit.utils.common import create_database, init_expid import sqlite3 @@ -16,8 +12,6 @@ def _get_script_files_path() -> Path: return Path(__file__).resolve().parent / 'files' -# Maybe this should be a regression test - @pytest.fixture def db_tmpdir(tmpdir_factory): folder = tmpdir_factory.mktemp(f'db_tests') diff --git a/test/unit/test_job_package.py b/test/unit/test_job_package.py index 3808dba35..a11323df1 100644 --- a/test/unit/test_job_package.py +++ b/test/unit/test_job_package.py @@ -222,8 +222,9 @@ def mock_as_conf(): } return MockAsConf() + def test_jobs_in_wrapper_str(mock_as_conf): # Arrange current_wrapper = "current_wrapper" result = jobs_in_wrapper_str(mock_as_conf, current_wrapper) - assert result == "job1_job2_job3" \ No newline at end of file + assert result == 
"job1_job2_job3" diff --git a/test/unit/test_log_recovery.py b/test/unit/test_log_recovery.py index 83bc74a30..d45f15791 100644 --- a/test/unit/test_log_recovery.py +++ b/test/unit/test_log_recovery.py @@ -100,7 +100,6 @@ def as_conf(prepare_test, mocker): def test_log_recovery_no_keep_alive(prepare_test, local, mocker, as_conf): mocker.patch('autosubmit.platforms.platform.max', return_value=1) - local.keep_alive_timeout = 0 local.spawn_log_retrieval_process(as_conf) assert local.log_recovery_process.is_alive() time.sleep(2) @@ -140,6 +139,7 @@ def test_log_recovery_keep_alive_cleanup(prepare_test, local, mocker, as_conf): def test_log_recovery_recover_log(prepare_test, local, mocker, as_conf): + print(prepare_test.strpath) mocker.patch('autosubmit.platforms.platform.max', return_value=0) local.keep_alive_timeout = 20 mocker.patch('autosubmit.job.job.Job.write_stats') # Tested in test_database_regression.py -- GitLab From 8df256ceb14e9d617924a2469678230b6155fd16 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 26 Sep 2024 12:31:18 +0200 Subject: [PATCH 22/25] Update as version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9f6eeae37..7c3719216 100644 --- a/setup.py +++ b/setup.py @@ -48,7 +48,7 @@ install_requires = [ 'numpy<2', 'ruamel.yaml==0.17.21', 'rocrate==0.*', - 'autosubmitconfigparser==1.0.67', + 'autosubmitconfigparser==1.0.70', 'configparser', 'setproctitle', # 'sqlalchemy[mypy]' # TODO: pending Postgres MR -- GitLab From 89aef5e98eaf9c9358c9c05f84ac9ee400fdef3d Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 26 Sep 2024 14:36:14 +0200 Subject: [PATCH 23/25] Test pipeline --- test/unit/test_database_regression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/unit/test_database_regression.py b/test/unit/test_database_regression.py index eb57d0534..58116b581 100644 --- a/test/unit/test_database_regression.py +++ b/test/unit/test_database_regression.py @@ -210,8 +210,8 @@ def test_db(db_tmpdir, prepare_db, jobs_data, expected_count, final_status, mock """) # Run the experiment - with mocker.patch('autosubmit.platforms.platform.max', return_value=20): - Autosubmit.run_experiment(expid='t000') + mocker.patch('autosubmit.platforms.platform.max', return_value=60) + Autosubmit.run_experiment(expid='t000') # Test database exists. job_data = Path(f"{db_tmpdir.strpath}/job_data_t000.db") -- GitLab From 6b918df10bdb8fdd59d7825f59cb546a713933a0 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 26 Sep 2024 14:51:40 +0200 Subject: [PATCH 24/25] Test pipeline ( now with output) --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 7c3719216..013dcfc21 100644 --- a/setup.py +++ b/setup.py @@ -71,7 +71,6 @@ tests_require = [ 'pytest==8.2.*', 'pytest-cov', 'pytest-mock', - 'pytest-xdist', # 'testcontainers' # TODO: pending Postgres MR ] -- GitLab From 8999df2a952e8869e17c2b05aa133d440871c7d9 Mon Sep 17 00:00:00 2001 From: dbeltran Date: Thu, 26 Sep 2024 14:55:07 +0200 Subject: [PATCH 25/25] Test pipeline ( now with output) --- pytest.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytest.ini b/pytest.ini index b511be9ac..5ab9b5fe7 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,7 +3,7 @@ addopts = --verbose --ignore=regression --cov=autosubmit --cov-config=.coveragerc --cov-report=html:test/htmlcov --cov-report=xml:test/coverage.xml - --strict-markers -n auto + --strict-markers testpaths = test/unit doctest_optionflags = -- GitLab