diff --git a/autosubmit/platforms/ecplatform.py b/autosubmit/platforms/ecplatform.py index 5fb9914aa889f46bc540f7e0fada8d7c2e4bcbea..2307baf8ecbad4af2eb60e4132e62f855fc6bd48 100644 --- a/autosubmit/platforms/ecplatform.py +++ b/autosubmit/platforms/ecplatform.py @@ -202,6 +202,7 @@ class EcPlatform(ParamikoPlatform): :return: True :rtype: bool """ + self.main_process_id = os.getpid() output = subprocess.check_output(self._checkvalidcert_cmd, shell=True).decode(locale.getlocale()[1]) if not output: output = "" diff --git a/autosubmit/platforms/locplatform.py b/autosubmit/platforms/locplatform.py index 0af2d65cd0cae22eba32fbb6f21d5889ca86c310..b6109c39a22130b23df47cf5ddc8566aa01151b0 100644 --- a/autosubmit/platforms/locplatform.py +++ b/autosubmit/platforms/locplatform.py @@ -121,6 +121,7 @@ class LocalPlatform(ParamikoPlatform): def test_connection(self,as_conf): + self.main_process_id = os.getpid() if not self.connected: self.connect(as_conf) diff --git a/autosubmit/platforms/paramiko_platform.py b/autosubmit/platforms/paramiko_platform.py index 7dda872dbe58ec80f347fe12d78d6aaa17f0a905..1c9a3dd7266d19baa91bcf29e9dd564b0c2d6f25 100644 --- a/autosubmit/platforms/paramiko_platform.py +++ b/autosubmit/platforms/paramiko_platform.py @@ -121,6 +121,8 @@ class ParamikoPlatform(Platform): """ Test if the connection is still alive, reconnect if not. """ + self.main_process_id = os.getpid() + try: if not self.connected: self.reset() diff --git a/autosubmit/platforms/paramiko_submitter.py b/autosubmit/platforms/paramiko_submitter.py index 4ee256be2dfd8a3fd61ed048aa4ede83c60c68d0..67114ff2c2cea1a51c76906057366e0df1667bdf 100644 --- a/autosubmit/platforms/paramiko_submitter.py +++ b/autosubmit/platforms/paramiko_submitter.py @@ -155,7 +155,7 @@ class ParamikoSubmitter(Submitter): else: raise Exception( "Queue type not specified on platform {0}".format(section)) - + remote_platform.main_process_id = os.getpid() except ParamikoPlatformException as e: Log.error("Queue exception: {0}".format(str(e))) return None diff --git a/autosubmit/platforms/platform.py b/autosubmit/platforms/platform.py index 5ccf6e91e2beff43eb77d9cdc7288a0fa7cd3404..2eb58214d820ccd648b9e806731938b2282ecd25 100644 --- a/autosubmit/platforms/platform.py +++ b/autosubmit/platforms/platform.py @@ -104,7 +104,7 @@ class Platform(object): self.pw = None self.recovery_queue = Queue() self.log_retrieval_process_active = False - + self.main_process_id = None @property @autosubmit_parameter(name='current_arch') @@ -831,9 +831,10 @@ class Platform(object): job_names_processed = set() self.connected = False self.restore_connection(None) - while not event.is_set(): + # check if id of self.main_process exists with ps ax | grep self.main_process_id + while not event.is_set() and os.system(f"ps ax | grep {str(self.main_process_id)} | grep -v grep > /dev/null 2>&1") == 0: try: - job,children = self.recovery_queue.get() + job,children = self.recovery_queue.get(block=False) if job.wrapper_type != "vertical": if f'{job.name}_{job.fail_count}' in job_names_processed: continue diff --git a/autosubmit/platforms/sgeplatform.py b/autosubmit/platforms/sgeplatform.py index 9c5e813f1c96f5c6fcc21257ef630c6d70f82d31..f44fb66294006e247974014f03c100326dd5b712 100644 --- a/autosubmit/platforms/sgeplatform.py +++ b/autosubmit/platforms/sgeplatform.py @@ -144,6 +144,7 @@ class SgePlatform(ParamikoPlatform): :return: True :rtype: bool """ + self.main_process_id = os.getpid() self.connected = True self.connected(as_conf,True)