From 274a9cede9215392e697f3216a3ca53696dbd1db Mon Sep 17 00:00:00 2001 From: Javier Vegas-Regidor Date: Tue, 22 Nov 2016 13:26:23 +0100 Subject: [PATCH 1/8] Added some changes to --- diags.conf | 13 +++++++------ earthdiagnostics/config.py | 6 +++++- earthdiagnostics/threddsmanager.py | 19 +++++++++++-------- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/diags.conf b/diags.conf index 1b3cae91..f57b1cc3 100644 --- a/diags.conf +++ b/diags.conf @@ -4,7 +4,8 @@ DATA_ADAPTOR = THREDDS # Path to the folder where you want to create the temporary files SCRATCH_DIR = /scratch/Earth/$USER # Root path for the cmorized data to use -DATA_DIR = /esnas/exp/:/esarchive/exp/ +DATA_DIR = /esnas:/esarchive +DATA_TYPE = exp # Path to NEMO's mask and grid files needed for CDFTools CON_FILES = /esnas/autosubmit/con_files/ @@ -63,8 +64,8 @@ SERVER_URL = http://earth.bsc.es/thredds [EXPERIMENT] # Experiments parameters as defined in CMOR standard -INSTITUTE = meteofrance -MODEL = system4_m1 +INSTITUTE = ecmwf +MODEL = erainterim # Model version: Available versions MODEL_VERSION =Ec2.3_O1L46 # Atmospheric output timestep in hours @@ -81,11 +82,11 @@ OCEAN_TIMESTEP = 6 # CHUNK_SIZE is the size of each data file, given in months # CHUNKS is the number of chunks. You can specify less chunks than present on the experiment EXPID = resilience -STARTDATES = 19911101 +STARTDATES = 19790101 MEMBERS = 0 MEMBER_DIGITS = 1 -CHUNK_SIZE = 7 -CHUNKS = 1 +CHUNK_SIZE = 1 +CHUNKS = 10 # CHUNKS = 1 diff --git a/earthdiagnostics/config.py b/earthdiagnostics/config.py index 152fbce4..ab206c12 100644 --- a/earthdiagnostics/config.py +++ b/earthdiagnostics/config.py @@ -29,13 +29,17 @@ class Config(object): "Scratch folder path" self.data_dir = Utils.expand_path(parser.get_option('DIAGNOSTICS', 'DATA_DIR')) "Root data folder path" + self.data_type = Utils.expand_path(parser.get_option('DIAGNOSTICS', 'DATA_TYPE', 'exp')).lower() + "Data type (experiment, observation or reconstruction)" + if self.data_type not in ('exp', 'obs', 'recon'): + raise Exception('Data type must be exp, obs or recon') self.con_files = Utils.expand_path(parser.get_option('DIAGNOSTICS', 'CON_FILES')) "Mask and meshes folder path" self._diags = parser.get_option('DIAGNOSTICS', 'DIAGS') self.frequency = parser.get_option('DIAGNOSTICS', 'FREQUENCY').lower() + "Default data frequency to be used by the diagnostics" if self.frequency == 'month': self.frequency = 'mon' - "Default data frequency to be used by the diagnostics" self.cdftools_path = Utils.expand_path(parser.get_option('DIAGNOSTICS', 'CDFTOOLS_PATH')) "Path to CDFTOOLS executables" self.max_cores = parser.get_int_option('DIAGNOSTICS', 'MAX_CORES', 100000) diff --git a/earthdiagnostics/threddsmanager.py b/earthdiagnostics/threddsmanager.py index cb60f22b..3e6152e4 100644 --- a/earthdiagnostics/threddsmanager.py +++ b/earthdiagnostics/threddsmanager.py @@ -19,7 +19,7 @@ class THREDDSManager(DataManager): data_folders = self.config.data_dir.split(':') self.config.data_dir = None for data_folder in data_folders: - if os.path.isdir(os.path.join(data_folder, self.experiment.institute.lower(), + if os.path.isdir(os.path.join(data_folder, self.config.data_type, self.experiment.institute.lower(), self.experiment.model.lower())): self.config.data_dir = data_folder break @@ -160,16 +160,13 @@ class THREDDSManager(DataManager): var = self._get_final_var_name(box, var) folder_path = self._get_folder_path(frequency, domain, var, grid, vartype) - if startdate: - file_name = '{0}_{1}.nc'.format(var, 
startdate) - else: - file_name = '{0}.nc'.format(var) + file_name = self._get_file_name(startdate, var) filepath = os.path.join(folder_path, file_name) return filepath def _get_folder_path(self, frequency, domain, variable, grid, vartype): - folder_path = os.path.join(self.config.data_dir, + folder_path = os.path.join(self.config.data_dir, self.config.data_type, self.experiment.institute.lower(), self.experiment.model.lower(), self.frequency_folder_name(frequency, vartype), @@ -202,9 +199,15 @@ class THREDDSManager(DataManager): protocol = 'fileServer' else: protocol = 'dodsC' - return os.path.join(self.server_url, protocol, 'exp', self.experiment.institute, + return os.path.join(self.server_url, protocol, self.config.data_type, self.experiment.institute, self.experiment.model, self.frequency_folder_name(frequency, vartype), - var, '{0}_{1}.nc'.format(var, startdate)) + var, self._get_file_name(startdate, var)) + + def _get_file_name(self, startdate, var): + if startdate and self.config.data_type == 'exp': + return '{0}_{1}.nc'.format(var, startdate) + else: + return '{0}.nc'.format(var) def link_file(self, domain, var, startdate, member, chunk=None, grid=None, box=None, frequency=None, year=None, date_str=None, move_old=False, vartype=VarType.MEAN): -- GitLab From 793cc7aff2dd6ff61672d084e05bf2233d05624f Mon Sep 17 00:00:00 2001 From: Javier Vegas-Regidor Date: Wed, 23 Nov 2016 15:12:22 +0100 Subject: [PATCH 2/8] Changed thredds manager to use dodosC and nccopy to retrieve files --- diags.conf | 2 +- earthdiagnostics/threddsmanager.py | 52 +++++++++++++++++++++++------- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/diags.conf b/diags.conf index f57b1cc3..886679cb 100644 --- a/diags.conf +++ b/diags.conf @@ -5,7 +5,7 @@ DATA_ADAPTOR = THREDDS SCRATCH_DIR = /scratch/Earth/$USER # Root path for the cmorized data to use DATA_DIR = /esnas:/esarchive -DATA_TYPE = exp +DATA_TYPE = recon # Path to NEMO's mask and grid files needed for CDFTools CON_FILES = /esnas/autosubmit/con_files/ diff --git a/earthdiagnostics/threddsmanager.py b/earthdiagnostics/threddsmanager.py index 3e6152e4..5d6e6081 100644 --- a/earthdiagnostics/threddsmanager.py +++ b/earthdiagnostics/threddsmanager.py @@ -1,6 +1,7 @@ # coding=utf-8 import os -from autosubmit.date.chunk_date_lib import parse_date, add_months +from autosubmit.config.log import Log +from autosubmit.date.chunk_date_lib import parse_date, add_months, chunk_start_date, chunk_end_date from earthdiagnostics.datamanager import DataManager, NetCDFFile from earthdiagnostics.utils import TempFile, Utils @@ -30,7 +31,7 @@ class THREDDSManager(DataManager): def get_leadtimes(self, domain, variable, startdate, member, leadtimes, frequency=None, vartype=VarType.MEAN): if not frequency: frequency = self.config.frequency - aggregation_path = self.get_var_url(variable, startdate, frequency, None, False, vartype) + aggregation_path = self.get_var_url(variable, startdate, frequency, None, vartype) temp = TempFile.get() startdate = parse_date(startdate) selected_months = ','.join([str(add_months(startdate, i, 'standard').month) for i in leadtimes]) @@ -65,9 +66,38 @@ class THREDDSManager(DataManager): """ if not frequency: frequency = self.config.frequency - aggregation_path = self.get_var_url(var, startdate, frequency, box, True, vartype) + aggregation_path = self.get_var_url(var, startdate, frequency, box, vartype) temp = TempFile.get() - urllib.urlretrieve(aggregation_path, temp) + + start_chunk = chunk_start_date(parse_date(startdate), chunk, 
self.experiment.chunk_size, 'month', 'standard') + end_chunk = chunk_end_date(start_chunk, self.experiment.chunk_size, 'month', 'standard') + + handler = Utils.openCdf(aggregation_path) + times = Utils.get_datetime_from_netcdf(handler) + lat_size = handler.dimensions['latitude'].size + lon_size = handler.dimensions['longitude'].size + handler.close() + time_start = 0 + while time_start < times.size and times[time_start] < start_chunk: + time_start += 1 + if time_start == times.size: + raise Exception('Timesteps not available for chunk {0}'.format(chunk)) + + time_end = time_start + + if times[time_end] >= end_chunk: + raise Exception('Timesteps not available for chunk {0}'.format(chunk)) + + while time_end < times.size-1 and times[time_end+1] < end_chunk: + time_end += 1 + + slice_path = '{0}?time[{1},1,{2}],latitude[0,1,{3}],longitude[0,1,{4}],' \ + '{5}[{1},1,{2}][0,1,{3}][0,1,{4}],{5}'.format(aggregation_path, time_start, time_end, + lat_size, lon_size, var) + Log.debug(slice_path) + + Utils.nco.nccopy(slice_path, temp) + if not Utils.check_netcdf_file(temp): raise THREDDSError('Can not retrieve {0} from server'.format(aggregation_path)) return temp @@ -193,15 +223,15 @@ class THREDDSManager(DataManager): :return: """ - def get_var_url(self, var, startdate, frequency, box, fileserver, vartype): + def get_var_url(self, var, startdate, frequency, box, vartype): var = self._get_final_var_name(box, var) - if fileserver: - protocol = 'fileServer' + full_path = os.path.join(self.server_url, 'dodsC', self.config.data_type, self.experiment.institute, + self.experiment.model, self.frequency_folder_name(frequency, vartype)) + if self.config.data_type == 'exp': + full_path = os.path.join(full_path, var, self._get_file_name(startdate, var)) else: - protocol = 'dodsC' - return os.path.join(self.server_url, protocol, self.config.data_type, self.experiment.institute, - self.experiment.model, self.frequency_folder_name(frequency, vartype), - var, self._get_file_name(startdate, var)) + full_path = os.path.join(full_path, self._get_file_name(startdate, var)) + return full_path def _get_file_name(self, startdate, var): if startdate and self.config.data_type == 'exp': -- GitLab From 29e8c6ddaf11c914d8bd759d5030413e417f45c9 Mon Sep 17 00:00:00 2001 From: Javier Vegas-Regidor Date: Wed, 23 Nov 2016 16:10:22 +0100 Subject: [PATCH 3/8] Added support for obs and recon to THREDDSmanager --- earthdiagnostics/threddsmanager.py | 35 ++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/earthdiagnostics/threddsmanager.py b/earthdiagnostics/threddsmanager.py index 5d6e6081..321d2f2e 100644 --- a/earthdiagnostics/threddsmanager.py +++ b/earthdiagnostics/threddsmanager.py @@ -1,7 +1,7 @@ # coding=utf-8 import os from autosubmit.config.log import Log -from autosubmit.date.chunk_date_lib import parse_date, add_months, chunk_start_date, chunk_end_date +from autosubmit.date.chunk_date_lib import parse_date, add_months, chunk_start_date, chunk_end_date, date2str from earthdiagnostics.datamanager import DataManager, NetCDFFile from earthdiagnostics.utils import TempFile, Utils @@ -28,6 +28,10 @@ class THREDDSManager(DataManager): if not self.config.data_dir: raise Exception('Can not find model data') + if self.config.data_type in ('obs', 'recon') and self.experiment.chunk_size !=1 : + raise Exception('For obs and recon data chunk_size must be always 1') + + def get_leadtimes(self, domain, variable, startdate, member, leadtimes, frequency=None, vartype=VarType.MEAN): if not frequency: 
frequency = self.config.frequency @@ -74,8 +78,8 @@ class THREDDSManager(DataManager): handler = Utils.openCdf(aggregation_path) times = Utils.get_datetime_from_netcdf(handler) - lat_size = handler.dimensions['latitude'].size - lon_size = handler.dimensions['longitude'].size + lat_size = handler.dimensions['latitude'].size -1 + lon_size = handler.dimensions['longitude'].size -1 handler.close() time_start = 0 while time_start < times.size and times[time_start] < start_chunk: @@ -91,12 +95,12 @@ class THREDDSManager(DataManager): while time_end < times.size-1 and times[time_end+1] < end_chunk: time_end += 1 - slice_path = '{0}?time[{1},1,{2}],latitude[0,1,{3}],longitude[0,1,{4}],' \ - '{5}[{1},1,{2}][0,1,{3}][0,1,{4}],{5}'.format(aggregation_path, time_start, time_end, - lat_size, lon_size, var) + slice_path = '{0}?time[{1}:1:{2}],latitude[0:1:{3}],longitude[0:1:{4}],' \ + '{5}[{1}:1:{2}][0:1:{3}][0:1:{4}]'.format(aggregation_path, time_start, time_end, + lat_size, lon_size, var) Log.debug(slice_path) - Utils.nco.nccopy(slice_path, temp) + Utils.execute_shell_command(['nccopy', slice_path, temp]) if not Utils.check_netcdf_file(temp): raise THREDDSError('Can not retrieve {0} from server'.format(aggregation_path)) @@ -158,7 +162,10 @@ class THREDDSManager(DataManager): if not frequency: frequency = self.config.frequency - filepath = self.get_file_path(startdate, domain, var, frequency, vartype, box, grid) + start_chunk = chunk_start_date(parse_date(startdate), chunk, self.experiment.chunk_size, 'month', + 'standard') + + filepath = self.get_file_path(date2str(start_chunk)[0:6], domain, var, frequency, vartype, box, grid) netcdf_file = NetCDFFile(filepath, filetosend, domain, var, cmor_var) if diagnostic: netcdf_file.add_diagnostic_history(diagnostic) @@ -196,11 +203,17 @@ class THREDDSManager(DataManager): return filepath def _get_folder_path(self, frequency, domain, variable, grid, vartype): + + if self.config.data_type == 'exp': + var_folder = self.get_varfolder(domain, variable, grid) + else: + var_folder = variable + folder_path = os.path.join(self.config.data_dir, self.config.data_type, self.experiment.institute.lower(), self.experiment.model.lower(), self.frequency_folder_name(frequency, vartype), - self.get_varfolder(domain, variable, grid)) + var_folder) return folder_path def get_year(self, domain, var, startdate, member, year, grid=None, box=None): @@ -230,11 +243,11 @@ class THREDDSManager(DataManager): if self.config.data_type == 'exp': full_path = os.path.join(full_path, var, self._get_file_name(startdate, var)) else: - full_path = os.path.join(full_path, self._get_file_name(startdate, var)) + full_path = os.path.join(full_path, self._get_file_name(None, var)) return full_path def _get_file_name(self, startdate, var): - if startdate and self.config.data_type == 'exp': + if startdate: return '{0}_{1}.nc'.format(var, startdate) else: return '{0}.nc'.format(var) -- GitLab From 05ff93fc292765b413be6e5b5389e10e5a143594 Mon Sep 17 00:00:00 2001 From: Javier Vegas-Regidor Date: Wed, 23 Nov 2016 17:27:47 +0100 Subject: [PATCH 4/8] Fixed cmorizer bug --- diags.conf | 2 +- earthdiagnostics/cmorizer.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/diags.conf b/diags.conf index 886679cb..a49265cc 100644 --- a/diags.conf +++ b/diags.conf @@ -12,7 +12,7 @@ CON_FILES = /esnas/autosubmit/con_files/ # Diagnostics to run, space separated. You must provide for each one the name and the parameters (comma separated) or # an alias defined in the ALIAS section (see more below). 
If you are using the diagnpostics just to CMORize, leave it # empty -DIAGS = monpercent,atmos,sfcWind,90 monpercent,atmos,sfcWind,10 +DIAGS = monpercent,atmos,sfcWind,66 monpercent,atmos,sfcWind,33 # DIAGS = OHC # Frequency of the data you want to use by default. Some diagnostics do not use this value: i.e. monmean always stores # its results at monthly frequency (obvious) and has a parameter to specify input's frequency. diff --git a/earthdiagnostics/cmorizer.py b/earthdiagnostics/cmorizer.py index 88c9746b..02deb036 100644 --- a/earthdiagnostics/cmorizer.py +++ b/earthdiagnostics/cmorizer.py @@ -70,7 +70,6 @@ class Cmorizer(object): count = 1 for tarfile in tar_files: Log.info('Unpacking oceanic file {0}/{1}'.format(count, len(tar_files))) - self._check_cmorization_required() self._unpack_tar_file(tarfile) self._cmorize_nc_files() Log.result('Oceanic file {0}/{1} finished'.format(count, len(tar_files))) @@ -532,9 +531,6 @@ class Cmorizer(object): gribfiles = glob.glob(grb_path) return len(gribfiles) > 0 - def _check_cmorization_required(self, tarfile): - pass - class CMORException(Exception): pass -- GitLab From 7db3b8092332dc71839a027c08cdb0429eead75c Mon Sep 17 00:00:00 2001 From: Javier Vegas-Regidor Date: Thu, 24 Nov 2016 12:03:47 +0100 Subject: [PATCH 5/8] Added THREDDSSubset class --- earthdiagnostics/threddsmanager.py | 129 ++++++++++++++++++++--------- 1 file changed, 90 insertions(+), 39 deletions(-) diff --git a/earthdiagnostics/threddsmanager.py b/earthdiagnostics/threddsmanager.py index 321d2f2e..4b5a1fad 100644 --- a/earthdiagnostics/threddsmanager.py +++ b/earthdiagnostics/threddsmanager.py @@ -1,11 +1,10 @@ # coding=utf-8 import os -from autosubmit.config.log import Log from autosubmit.date.chunk_date_lib import parse_date, add_months, chunk_start_date, chunk_end_date, date2str from earthdiagnostics.datamanager import DataManager, NetCDFFile from earthdiagnostics.utils import TempFile, Utils -import urllib +from datetime import datetime from earthdiagnostics.variable import Variable, VarType @@ -31,16 +30,19 @@ class THREDDSManager(DataManager): if self.config.data_type in ('obs', 'recon') and self.experiment.chunk_size !=1 : raise Exception('For obs and recon data chunk_size must be always 1') - def get_leadtimes(self, domain, variable, startdate, member, leadtimes, frequency=None, vartype=VarType.MEAN): - if not frequency: - frequency = self.config.frequency + aggregation_path = self.get_var_url(variable, startdate, frequency, None, vartype) - temp = TempFile.get() startdate = parse_date(startdate) + start_chunk = chunk_start_date(startdate, self.experiment.chunks, self.experiment.chunk_size, 'month', 'standard') + end_chunk = chunk_end_date(start_chunk, self.experiment.chunk_size, 'month', 'standard') + + thredds_subset = THREDDSSubset(aggregation_path, variable, startdate, end_chunk) selected_months = ','.join([str(add_months(startdate, i, 'standard').month) for i in leadtimes]) - select_months = '-selmonth,{0} {1}'.format(selected_months, aggregation_path) + select_months = '-selmonth,{0} {1}'.format(selected_months, thredds_subset.get_subset_url()) selected_years = ','.join([str(add_months(startdate, i, 'standard').year) for i in leadtimes]) + + temp = TempFile.get() Utils.cdo.selyear(selected_years, input=select_months, output=temp) return temp @@ -68,43 +70,14 @@ class THREDDSManager(DataManager): :return: path to the copy created on the scratch folder :rtype: str """ - if not frequency: - frequency = self.config.frequency aggregation_path = self.get_var_url(var, 
startdate, frequency, box, vartype) - temp = TempFile.get() start_chunk = chunk_start_date(parse_date(startdate), chunk, self.experiment.chunk_size, 'month', 'standard') end_chunk = chunk_end_date(start_chunk, self.experiment.chunk_size, 'month', 'standard') - handler = Utils.openCdf(aggregation_path) - times = Utils.get_datetime_from_netcdf(handler) - lat_size = handler.dimensions['latitude'].size -1 - lon_size = handler.dimensions['longitude'].size -1 - handler.close() - time_start = 0 - while time_start < times.size and times[time_start] < start_chunk: - time_start += 1 - if time_start == times.size: - raise Exception('Timesteps not available for chunk {0}'.format(chunk)) - - time_end = time_start - - if times[time_end] >= end_chunk: - raise Exception('Timesteps not available for chunk {0}'.format(chunk)) - - while time_end < times.size-1 and times[time_end+1] < end_chunk: - time_end += 1 + thredds_subset = THREDDSSubset(aggregation_path, var, start_chunk, end_chunk) + return thredds_subset.download() - slice_path = '{0}?time[{1}:1:{2}],latitude[0:1:{3}],longitude[0:1:{4}],' \ - '{5}[{1}:1:{2}][0:1:{3}][0:1:{4}]'.format(aggregation_path, time_start, time_end, - lat_size, lon_size, var) - Log.debug(slice_path) - - Utils.execute_shell_command(['nccopy', slice_path, temp]) - - if not Utils.check_netcdf_file(temp): - raise THREDDSError('Can not retrieve {0} from server'.format(aggregation_path)) - return temp def send_file(self, filetosend, domain, var, startdate, member, chunk=None, grid=None, region=None, box=None, rename_var=None, frequency=None, year=None, date_str=None, move_old=False, @@ -216,7 +189,7 @@ class THREDDSManager(DataManager): var_folder) return folder_path - def get_year(self, domain, var, startdate, member, year, grid=None, box=None): + def get_year(self, domain, var, startdate, member, year, grid=None, box=None, vartype=VarType.MEAN): """ Ge a file containing all the data for one year for one variable :param domain: variable's domain @@ -235,8 +208,14 @@ class THREDDSManager(DataManager): :type box: Box :return: """ + aggregation_path = self.get_var_url(var, startdate, None, box, vartype) + thredds_subset = THREDDSSubset(aggregation_path, var, datetime(year, 1, 1), datetime(year+1, 1, 1)) + return thredds_subset.download() + def get_var_url(self, var, startdate, frequency, box, vartype): + if not frequency: + frequency = self.config.frequency var = self._get_final_var_name(box, var) full_path = os.path.join(self.server_url, 'dodsC', self.config.data_type, self.experiment.institute, self.experiment.model, self.frequency_folder_name(frequency, vartype)) @@ -287,3 +266,75 @@ class THREDDSManager(DataManager): class THREDDSError(Exception): pass + +class THREDDSSubset: + def __init__(self, thredds_path, var, start_time, end_time): + self.thredds_path = thredds_path + self.var = var + self.dimension_indexes = {} + self.handler = None + self.start_time = start_time + self.end_time = end_time + + def get_url(self): + self.handler = Utils.openCdf(self.thredds_path) + self._read_metadata() + self.handler.close() + + self._get_time_indexes() + + return self._get_subset_url() + + def download(self): + url = self.get_url() + return self._download_url(url) + + def _read_metadata(self): + self.var_dimensions = self.handler.variables[self.var].dimensions + for dimension in self.var_dimensions: + if dimension == 'time': + continue + self.dimension_indexes[dimension] = (0, self.handler.dimensions[dimension].size - 1) + + if 'time' in self.var_dimensions: + self.times = 
Utils.get_datetime_from_netcdf(self.handler) + + def _get_time_indexes(self): + if 'time' not in self.var_dimensions: + return + + time_start = 0 + while time_start < self.times.size and self.times[time_start] < self.start_time: + time_start += 1 + if time_start == self.times.size: + raise Exception('Timesteps not available for interval {0}-{1}'.format(self.start_time, self.end_time)) + time_end = time_start + if self.times[time_end] >= self.end_time: + raise Exception('Timesteps not available for interval {0}-{1}'.format(self.start_time, self.end_time)) + while time_end < self.times.size - 1 and self.times[time_end + 1] < self.end_time: + time_end += 1 + self.dimension_indexes['time'] = (time_start, time_end) + + def _download_url(self, url): + temp = TempFile.get() + Utils.execute_shell_command(['nccopy', url, temp]) + if not Utils.check_netcdf_file(temp): + raise THREDDSError('Can not retrieve {0} from server'.format(url)) + return temp + + def _get_subset_url(self): + var_slice = self.var + dimensions_slice = '' + + for dimension in self.var_dimensions: + slice_index = self._get_slice_index(self.dimension_indexes[dimension]) + var_slice += slice_index + dimensions_slice += '{0}{1},'.format(dimension, slice_index) + + return '{0}?{1}{2}'.format(self.thredds_path, dimensions_slice, var_slice) + + def _get_slice_index(self, index_tuple): + return '[{0[0]}:1:{0[1]}]'.format(index_tuple) + + + -- GitLab From 8214f7217d8190701a6bc4500f4d8537b50ffb29 Mon Sep 17 00:00:00 2001 From: Javier Vegas-Regidor Date: Thu, 24 Nov 2016 17:13:52 +0100 Subject: [PATCH 6/8] Corrected climatological percentil to be able to work with recon data --- diags.conf | 6 ++--- .../statistics/climatologicalpercentile.py | 22 +++++++++---------- earthdiagnostics/threddsmanager.py | 22 +++++++++---------- 3 files changed, 24 insertions(+), 26 deletions(-) diff --git a/diags.conf b/diags.conf index a49265cc..13d7fa7b 100644 --- a/diags.conf +++ b/diags.conf @@ -12,7 +12,7 @@ CON_FILES = /esnas/autosubmit/con_files/ # Diagnostics to run, space separated. You must provide for each one the name and the parameters (comma separated) or # an alias defined in the ALIAS section (see more below). If you are using the diagnpostics just to CMORize, leave it # empty -DIAGS = monpercent,atmos,sfcWind,66 monpercent,atmos,sfcWind,33 +DIAGS = climpercent,atmos,sfcWind,1 # DIAGS = OHC # Frequency of the data you want to use by default. Some diagnostics do not use this value: i.e. monmean always stores # its results at monthly frequency (obvious) and has a parameter to specify input's frequency. 
@@ -60,7 +60,7 @@ ATMOS_MONTHLY_VARS = 167, 201, 202, 165, 166, 151, 144, 228, 205, 182, 164, 146, # SOURCE = 'EC-Earthv2.3.0, ocean: Nemo3.1, ifs31r1, lim2 [THREDDS] -SERVER_URL = http://earth.bsc.es/thredds +SERVER_URL = https://earth.bsc.es/thredds [EXPERIMENT] # Experiments parameters as defined in CMOR standard @@ -86,7 +86,7 @@ STARTDATES = 19790101 MEMBERS = 0 MEMBER_DIGITS = 1 CHUNK_SIZE = 1 -CHUNKS = 10 +CHUNKS = 36 # CHUNKS = 1 diff --git a/earthdiagnostics/statistics/climatologicalpercentile.py b/earthdiagnostics/statistics/climatologicalpercentile.py index 1e52049c..60fd7568 100644 --- a/earthdiagnostics/statistics/climatologicalpercentile.py +++ b/earthdiagnostics/statistics/climatologicalpercentile.py @@ -98,16 +98,16 @@ class ClimatologicalPercentile(Diagnostic): percentile_var = handler.createVariable('percentile', float, ('percentile',)) percentile_var[:] = self.percentiles - handler.createDimension('lat', self.lat_len) + handler.createDimension('lat', self.lat.size) lat_var = handler.createVariable('lat', float, ('lat',)) lat_var[:] = self.lat - handler.createDimension('lon', self.lon_len) + handler.createDimension('lon', self.lon.size) lon_var = handler.createVariable('lon', float, ('lon',)) lon_var[:] = self.lon - p75_var = handler.createVariable('percent', float, ('percentile', 'lat', 'lon')) - p75_var[...] = percentile_values + percentile_var = handler.createVariable('percent', float, ('percentile', 'lat', 'lon')) + percentile_var[...] = percentile_values handler.close() @@ -117,14 +117,14 @@ class ClimatologicalPercentile(Diagnostic): def _calculate_percentiles(self, distribution): Log.debug('Calculating percentiles') - def calculate_percentiles(point_distribution): + def calculate(point_distribution): cs = np.cumsum(point_distribution) total = cs[-1] percentile_values = self.percentiles * total index = np.searchsorted(cs, percentile_values) return [(self._bins[i + 1] + self._bins[i]) / 2 for i in index] - distribution = np.apply_along_axis(calculate_percentiles, 0, distribution) + distribution = np.apply_along_axis(calculate, 0, distribution) return distribution def _get_distribution(self, member_files): @@ -189,18 +189,16 @@ class ClimatologicalPercentile(Diagnostic): return histogram var = handler.variables[self.variable] - return np.apply_along_axis(calculate_histogram, 0, var[:, realization, ...]) + if 'realization' in var.dimensions or 'ensemble' in var.dimensions: + return np.apply_along_axis(calculate_histogram, 0, var[:, realization, ...]) + else: + return np.apply_along_axis(calculate_histogram, 0, var[:]) def _get_var_size(self, handler): if self.lat_len is not None: return - self.lat = handler.variables['latitude'][:] self.lon = handler.variables['longitude'][:] - self.lat = handler.dimensions['latitude'].size - self.lon = handler.dimensions['longitude'].size - - diff --git a/earthdiagnostics/threddsmanager.py b/earthdiagnostics/threddsmanager.py index 4b5a1fad..0a36e02b 100644 --- a/earthdiagnostics/threddsmanager.py +++ b/earthdiagnostics/threddsmanager.py @@ -34,16 +34,19 @@ class THREDDSManager(DataManager): aggregation_path = self.get_var_url(variable, startdate, frequency, None, vartype) startdate = parse_date(startdate) - start_chunk = chunk_start_date(startdate, self.experiment.chunks, self.experiment.chunk_size, 'month', 'standard') + start_chunk = chunk_start_date(startdate, self.experiment.num_chunks, self.experiment.chunk_size, + 'month', 'standard') end_chunk = chunk_end_date(start_chunk, self.experiment.chunk_size, 'month', 'standard') - 
thredds_subset = THREDDSSubset(aggregation_path, variable, startdate, end_chunk) + thredds_subset = THREDDSSubset(aggregation_path, variable, startdate, end_chunk).get_url() selected_months = ','.join([str(add_months(startdate, i, 'standard').month) for i in leadtimes]) - select_months = '-selmonth,{0} {1}'.format(selected_months, thredds_subset.get_subset_url()) - selected_years = ','.join([str(add_months(startdate, i, 'standard').year) for i in leadtimes]) - temp = TempFile.get() - Utils.cdo.selyear(selected_years, input=select_months, output=temp) + if self.config.data_type == 'exp': + select_months = '-selmonth,{0} {1}'.format(selected_months, thredds_subset) + selected_years = ','.join([str(add_months(startdate, i, 'standard').year) for i in leadtimes]) + Utils.cdo.selyear(selected_years, input=select_months, output=temp) + else: + Utils.cdo.selmonth(selected_months, input=thredds_subset, output=temp) return temp def get_file(self, domain, var, startdate, member, chunk, grid=None, box=None, frequency=None, @@ -135,10 +138,7 @@ class THREDDSManager(DataManager): if not frequency: frequency = self.config.frequency - start_chunk = chunk_start_date(parse_date(startdate), chunk, self.experiment.chunk_size, 'month', - 'standard') - - filepath = self.get_file_path(date2str(start_chunk)[0:6], domain, var, frequency, vartype, box, grid) + filepath = self.get_file_path(startdate, domain, var, frequency, vartype, box, grid) netcdf_file = NetCDFFile(filepath, filetosend, domain, var, cmor_var) if diagnostic: netcdf_file.add_diagnostic_history(diagnostic) @@ -227,7 +227,7 @@ class THREDDSManager(DataManager): def _get_file_name(self, startdate, var): if startdate: - return '{0}_{1}.nc'.format(var, startdate) + return '{0}_{1}.nc'.format(var, startdate[0:6]) else: return '{0}.nc'.format(var) -- GitLab From 52b4cf0011f38a09ce3e2633cebb44242ff673f6 Mon Sep 17 00:00:00 2001 From: Javier Vegas-Regidor Date: Fri, 25 Nov 2016 15:53:31 +0100 Subject: [PATCH 7/8] Added fix for ensemble dimension (whose variable is realization instead of ensemble) --- diags.conf | 12 ++++++------ earthdiagnostics/threddsmanager.py | 6 +++++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/diags.conf b/diags.conf index 13d7fa7b..582b693d 100644 --- a/diags.conf +++ b/diags.conf @@ -5,14 +5,14 @@ DATA_ADAPTOR = THREDDS SCRATCH_DIR = /scratch/Earth/$USER # Root path for the cmorized data to use DATA_DIR = /esnas:/esarchive -DATA_TYPE = recon +DATA_TYPE = exp # Path to NEMO's mask and grid files needed for CDFTools CON_FILES = /esnas/autosubmit/con_files/ # Diagnostics to run, space separated. You must provide for each one the name and the parameters (comma separated) or # an alias defined in the ALIAS section (see more below). If you are using the diagnpostics just to CMORize, leave it # empty -DIAGS = climpercent,atmos,sfcWind,1 +DIAGS = monpercent,atmos,tas,66 monpercent,atmos,tas,33 climpercent,atmos,sfcwind,1 # DIAGS = OHC # Frequency of the data you want to use by default. Some diagnostics do not use this value: i.e. monmean always stores # its results at monthly frequency (obvious) and has a parameter to specify input's frequency. 
@@ -65,7 +65,7 @@ SERVER_URL = https://earth.bsc.es/thredds [EXPERIMENT] # Experiments parameters as defined in CMOR standard INSTITUTE = ecmwf -MODEL = erainterim +MODEL = system4_m1 # Model version: Available versions MODEL_VERSION =Ec2.3_O1L46 # Atmospheric output timestep in hours @@ -82,11 +82,11 @@ OCEAN_TIMESTEP = 6 # CHUNK_SIZE is the size of each data file, given in months # CHUNKS is the number of chunks. You can specify less chunks than present on the experiment EXPID = resilience -STARTDATES = 19790101 +STARTDATES = 19810101 MEMBERS = 0 MEMBER_DIGITS = 1 -CHUNK_SIZE = 1 -CHUNKS = 36 +CHUNK_SIZE = 7 +CHUNKS = 1 # CHUNKS = 1 diff --git a/earthdiagnostics/threddsmanager.py b/earthdiagnostics/threddsmanager.py index 0a36e02b..ae8acb00 100644 --- a/earthdiagnostics/threddsmanager.py +++ b/earthdiagnostics/threddsmanager.py @@ -227,7 +227,9 @@ class THREDDSManager(DataManager): def _get_file_name(self, startdate, var): if startdate: - return '{0}_{1}.nc'.format(var, startdate[0:6]) + if self.config.data_type != 'exp': + startdate = startdate[0:6] + return '{0}_{1}.nc'.format(var, startdate) else: return '{0}.nc'.format(var) @@ -329,6 +331,8 @@ class THREDDSSubset: for dimension in self.var_dimensions: slice_index = self._get_slice_index(self.dimension_indexes[dimension]) var_slice += slice_index + if dimension == 'ensemble': + dimension = 'realization' dimensions_slice += '{0}{1},'.format(dimension, slice_index) return '{0}?{1}{2}'.format(self.thredds_path, dimensions_slice, var_slice) -- GitLab From acaa4db6bb1e180693abe7cc9df8e14eef9a4c84 Mon Sep 17 00:00:00 2001 From: Javier Vegas-Regidor Date: Fri, 25 Nov 2016 17:20:44 +0100 Subject: [PATCH 8/8] Updated doc for diags and minor improvement on climpercent --- diags.conf | 3 ++- .../statistics/climatologicalpercentile.py | 14 +++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/diags.conf b/diags.conf index 582b693d..0c17c47d 100644 --- a/diags.conf +++ b/diags.conf @@ -5,6 +5,7 @@ DATA_ADAPTOR = THREDDS SCRATCH_DIR = /scratch/Earth/$USER # Root path for the cmorized data to use DATA_DIR = /esnas:/esarchive +# Specify if your data is from an experiment (exp), observation (obs) or reconstructions (recon) DATA_TYPE = exp # Path to NEMO's mask and grid files needed for CDFTools @@ -12,7 +13,7 @@ CON_FILES = /esnas/autosubmit/con_files/ # Diagnostics to run, space separated. You must provide for each one the name and the parameters (comma separated) or # an alias defined in the ALIAS section (see more below). If you are using the diagnpostics just to CMORize, leave it # empty -DIAGS = monpercent,atmos,tas,66 monpercent,atmos,tas,33 climpercent,atmos,sfcwind,1 +DIAGS = climpercent,atmos,sfcWind,1 # DIAGS = OHC # Frequency of the data you want to use by default. Some diagnostics do not use this value: i.e. monmean always stores # its results at monthly frequency (obvious) and has a parameter to specify input's frequency. 
diff --git a/earthdiagnostics/statistics/climatologicalpercentile.py b/earthdiagnostics/statistics/climatologicalpercentile.py index 60fd7568..5df47419 100644 --- a/earthdiagnostics/statistics/climatologicalpercentile.py +++ b/earthdiagnostics/statistics/climatologicalpercentile.py @@ -22,7 +22,7 @@ class ClimatologicalPercentile(Diagnostic): alias = 'climpercent' "Diagnostic alias for the configuration file" - def __init__(self, data_manager, domain, variable, leadtimes, experiment_config): + def __init__(self, data_manager, domain, variable, leadtimes, num_bins, experiment_config): Diagnostic.__init__(self, data_manager) self.variable = variable self.domain = domain @@ -32,7 +32,7 @@ class ClimatologicalPercentile(Diagnostic): self.realizations = None self.lat_len = None self.lon_len = None - self.num_bins = 2000 + self.num_bins = num_bins self._bins = None self.percentiles = np.array([0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9]) self.cmor_var = Variable.get_variable(variable, silent=True) @@ -67,14 +67,18 @@ class ClimatologicalPercentile(Diagnostic): if num_options < 3: raise Exception('You must specify the variable (and its domain) and the leadtimes you want to compute ' 'the percentiles on') - if num_options > 3: - raise Exception('You must specify three parameters for the climatological percentiles') + if num_options > 4: + raise Exception('You must specify between three and 4 parameters for the climatological percentiles') domain = Domain(options[1]) variable = options[2] leadtimes = [int(i) for i in options[3].split('-')] + if num_options > 3: + num_bins = int(options[4]) + else: + num_bins = 2000 job_list = list() - job_list.append(ClimatologicalPercentile(diags.data_manager, domain, variable, leadtimes, + job_list.append(ClimatologicalPercentile(diags.data_manager, domain, variable, leadtimes, num_bins, diags.config.experiment)) return job_list -- GitLab
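
Note on the OPeNDAP subsetting these patches introduce: THREDDSSubset assembles a constraint-expression URL of the form <dataset>?dim[start:1:end],...,var[start:1:end]... and hands it to nccopy so that only the requested chunk is pulled from the THREDDS server. The sketch below is illustrative only and is not part of the patch series; the dataset URL, the "mon_mean" frequency folder, the variable name and the index bounds are hypothetical placeholders, and the helper mirrors (but does not reproduce) the logic of THREDDSSubset._get_subset_url.

# Illustrative sketch only -- not part of the patches. It shows the shape of the
# constraint-expression URL built by THREDDSSubset._get_subset_url; all concrete
# values (dataset URL, folder names, index bounds) are hypothetical examples.

def build_subset_url(dataset_url, var, dim_indexes, dim_order):
    """Return '<dataset>?dim1[a:1:b],dim2[c:1:d],...,var[a:1:b][c:1:d]...'."""
    var_slice = var
    dims_slice = ''
    for dim in dim_order:
        start, end = dim_indexes[dim]
        index = '[{0}:1:{1}]'.format(start, end)
        var_slice += index                              # indexes appended to the variable
        dims_slice += '{0}{1},'.format(dim, index)      # and to each coordinate variable
    return '{0}?{1}{2}'.format(dataset_url, dims_slice, var_slice)


if __name__ == '__main__':
    url = build_subset_url(
        'https://earth.bsc.es/thredds/dodsC/recon/ecmwf/erainterim/mon_mean/sfcWind/sfcWind.nc',
        'sfcWind',
        {'time': (0, 11), 'latitude': (0, 255), 'longitude': (0, 511)},
        ('time', 'latitude', 'longitude'))
    print(url)

A URL of this form is then passed to nccopy (nccopy <url> <temp file>) to materialise the subset locally, which is what THREDDSSubset.download() does in the patches.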