From 9b64eb199d487417eabb6577b66b33d45cb29245 Mon Sep 17 00:00:00 2001 From: Calum Meikle Date: Fri, 31 Jan 2025 16:03:58 +0100 Subject: [PATCH 1/6] Added read_nc and multiprocessing functions to the tropomi file --- mapies/tropomi.py | 237 ++++++++++++++++++++++++++++++++++++++++----- run/run_tropomi.py | 26 +++++ 2 files changed, 241 insertions(+), 22 deletions(-) create mode 100644 run/run_tropomi.py diff --git a/mapies/tropomi.py b/mapies/tropomi.py index aaf5cdc6..70c7bc01 100644 --- a/mapies/tropomi.py +++ b/mapies/tropomi.py @@ -26,9 +26,9 @@ class TROPOMI(MAPIES): super().__init__(start_date, end_date) self.time_orig = np.datetime64("1993-01-01T00:00:00") - self.lat=lat - self.lon=lon #self.quality_flag_limit + self.monthly_dict = {} + self.daily_dict = {} # Dictionary to produce daily netcdf files # Add quality value filter number @@ -39,35 +39,228 @@ class TROPOMI(MAPIES): if isinstance(self.end_date, str): self.end_date = time_converter(self.end_date) + self.dates_slice = pd.date_range( + self.start_date, + self.end_date, + ).strftime('%Y%m%d%H%M') + self.read_config(**kwargs) - @timeit - def read_config(self, **kwargs): + @timeit + def read_nc(self): """ - Read yaml config file + Returns list of files needed to be processed by the application """ - module_dir = os.path.dirname(__file__) - if config_file: - try: - config = yaml.safe_load(open(config_file)) - except FileNotFoundError: - logging.error("This is not a config file") - else: - config = yaml.safe_load(open(os.path.join(module_dir, "config/satellite_config.yaml"))) + file_patterns = [] + print(f' Date slice = {self.dates_slice}') + for date in self.dates_slice: + # Convert to Julian day + date = datetime.strptime(date, '%Y%m%d%H%M').strftime('%Y%m%d') + + #TODO add something that takes in the + filepaths = f'{self.indir}/**/S5P_OFFL_L2_*{date}*' + file_patterns.append(filepaths) # Add pattern to the set + + files = sorted(get_file_list(file_patterns)) + print(f"Total number of files: {len(files)}") - variable_dict = config[self.datatype]["variables"] - self.time_var = variable_dict["time_variable"] - self.lon_var = variable_dict["lon_variable"] - self.lat_var = variable_dict["lat_variable"] + # I'm not sure we need this part + files_filtered = [] + for file in files: + parts = file.split('_') + date = parts[8] - # If obs_var passed then run data analysis on that variable if not pull default - obs_var = kwargs.get("obs_var") - if obs_var: - self.obs_var = obs_var + file_dt = datetime.strptime(date, "%Y%m%dT%H%M%S") + if self.start_date <= file_dt <= self.end_date: + files_filtered.append(file) + + + first_idx = max(0, files.index(files_filtered[0]) - 2) + last_idx = min(len(files) - 1, files.index(files_filtered[-1]) + 2) + self.files = files[first_idx : last_idx + 1] + + print(f"Total number of filtered files: {len(self.files)}") + + for file in self.files: + parts = file.split('_') + julian_day = parts[8] + + date = datetime.strptime(julian_day, "%Y%m%dT%H%M%S") + if date.month not in self.monthly_dict: + self.monthly_dict[date.month] = { + "files": [], # List to store file paths + "lon": None, # Longitudes array + "lat": None, # Latitudes array + "obs": None, # Observations array + "count": None, # Count of observations used to compute the average + } + if date.day not in self.daily_dict: + self.daily_dict[date.day] = { + "files": [], # List to store file paths + "lon": None, # Longitudes array + "lat": None, # Latitudes array + "obs": None, # Observations array + "time": None, # Scan Start Time array + } + + # Append 
the file to the list of files for the corresponding month + self.monthly_dict[date.month]["files"].append(file) + self.daily_dict[date.day]["files"].append(file) + + print(f"Discovered files for months: {list(self.monthly_dict.keys())}") + for month, data in self.monthly_dict.items(): + print(f" Month {month:02d}: {len(data['files'])} files") + + print(f"Discovered files for days: {list(self.daily_dict.keys())}") + + # Print the daily disctionary to see if files are stored in chronological order + for day, data in self.daily_dict.items(): + print(f"Day {day:02d}: {len(data['files'])} files") + + + @timeit + def process_data(self, monthly_avg=False, batch_size=100): + """ + Process the data for the specified year and months. + + Parameters: + - monthly_avg: bool, whether to compute single monthly averages. + - batch_size: int, the number of files to process in each batch. + - chunk_size: int, the size of chunks for processing within each batch. + + Returns: + - if monthly_avg --> lon_final, lat_final, obs_final: Aggregated longitude, latitude, and observation arrays. + - if not monthly_avg --> preprocess_vars() is called. + """ + if monthly_avg: + for month in self.monthly_dict.keys(): + # print(f'Month: {self.monthly_dict[month]}') + # Catch if they haven't selected a grid + try: + print(f"Running average on {self.grid}") + except AttributeError: + raise Exception("Please specify a grid in the initialisation of MAPIES, using grid_repr=''") + try: + # Process the data for the current month + final_lon, final_lat, final_obs, cumulative_count = self.process_monthly_data( + batch_size=batch_size, month=month + ) + # final_lon, final_lat, final_obs, cumulative_count = self.process_files_in_blocks(month, block_size=300, batch_size=batch_size) + + # Update the monthly dictionary + self.monthly_dict[month] = { + "lon": final_lon, + "lat": final_lat, + "obs": final_obs, + "count": cumulative_count, + } + print(f'Updated monthly dictionary for month {month}') + print(f'Lon shape: {final_lon.shape}, Lat shape: {final_lat.shape}, Obs shape: {final_obs.shape}') + if np.all(final_lon == 0) or np.all(np.isnan(final_lon)): + print(f"All longitude values for month {month} are 0 or NaN.") + if np.all(final_lat == 0) or np.all(np.isnan(final_lat)): + print(f"All latitude values for month {month} are 0 or NaN.") + if np.all(final_obs == 0) or np.all(np.isnan(final_obs)): + print(f"All observation values for month {month} are 0 or NaN.") + + except Exception as e: + print(f"Error processing data for month {month}: {e}") + continue else: - self.obs_var = variable_dict["obs_variable"] + try: + final_lon, final_lat, final_obs, cumulative_count = self.process_monthly_data( + batch_size=batch_size, month=None + ) + self.lon_values = final_lon + self.lat_values = final_lat + self.obs = final_obs + self.count_obs = cumulative_count + except Exception as e: + print(f"Error processing data for the time period from {self.start_date} to {self.end_date}: {e}") + return + + def process_monthly_data(self, batch_size, month=None): + + """ + Function that returns all the needed variables for the DA + """ + # r = RotatedGrid(centre_lon=20, centre_lat=35, dlon=.1, dlat=.1, west=-51, south=-35) + flag = False + if month is None: + month_files = self.files + print(f'Processing all files from {self.start_date} to {self.end_date}') + else: + month_files = self.monthly_dict[month]["files"] + print(f"Processing {len(month_files)} files for month {month}") + + + # Split files into batches + batches = [ + month_files[i:i + batch_size] 
for i in range(0, len(month_files), batch_size) + ] + + # Initialize cumulative arrays + cumulative_obs = None + cumulative_lat = None + cumulative_lon = None + cumulative_count = None + failed_files = [] + + for batch in batches: + print(f"Processing a batch of {len(batch)} files...") + print(f"Batch: {batch}") + + # Use multiprocessing to process files within the batch + args = [(self.grid, file, self.obs_var, self.lat_var, self.lon_var, self.time_var, self.start_date, self.end_date, self.apply_qa, self.qualityFlags, flag) for file in batch] + + with multiprocessing.Pool(processes=8) as pool: # Adjust number of processes as needed + results = pool.starmap(process_single_file, args) + + # Filter valid results and log failed files + valid_results = [res for res in results if res is not None] + failed_files.extend([file for file, res in zip(batch, results) if res is None]) + + if not valid_results: + print("All files in this batch failed. Skipping...") + continue + + # Aggregate results incrementally + for obs_agg, lat_agg, lon_agg, count_obs in valid_results: + if cumulative_obs is None: + # Initialize cumulative arrays + cumulative_obs = obs_agg * count_obs + cumulative_lat = lat_agg * count_obs + cumulative_lon = lon_agg * count_obs + cumulative_count = count_obs + else: + cumulative_obs += obs_agg * count_obs + cumulative_lat += lat_agg * count_obs + cumulative_lon += lon_agg * count_obs + cumulative_count += count_obs + + if failed_files: + print(f"The following files failed to process: {failed_files}") + + # Check if any valid data was processed + if cumulative_obs is None or cumulative_count is None: + print(f"No valid data processed for month {month}. Returning empty arrays.") + return None, None, None, None + + # Compute final averages + valid_mask = cumulative_count > 0 + final_obs = np.zeros_like(cumulative_obs) + final_lat = np.zeros_like(cumulative_lat) + final_lon = np.zeros_like(cumulative_lon) + + final_obs[valid_mask] = cumulative_obs[valid_mask] / cumulative_count[valid_mask] + final_lat[valid_mask] = cumulative_lat[valid_mask] / cumulative_count[valid_mask] + final_lon[valid_mask] = cumulative_lon[valid_mask] / cumulative_count[valid_mask] + + print(f"Completed processing for month {month}.") + + return final_lon, final_lat, final_obs, cumulative_count @timeit diff --git a/run/run_tropomi.py b/run/run_tropomi.py new file mode 100644 index 00000000..ec5e3514 --- /dev/null +++ b/run/run_tropomi.py @@ -0,0 +1,26 @@ +from mapies.tropomi import TROPOMI +import time +# from dask.distributed import Client + + +if __name__ == "__main__": + + # 00:30 to 03:36 of 2024-01-01 + start_date = "202401010030" + end_date = "202401011636" + + outdir = "/home/cmeikle/Projects/data/" + indir = "/home/cmeikle/Projects/data/TROPOMI" + + start_time = time.time() + + c = TROPOMI(start_date, end_date, dest=outdir, indir=indir, apply_qa=False, grid_repr="rotated") + c.read_nc() + c.process_data(batch_size = 20) + # c.yearly_average() + c.plot_2D_observations(months=[0], outdir=outdir) + c.plot_2D_num_obs(months=[0], outdir=outdir) + c.process_lazy_data() + end_time = time.time() + elapsed_time = end_time - start_time + print(f"Script executed in {elapsed_time:.2f} seconds.") -- GitLab From a1ce818eec46b68b056206f6afa128ac5a2088e2 Mon Sep 17 00:00:00 2001 From: Calum Meikle Date: Tue, 4 Feb 2025 18:35:53 +0100 Subject: [PATCH 2/6] Getting tropomi to work --- mapies/config/satellite_config.yaml | 18 ++- mapies/mapies.py | 4 +- mapies/tropomi.py | 243 ++++++++++++++++++---------- 
mapies/util/func_tools.py | 65 ++++---- mapies/viirs.py | 5 +- run/run_tropomi.py | 10 +- run/run_viirs.py | 8 +- 7 files changed, 222 insertions(+), 131 deletions(-) diff --git a/mapies/config/satellite_config.yaml b/mapies/config/satellite_config.yaml index 847a47d7..af4837f2 100644 --- a/mapies/config/satellite_config.yaml +++ b/mapies/config/satellite_config.yaml @@ -47,11 +47,27 @@ tropomi: lon_variable: "longitude" lat_variable: "latitude" obs_variable: "nitrogendioxide_tropospheric_column" + grids: + grid_repr: "" # default grid repr + rotated: + centre_lat: 35 + centre_lon: 20 + west: -51 + south: -35 + dlon: 0.1 + dlat: 0.1 + regular: + west: -180 + south: -90 + dlon: 0.1 + dlat: 0.1 + #nlon: 3600 + #nlat: 1800 qa: qa_variables: qa_value: qa_name: "qa_value" - qa_values: [0.5] + qa_values: [1] #[0.62, 0.65999997, 0.66999996, 0.69, 0.72999996, 0.74, 0.78999996, 0.84, 0.88, 0.9, 0.93, 1.] qa_combined: True diff --git a/mapies/mapies.py b/mapies/mapies.py index b2466f42..a749d753 100644 --- a/mapies/mapies.py +++ b/mapies/mapies.py @@ -40,6 +40,7 @@ class MAPIES: self.time_orig = np.datetime64("1900-01-01T00:00:00") self.start_date = start_date self.end_date = end_date + self.grid_dict = { @@ -355,7 +356,8 @@ class MAPIES: proj = ccrs.PlateCarree() # Define discrete levels for the color bar - levels = [0.02, 0.04, 0.06, 0.08, 0.10, 0.15, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 1.00] + #levels = [0.02, 0.04, 0.06, 0.08, 0.10, 0.15, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 1.00] + levels = [0.0002, 0.0004, 0.0006, 0.0008, 0.0010, 0.0015, 0.0020, 0.0030, 0.0040, 0.0050, 0.0060, 0.0070, 0.0080, 0.01] # Define the custom color list colors = [ diff --git a/mapies/tropomi.py b/mapies/tropomi.py index 70c7bc01..1c10ac78 100644 --- a/mapies/tropomi.py +++ b/mapies/tropomi.py @@ -9,10 +9,45 @@ from functools import wraps from mapies.mapies import MAPIES from mapies.util.func_tools import timeit, get_file_list, time_converter, inflate_array import time +import multiprocessing import numpy as np import pandas as pd import xarray as xr +import os +from mapies.util.func_tools import * +#timeit, get_file_list, time_converter, frequency_int, error_estimation, time_domain_selection, geo_to_rot +from mapies.grids.monarch import RotatedGrid, IrregularRotatedGrid, RegularGrid, regridding_function + +def process_single_file(r, file, obs_var, lat_var, lon_var, time_var, start_date, end_date, apply_qa, qualityFlags, flag=False): + """ + Process a single file and return aggregated results or flattened arrays. 
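+    Returns (obs_agg, lat_agg, lon_agg, count_obs) after regridding when
+    flag is False, the flattened (obs, lat, lon, time_values) arrays when
+    flag is True, and None if the file could not be read or processed.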
+ """ + try: + ds = xr.open_dataset(file, engine="h5netcdf", group="PRODUCT") + + obs, lat, lon, time_values, time_values_index = new_preprocess_vars(ds, obs_var, lat_var, lon_var, time_var, start_date, end_date) + # obs = ds[obs_var].values.flatten() + # lat = ds[lat_var].values.flatten() + + if apply_qa: + obs, lat, lon, time_values = quality_assurance(ds, obs, lat, lon, time_values, qualityFlags) + + if not flag: + # Regrid and aggregate + if isinstance(r, RotatedGrid): + lon_agg, lat_agg, rlon_agg, rlat_agg, obs_agg, count_obs = regridding_function(r, lon, lat, obs) + elif isinstance(r, RegularGrid): + lon_agg, lat_agg, obs_agg, count_obs = regridding_function(r, lon, lat, obs) + ds.close() + return obs_agg, lat_agg, lon_agg, count_obs + else: + ds.close() + return obs, lat, lon, time_values + + except Exception as e: + print(f"Error processing file {file}: {e}") + return None class TROPOMI(MAPIES): """ @@ -26,9 +61,15 @@ class TROPOMI(MAPIES): super().__init__(start_date, end_date) self.time_orig = np.datetime64("1993-01-01T00:00:00") + self.dest = kwargs.get("dest") + if not os.path.exists(self.dest): + raise Exception("Output directory doesn't exist") + self.indir = kwargs.get("indir") + self.year = int(start_date[:4]) #self.quality_flag_limit self.monthly_dict = {} self.daily_dict = {} # Dictionary to produce daily netcdf files + self.datatype = "tropomi" # Add quality value filter number @@ -48,7 +89,7 @@ class TROPOMI(MAPIES): self.read_config(**kwargs) - @timeit + @timeit def read_nc(self): """ Returns list of files needed to be processed by the application @@ -102,7 +143,7 @@ class TROPOMI(MAPIES): "lon": None, # Longitudes array "lat": None, # Latitudes array "obs": None, # Observations array - "time": None, # Scan Start Time array + "time": None, # Time array } # Append the file to the list of files for the corresponding month @@ -120,7 +161,7 @@ class TROPOMI(MAPIES): print(f"Day {day:02d}: {len(data['files'])} files") - @timeit + @timeit def process_data(self, monthly_avg=False, batch_size=100): """ Process the data for the specified year and months. 
@@ -181,7 +222,7 @@ class TROPOMI(MAPIES): print(f"Error processing data for the time period from {self.start_date} to {self.end_date}: {e}") return - def process_monthly_data(self, batch_size, month=None): + def process_monthly_data(self, batch_size, month=None): """ Function that returns all the needed variables for the DA @@ -263,87 +304,123 @@ class TROPOMI(MAPIES): return final_lon, final_lat, final_obs, cumulative_count - @timeit - def preprocess_vars(self): - """ - Preprocessing of the dataset - """ - super().preprocess_vars() - - - # no2 column values - obs_dims = self.ds[self.obs_var].dims - obs_shape = self.ds[self.obs_var].shape - obs_attrs = self.ds[self.obs_var].attrs - - # Duplicate values in the time array to get the same shape as the flattened variable array - self.time_values = inflate_array(self.time_values, self.time_shape, no2_shape) - self.time_values_index = inflate_array(self.time_values_index, self.time_shape, obs_shape) - # Lon and lat values - lon_dims = self.ds[self.lon_var].dims - lon_shape = self.ds[self.lon_var].shape - lon_attrs = self.ds[self.lon_var].attrs - self.lon_values = self.ds[self.lon_var].values.flatten() - - # Reindex lon column values - self.lon_values = self.reindex( - self.time_values_index, - self.lon_values, - ) + def plot_2D_observations(self, months=[0], filename=None, outdir=None): - lat_dims = self.ds[self.lat_var].dims - lat_shape = self.ds[self.lat_var].shape - lat_attrs = self.ds[self.lat_var].attrs - self.lat_values = self.ds[self.lat_var].values.flatten() - - # Reindex aod column values - self.lat_values = self.reindex( - self.time_values_index, - self.lat_values, + if months == [0]: + try: + lon_values = self.lon_values + lat_values = self.lat_values + obs_values = self.obs + except Exception as e: + print(f"Error plotting all data: {e}") + return + title = f"Observation 2D plot from {self.start_date} to {self.end_date}" + filename = filename or f"{self.datatype}_2D_obs_from_{self.start_date}_to_{self.end_date}.png" + super().plot_2D_obs_custom( + lon_values=lon_values, + lat_values=lat_values, + obs_values=obs_values, + outdir=outdir, + title=title, + filename=filename, ) - - # TODO: Check if this works well, if the inflated values are along the right dimensions - self.obs = self.ds[self.obs_var].values.flatten() - - # Reindex no2 column values - self.obs = self.reindex( - self.time_values_index, - self.obs, + return + elif months is not None: + for m in months: + try: + month_data = self.monthly_dict[m] + lon_values = month_data["lon"] + lat_values = month_data["lat"] + obs_values = month_data["obs"] + except Exception as e: + print(f"Error for month {m}: {e}") + continue + title = f"Observation 2D plot of month {m} from {self.start_date} to {self.end_date}" + filename = filename or f"{self.datatype}_2D_obs_month_{m}_year_{self.year}.png" + super().plot_2D_obs_custom( + lon_values=lon_values, + lat_values=lat_values, + obs_values=obs_values, + outdir=outdir, + title=title, + filename=filename, + ) + else: + try: + lon_values = self.yearly_lon + lat_values = self.yearly_lat + obs_values = self.yearly_obs + except Exception as e: + print(f"Error for yearly data: {e}") + return + title = f"Observation 2D plot of yearly average of year {self.year}" + filename = filename or f"{self.datatype}_2D_yearly_obs_{self.year}.png" + super().plot_2D_obs_custom( + lon_values=lon_values, + lat_values=lat_values, + obs_values=obs_values, + outdir=outdir, + title=title, + filename=filename, ) + + def plot_2D_num_obs(self, months=[0], filename=None, 
outdir=None): - - @timeit - def read_nc(self): - """ - Read netcdf files with xarray - """ - - files = get_file_list("/esarchive/obs/sentinel/tropomi/original_files/tropomi_OFFL_NO2/S5P_OFFL_L2__NO2____20230101T220249_20230101*.nc") - print(files) - - - # Open dataset with xarray and dask - # Tropomi requires you choose the group - # If we want to read anything other than a single tropomi dataset we will need to do it in parallel - self.ds = xr.open_mfdataset(files, group="PRODUCT") - - - @timeit - def to_plumes(self): - """ - Restructing the data to be passed to calculate the plumes - """ - - coords = coords = dict(time=time_values,) - - ds = self.to_xarray(coords=coords, data_vars=data_vars) - - - -if __name__ == "__main__": - start_date = "202301011430" - end_date = "202401011531" - c = TROPOMI(start_date, end_date) - c.read_nc() - c.preprocess_vars() \ No newline at end of file + if months == [0]: + try: + lon_values = self.lon_values + lat_values = self.lat_values + cum_count = self.count_obs + except Exception as e: + print(f"Error plotting all data: {e}") + return + title = f"Plot of the number of valid observations from {self.start_date} to {self.end_date}" + filename = filename or f"{self.datatype}_2D_obs_count_from_{self.start_date}_to_{self.end_date}.png" + super().plot_2D_num_obs_custom( + lon_values=lon_values, + lat_values=lat_values, + num_obs=cum_count, + outdir=outdir, + title=title, + filename=filename, + ) + return + elif months is not None: + for m in months: + try: + month_data = self.monthly_dict[m] + lon_values = month_data["lon"] + lat_values = month_data["lat"] + cum_count = month_data["count"] + except Exception as e: + print(f"Error for month {m}: {e}") + continue + title = f"Plot of the number of valid observations for month {m} from {self.start_date} to {self.end_date}" + filename = filename or f"{self.datatype}_2D_obs_count_month_{m}_year_{self.year}.png" + super().plot_2D_num_obs_custom( + lon_values=lon_values, + lat_values=lat_values, + num_obs=cum_count, + outdir=outdir, + title=title, + filename=filename, + ) + else: + try: + lon_values = self.yearly_lon + lat_values = self.yearly_lat + cum_count = self.yearly_count + except Exception as e: + print(f"Error for yearly data: {e}") + return + title = f"Plot of the number of valid observations for yearly average of {self.year}" + filename = filename or f"{outdir}/{self.datatype}_2D_obs_count_yearly_{self.year}.png" + super().plot_2D_num_obs_custom( + lon_values=lon_values, + lat_values=lat_values, + num_obs=cum_count, + outdir=outdir, + title=title, + filename=filename, + ) diff --git a/mapies/util/func_tools.py b/mapies/util/func_tools.py index b0b634c3..145b83fe 100644 --- a/mapies/util/func_tools.py +++ b/mapies/util/func_tools.py @@ -224,8 +224,38 @@ def new_preprocess_vars(ds, obs_var, lat_var, lon_var, time_var, start_date, end """ Preprocessing of the dataset """ - time_values, time_values_index = preprocess_vars_mapies(ds, time_var, start_date, end_date) + # Get all info about time columns + time_dims = ds[time_var].dims + time_shape = ds[time_var].shape + time_attrs = ds[time_var].attrs # Useful if we need to convert to datetime + + # Get time values flatten and convert to datetime values + time_values = ds[time_var].values + time_values = time_values.flatten() + + if time_values.dtype == "timedelta64[ns]": + logging.info("Adding time origin to time values as the time variable is in timedelta") + time_values = np.add(TIME_ORIGIN, time_values) + time_values = pd.to_datetime(time_values) + else: + 
time_values = pd.to_datetime(time_values) + + + # Time domain selection + # print(f'Time values before selection: {time_values}') + time_values, time_values_index = time_domain_selection(time_values, start_date, end_date) + # print(f'Time values after selection: {time_values}')obs_dims = ds[obs_var].dims + + obs_shape = ds[obs_var].shape + obs_attrs = ds[obs_var].attrs + + # Duplicate values in the time array to get the same shape as the flattened variable array + if time_shape != obs_shape: + time_values = inflate_array(time_values, time_shape, obs_shape) + time_values_index = inflate_array(time_values_index, time_shape, obs_shape) + time_values, time_values_index = time_domain_selection(time_values, start_date, end_date) + # Lon and lat values lon_dims = ds[lon_var].dims @@ -252,10 +282,7 @@ def new_preprocess_vars(ds, obs_var, lat_var, lon_var, time_var, start_date, end # AOD column values, default "Aerosol_Optical_Thickness_550_Land_Ocean_Best_Estimate" - aod_dims = ds[obs_var].dims - - aod_shape = ds[obs_var].shape - aod_attrs = ds[obs_var].attrs + obs = ds[obs_var].values.flatten() # Reindex aod column values @@ -266,34 +293,6 @@ def new_preprocess_vars(ds, obs_var, lat_var, lon_var, time_var, start_date, end return obs, lat_values, lon_values, time_values, time_values_index -def preprocess_vars_mapies(ds, time_var, start_date, end_date): - """ - Preprocessing of the dataset - """ - # Get all info about time columns - time_dims = ds[time_var].dims - time_shape = ds[time_var].shape - time_attrs = ds[time_var].attrs # Useful if we need to convert to datetime - - - # Get time values flatten and convert to datetime values - time_values = ds[time_var].values - time_values = time_values.flatten() - - if time_values.dtype == "timedelta64[ns]": - logging.info("Adding time origin to time values as the time variable is in timedelta") - time_values = np.add(TIME_ORIGIN, time_values) - time_values = pd.to_datetime(time_values) - else: - time_values = pd.to_datetime(time_values) - - - # Time domain selection - # print(f'Time values before selection: {time_values}') - time_values, time_values_index = time_domain_selection(time_values, start_date, end_date) - # print(f'Time values after selection: {time_values}') - - return time_values, time_values_index def reindex( dependent_var_index, diff --git a/mapies/viirs.py b/mapies/viirs.py index 6feb60e2..dd37af6a 100644 --- a/mapies/viirs.py +++ b/mapies/viirs.py @@ -42,7 +42,7 @@ def process_single_file(r, file, obs_var, lat_var, lon_var, time_var, start_date Process a single file and return aggregated results or flattened arrays. 
""" try: - ds = xr.open_dataset(file, engine="h5netcdf") + ds = xr.open_dataset(file, engine="h5netcdf") obs, lat, lon, time_values, time_values_index = new_preprocess_vars(ds, obs_var, lat_var, lon_var, time_var, start_date, end_date) # obs = ds[obs_var].values.flatten() @@ -95,13 +95,12 @@ class VIIRS(MAPIES): self.time_orig = np.datetime64("1993-01-01T00:00:00") self.datatype="viirs" - frequency = kwargs.get("frequency") self.dest = kwargs.get("dest") if not os.path.exists(self.dest): raise Exception("Output directory doesn't exist") self.indir = kwargs.get("indir") - self.year = int(start_date[:4]) + self.year = int(start_date[:4]) self.start_day = int(start_date[6:8]) self.end_day = int(end_date[6:8]) self.monthly_dict = {} diff --git a/run/run_tropomi.py b/run/run_tropomi.py index ec5e3514..d8084fff 100644 --- a/run/run_tropomi.py +++ b/run/run_tropomi.py @@ -1,4 +1,5 @@ from mapies.tropomi import TROPOMI +import numpy as np import time # from dask.distributed import Client @@ -6,21 +7,22 @@ import time if __name__ == "__main__": # 00:30 to 03:36 of 2024-01-01 - start_date = "202401010030" - end_date = "202401011636" + start_date = "202301010000" + end_date = "202301020000" outdir = "/home/cmeikle/Projects/data/" indir = "/home/cmeikle/Projects/data/TROPOMI" start_time = time.time() - c = TROPOMI(start_date, end_date, dest=outdir, indir=indir, apply_qa=False, grid_repr="rotated") + c = TROPOMI(start_date, end_date, dest=outdir, indir=indir, apply_qa=True, grid_repr="regular") c.read_nc() c.process_data(batch_size = 20) + # c.yearly_average() c.plot_2D_observations(months=[0], outdir=outdir) c.plot_2D_num_obs(months=[0], outdir=outdir) - c.process_lazy_data() + #c.process_lazy_data() end_time = time.time() elapsed_time = end_time - start_time print(f"Script executed in {elapsed_time:.2f} seconds.") diff --git a/run/run_viirs.py b/run/run_viirs.py index bd8b1504..4ef7e11a 100644 --- a/run/run_viirs.py +++ b/run/run_viirs.py @@ -21,12 +21,8 @@ if __name__ == "__main__": # ONE MONTH - JANUARY 2024 # start_date = "202401010000" # end_date = "202401312359" - - # outdir="/home/cmeikle/Projects/data/" - # indir="/home/cmeikle/Projects/data/VIIRS/original_files/AERDB_L2_VIIRS_NOAA20" - outdir = "/home/cgile/Documents/mapies/figures" - # indir = "/home/cgile/bscearth000/esarchive/obs/nasa/viirs_noaa20_aerdb_l2/original_files/VIIRS" - indir = "/home/cgile/Documents/mapies/VIIRS" + outdir = "/home/cmeikle/Projects/data/" + indir = "/home/cmeikle/Projects/data/VIIRS" filename_obs = 'carlotta' filename_num_obs = 'carlotta' -- GitLab From ee9490076e903e19ba396d85178c65e405327bd1 Mon Sep 17 00:00:00 2001 From: Calum Meikle Date: Wed, 5 Feb 2025 17:18:07 +0100 Subject: [PATCH 3/6] Initial construction of spatial cropping func --- mapies/tropomi.py | 4 +- mapies/util/func_tools.py | 79 +++++++++++++++++++++++++++------------ run/run_tropomi.py | 4 +- 3 files changed, 61 insertions(+), 26 deletions(-) diff --git a/mapies/tropomi.py b/mapies/tropomi.py index 1c10ac78..8305b02a 100644 --- a/mapies/tropomi.py +++ b/mapies/tropomi.py @@ -1,7 +1,6 @@ #!/usr/bin/env python from dataclasses import dataclass, field from typing import List, Tuple -from netCDF4 import Dataset, num2date, chartostring from datetime import datetime, timedelta from functools import wraps @@ -218,6 +217,9 @@ class TROPOMI(MAPIES): self.lat_values = final_lat self.obs = final_obs self.count_obs = cumulative_count + print("Here") + # move this + spatial_domain_selection(self.lat_values, self.lon_values, "ger", self.obs, [[5, 10], [5, 
10]]) except Exception as e: print(f"Error processing data for the time period from {self.start_date} to {self.end_date}: {e}") return diff --git a/mapies/util/func_tools.py b/mapies/util/func_tools.py index 145b83fe..95b62338 100644 --- a/mapies/util/func_tools.py +++ b/mapies/util/func_tools.py @@ -82,33 +82,66 @@ def time_converter(date:str) -> pd.Timestamp: return date def quality_assurance(ds, obs, lat, lon, time_values, qualityFlags): - """ - Applying quality assurance flags to mapies data - """ - mask = np.zeros_like(obs, dtype=bool) - - for i, (flag, info) in enumerate(qualityFlags.items()): - - flag_mask = np.zeros_like(obs, dtype=bool) - arr = ds[info["qa_name"]].values.flatten() - for value in info["qa_values"]: - flag_mask |= (arr == value) - - if i == 0: - mask = flag_mask + """ + Applying quality assurance flags to mapies data + """ + mask = np.zeros_like(obs, dtype=bool) + + for i, (flag, info) in enumerate(qualityFlags.items()): + + flag_mask = np.zeros_like(obs, dtype=bool) + arr = ds[info["qa_name"]].values.flatten() + for value in info["qa_values"]: + flag_mask |= (arr == value) + + if i == 0: + mask = flag_mask + else: + if info["qa_combined"]: + mask &= flag_mask else: - if info["qa_combined"]: - mask &= flag_mask - else: - mask |= flag_mask + mask |= flag_mask + + obs = obs[mask] + lat = lat[mask] + lon = lon[mask] + time_values = time_values[mask] + + + return obs, lat, lon, time_values + +def spatial_domain_selection(lat, lon, time, obs, spatial_bounds): + """ + A function for cutting the data spatially + + :param: spatial_bounds - the boundaries that you would like to crop te data by [[lon], [lat]] e.g. [[0, 20], [10, 12]] + """ + # Check that the spatial bounds are 2 dimensional + if len(spatial_bounds) != 2: + raise Exception("Spatial bounds passed are not 2 dimensional, make sure to pass [[left_lon, right_lon], [left_lat, right_lat]]") + + # Get the bounds + lon_l_bound, lon_r_bound = spatial_bounds[0] + lat_l_bound, lat_r_bound = spatial_bounds[1] + + # Crop the spatial variabes + lon_mask = np.logical_and(lon>=lon_l_bound, lon<=lon_r_bound) + lat_mask = np.logical_and(lon>=lon_l_bound, lon<=lon_r_bound) + print(lon_mask) + print(lat_mask) + print(lon) + lon = lon[lon_mask & lat_mask] + lat = lat[lon_mask & lat_mask] + time = time[lon_mask & lat_mask] + obs = obs[lon_mask & lat_mask] + + print(lon) + + #mask the other variables - obs = obs[mask] - lat = lat[mask] - lon = lon[mask] - time_values = time_values[mask] - return obs, lat, lon, time_values + print("woooo") # Time Domain Selection function based off of xarray Will this work with dask too, a question for later def time_domain_selection( diff --git a/run/run_tropomi.py b/run/run_tropomi.py index d8084fff..666300be 100644 --- a/run/run_tropomi.py +++ b/run/run_tropomi.py @@ -10,8 +10,8 @@ if __name__ == "__main__": start_date = "202301010000" end_date = "202301020000" - outdir = "/home/cmeikle/Projects/data/" - indir = "/home/cmeikle/Projects/data/TROPOMI" + outdir = "/Users/calummeikle/BSC/data/" + indir = "/Users/calummeikle/BSC/data/TROPOMI" start_time = time.time() -- GitLab From 1bf0acc527e7f5f49bdd4300a24e6a590807e6c0 Mon Sep 17 00:00:00 2001 From: Calum Meikle Date: Thu, 6 Feb 2025 14:46:51 +0100 Subject: [PATCH 4/6] Spatial cropping --- mapies/mapies.py | 11 ++++++++--- mapies/tropomi.py | 32 +++++++++++++++++++++++++------- mapies/util/func_tools.py | 23 ++++++++++------------- run/run_tropomi.py | 6 +++--- run/run_viirs.py | 4 ---- 5 files changed, 46 insertions(+), 30 deletions(-) diff --git 
a/mapies/mapies.py b/mapies/mapies.py index c883a5a0..2726c5e2 100644 --- a/mapies/mapies.py +++ b/mapies/mapies.py @@ -190,8 +190,13 @@ class MAPIES: proj = ccrs.PlateCarree() # Define discrete levels for the color bar - levels = [0.02, 0.04, 0.06, 0.08, 0.10, 0.15, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 1.00] - + if self.datatype == "viirs": + label = "AOD" + levels = [0.02, 0.04, 0.06, 0.08, 0.10, 0.15, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 1.00] + elif self.datatype == "tropomi": + #TODO remove hard coding + label = "NO2" + levels = [0.00001, 0.00002, 0.00003, 0.00004, 0.00005, 0.00006, 0.00007, 0.00008, 0.00009, 0.00010, 0.00015, 0.00020, 0.00030, 0.00040] # Define the custom color list colors = [ '#290AD8', '#2645FA', '#3A91FF', '#66CBFF', '#99EEFF', @@ -217,7 +222,7 @@ class MAPIES: # Add the color bar cbar = fig.colorbar(im, ax=ax, orientation="vertical", pad=0.05, ticks=levels, extend="both") - cbar.set_label("AOD") + cbar.set_label(label) cbar.ax.set_yticklabels([f"{level:.2f}" for level in levels]) # Format tick labels # Add title and save the plot diff --git a/mapies/tropomi.py b/mapies/tropomi.py index 8305b02a..960aaf5b 100644 --- a/mapies/tropomi.py +++ b/mapies/tropomi.py @@ -18,7 +18,20 @@ from mapies.util.func_tools import * #timeit, get_file_list, time_converter, frequency_int, error_estimation, time_domain_selection, geo_to_rot from mapies.grids.monarch import RotatedGrid, IrregularRotatedGrid, RegularGrid, regridding_function -def process_single_file(r, file, obs_var, lat_var, lon_var, time_var, start_date, end_date, apply_qa, qualityFlags, flag=False): +def process_single_file( + r, + file, + obs_var, + lat_var, + lon_var, + time_var, + start_date, + end_date, + apply_qa, + qualityFlags, + flag=False, + geospatial_crop=None + ): """ Process a single file and return aggregated results or flattened arrays. """ @@ -31,7 +44,9 @@ def process_single_file(r, file, obs_var, lat_var, lon_var, time_var, start_date if apply_qa: obs, lat, lon, time_values = quality_assurance(ds, obs, lat, lon, time_values, qualityFlags) - + if geospatial_crop is not None: + obs, lat, lon, time_values = spatial_domain_selection(obs, lat, lon, time_values, geospatial_crop) + if not flag: # Regrid and aggregate if isinstance(r, RotatedGrid): @@ -70,6 +85,8 @@ class TROPOMI(MAPIES): self.daily_dict = {} # Dictionary to produce daily netcdf files self.datatype = "tropomi" + self.geospatial_crop = None + # Add quality value filter number @@ -161,7 +178,7 @@ class TROPOMI(MAPIES): @timeit - def process_data(self, monthly_avg=False, batch_size=100): + def process_data(self, monthly_avg=False, batch_size=100, geospatial_crop=None): """ Process the data for the specified year and months. @@ -169,11 +186,14 @@ class TROPOMI(MAPIES): - monthly_avg: bool, whether to compute single monthly averages. - batch_size: int, the number of files to process in each batch. - chunk_size: int, the size of chunks for processing within each batch. + - geospatial_crop, 2D numpy array Returns: - if monthly_avg --> lon_final, lat_final, obs_final: Aggregated longitude, latitude, and observation arrays. - if not monthly_avg --> preprocess_vars() is called. 
""" + if geospatial_crop is not None: + self.geospatial_crop = np.array(geospatial_crop) if monthly_avg: for month in self.monthly_dict.keys(): # print(f'Month: {self.monthly_dict[month]}') @@ -217,9 +237,7 @@ class TROPOMI(MAPIES): self.lat_values = final_lat self.obs = final_obs self.count_obs = cumulative_count - print("Here") - # move this - spatial_domain_selection(self.lat_values, self.lon_values, "ger", self.obs, [[5, 10], [5, 10]]) + except Exception as e: print(f"Error processing data for the time period from {self.start_date} to {self.end_date}: {e}") return @@ -256,7 +274,7 @@ class TROPOMI(MAPIES): print(f"Batch: {batch}") # Use multiprocessing to process files within the batch - args = [(self.grid, file, self.obs_var, self.lat_var, self.lon_var, self.time_var, self.start_date, self.end_date, self.apply_qa, self.qualityFlags, flag) for file in batch] + args = [(self.grid, file, self.obs_var, self.lat_var, self.lon_var, self.time_var, self.start_date, self.end_date, self.apply_qa, self.qualityFlags, flag, self.geospatial_crop) for file in batch] with multiprocessing.Pool(processes=8) as pool: # Adjust number of processes as needed results = pool.starmap(process_single_file, args) diff --git a/mapies/util/func_tools.py b/mapies/util/func_tools.py index 95b62338..e30f091c 100644 --- a/mapies/util/func_tools.py +++ b/mapies/util/func_tools.py @@ -110,7 +110,7 @@ def quality_assurance(ds, obs, lat, lon, time_values, qualityFlags): return obs, lat, lon, time_values -def spatial_domain_selection(lat, lon, time, obs, spatial_bounds): +def spatial_domain_selection(obs, lat, lon, time, spatial_bounds): """ A function for cutting the data spatially @@ -126,22 +126,20 @@ def spatial_domain_selection(lat, lon, time, obs, spatial_bounds): # Crop the spatial variabes lon_mask = np.logical_and(lon>=lon_l_bound, lon<=lon_r_bound) - lat_mask = np.logical_and(lon>=lon_l_bound, lon<=lon_r_bound) - print(lon_mask) - print(lat_mask) - print(lon) + lat_mask = np.logical_and(lat>=lat_l_bound, lat<=lat_r_bound) + + # Masking the data lon = lon[lon_mask & lat_mask] lat = lat[lon_mask & lat_mask] time = time[lon_mask & lat_mask] obs = obs[lon_mask & lat_mask] - - print(lon) - - #mask the other variables - - - print("woooo") + assert len(lon) == len(lat) + assert len(obs) == len(time) + assert len(lon) == len(obs) + + return obs, lat, lon, time + # Time Domain Selection function based off of xarray Will this work with dask too, a question for later def time_domain_selection( @@ -158,7 +156,6 @@ def time_domain_selection( :return: a cut down numpy array """ - # But the format of the time column may change aaaaaaaghh # For these if it is a string we need to know what the format is (maybe pandas figures that out) if isinstance(start_date, str): diff --git a/run/run_tropomi.py b/run/run_tropomi.py index 666300be..909285a5 100644 --- a/run/run_tropomi.py +++ b/run/run_tropomi.py @@ -10,14 +10,14 @@ if __name__ == "__main__": start_date = "202301010000" end_date = "202301020000" - outdir = "/Users/calummeikle/BSC/data/" - indir = "/Users/calummeikle/BSC/data/TROPOMI" + outdir = "/home/cmeikle/Projects/data/" + indir = "/home/cmeikle/Projects/data/TROPOMI" start_time = time.time() c = TROPOMI(start_date, end_date, dest=outdir, indir=indir, apply_qa=True, grid_repr="regular") c.read_nc() - c.process_data(batch_size = 20) + c.process_data(batch_size = 20, geospatial_crop=[[-30, 30], [-20, 60]]) # c.yearly_average() c.plot_2D_observations(months=[0], outdir=outdir) diff --git a/run/run_viirs.py b/run/run_viirs.py 
index a7a179c7..89ca6aa6 100644
--- a/run/run_viirs.py
+++ b/run/run_viirs.py
@@ -25,10 +25,6 @@ if __name__ == "__main__":
 
     outdir = "/home/cmeikle/Projects/data/"
     indir = "/home/cmeikle/Projects/data/VIIRS"
-
-    filename_obs = 'atarri_obs'
-    filename_num_obs = 'atarri_num_obs'
-
     start_time = time.time()
 
     c = VIIRS(start_date, end_date, dest=outdir, indir=indir, apply_qa=False, grid_repr="rotated")
-- GitLab
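The spatial cropping wired in by this patch comes down to two boolean masks that are combined once and applied to every flattened array, so obs, lat, lon and time stay aligned element-for-element. A minimal sketch of that logic with synthetic arrays (sizes and values are illustrative; the bounds are the ones used in run_tropomi.py above):

```python
import numpy as np

# Synthetic flattened satellite arrays, standing in for the output of
# new_preprocess_vars (illustrative only).
rng = np.random.default_rng(42)
n = 1_000
lon = rng.uniform(-180.0, 180.0, n)
lat = rng.uniform(-90.0, 90.0, n)
obs = rng.random(n)
time = np.arange(n)

# Same shape as the geospatial_crop argument:
# [[lon_min, lon_max], [lat_min, lat_max]]
(lon_min, lon_max), (lat_min, lat_max) = [[-30, 30], [-20, 60]]

# One mask per axis, combined once and applied to every variable so the
# arrays stay aligned.
mask = (lon >= lon_min) & (lon <= lon_max) & (lat >= lat_min) & (lat <= lat_max)
obs, lat, lon, time = obs[mask], lat[mask], lon[mask], time[mask]

assert len(obs) == len(lat) == len(lon) == len(time)
```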
From 3b1a39eef8dce516f19115df12a6480c24857c7f Mon Sep 17 00:00:00 2001
From: Calum Meikle
Date: Fri, 7 Feb 2025 15:14:32 +0100
Subject: [PATCH 5/6] Updating spatial cropping and processes for running of
 tropomi

---
 mapies/tropomi.py  |  7 +++--
 mapies/viirs.py    | 74 ----------------------------------------------
 run/run_tropomi.py | 18 +++++------
 run/run_viirs.py   |  6 ++----
 4 files changed, 14 insertions(+), 91 deletions(-)

diff --git a/mapies/tropomi.py b/mapies/tropomi.py
index 960aaf5b..9f50f1c4 100644
--- a/mapies/tropomi.py
+++ b/mapies/tropomi.py
@@ -117,17 +117,18 @@ class TROPOMI(MAPIES):
             date = datetime.strptime(date, '%Y%m%d%H%M').strftime('%Y%m%d')
 
             #TODO add something that takes in the
-            filepaths = f'{self.indir}/**/S5P_OFFL_L2_*{date}*'
+            filepaths = f'{self.indir}/**/S5P_OFFL_L2__NO2____{date}*'
             file_patterns.append(filepaths)  # Add pattern to the set
 
         files = sorted(get_file_list(file_patterns))
         print(f"Total number of files: {len(files)}")
 
+
         # I'm not sure we need this part
         files_filtered = []
         for file in files:
             parts = file.split('_')
-            date = parts[8]
+            date = parts[-6]
 
             file_dt = datetime.strptime(date, "%Y%m%dT%H%M%S")
             if self.start_date <= file_dt <= self.end_date:
@@ -142,7 +143,7 @@ class TROPOMI(MAPIES):
 
         for file in self.files:
             parts = file.split('_')
-            julian_day = parts[8]
+            julian_day = parts[-6]
 
             date = datetime.strptime(julian_day, "%Y%m%dT%H%M%S")
             if date.month not in self.monthly_dict:
diff --git a/mapies/viirs.py b/mapies/viirs.py
index 72d68301..4b170fa8 100644
--- a/mapies/viirs.py
+++ b/mapies/viirs.py
@@ -138,80 +138,6 @@ class VIIRS(MAPIES):
 
         self.unc_const = da_dict["uncertainty_constants"] #float, int, list, dict
 
-    @timeit
-    def preprocess_vars(self):
-        """
-        Preprocessing of the dataset
-        """
-        print(f'Inside preprocess_vars')
-        super().preprocess_vars()
-
-
-        # Lon and lat values
-        lon_dims = self.ds[self.lon_var].dims
-        lon_shape = self.ds[self.lon_var].shape
-        lon_attrs = self.ds[self.lon_var].attrs
-        self.lon_values = self.ds[self.lon_var].values.flatten()
-
-        # Reindex lon column values
-        self.lon_values = self.reindex(
-            self.time_values_index,
-            self.lon_values,
-        )
-
-        lat_dims = self.ds[self.lat_var].dims
-        lat_shape = self.ds[self.lat_var].shape
-        lat_attrs = self.ds[self.lat_var].attrs
-        self.lat_values = self.ds[self.lat_var].values.flatten()
-
-        # Reindex aod column values
-        self.lat_values = self.reindex(
-            self.time_values_index,
-            self.lat_values,
-        )
-
-
-        # AOD column values, default "Aerosol_Optical_Thickness_550_Land_Ocean_Best_Estimate"
-        aod_dims = self.ds[self.obs_var].dims
-
-        aod_shape = self.ds[self.obs_var].shape
-        aod_attrs = self.ds[self.obs_var].attrs
-        self.obs = self.ds[self.obs_var].values.flatten()
-
-        # Reindex aod column values
-        self.obs = self.reindex(
-            self.time_values_index,
-            self.obs,
-        )
-
-        # Apply QA
-        if self.apply_qa:
-            #.info("Applying quality assurance filters")
-            self.quality_assurance()
-
-
-
-    # @timeit
-    # def regridding_function(self):
-    #     """
-    #     Perform Rotation of Grid representation
-    #     """
-    #     # Calculate the grid representation
-    #     if isinstance(self.grid, RotatedGrid):
-    #         # Aggregate the the observations to the grid representation
-    #         lon_agg, lat_agg, rlon_agg, rlat_agg, obs_agg, count_obs = self.grid.aggregate(self.lon_values, self.lat_values, self.obs)
-
-    #     elif isinstance(self.grid, RegularGrid):
-    #         # Aggregate the the observations to the grid representation
-    #         lon_agg, lat_agg, obs_agg, count_obs = self.grid.aggregate(self.lon_values, self.lat_values, self.obs)
-
-    #     else:
-    #         raise ValueError("Invalid grid representation")
-
-    #     self.lon_values = lon_agg
-    #     self.lat_values = lat_agg
-    #     self.obs = obs_agg
-    #     self.count_obs = count_obs
 
     @timeit
     def read_nc(self):
diff --git a/run/run_tropomi.py b/run/run_tropomi.py
index 909285a5..4ee763a5 100644
--- a/run/run_tropomi.py
+++ b/run/run_tropomi.py
@@ -5,23 +5,21 @@ import time
 
 if __name__ == "__main__":
 
+    start_date = "202212010000"
+    end_date = "202212202359"
 
-    # 00:30 to 03:36 of 2024-01-01
-    start_date = "202301010000"
-    end_date = "202301020000"
-
-    outdir = "/home/cmeikle/Projects/data/"
-    indir = "/home/cmeikle/Projects/data/TROPOMI"
+    outdir = "/esarchive/scratch/cmeikle/"
+    indir = "/esarchive/obs/sentinel/tropomi/original_files/tropomi_OFFL_NO2"
 
     start_time = time.time()
 
-    c = TROPOMI(start_date, end_date, dest=outdir, indir=indir, apply_qa=True, grid_repr="regular")
+    c = TROPOMI(start_date, end_date, dest=outdir, indir=indir, apply_qa=True, grid_repr="rotated")
     c.read_nc()
-    c.process_data(batch_size = 20, geospatial_crop=[[-30, 30], [-20, 60]])
+    c.process_data(monthly_avg=True, batch_size = 13)
 
     # c.yearly_average()
-    c.plot_2D_observations(months=[0], outdir=outdir)
-    c.plot_2D_num_obs(months=[0], outdir=outdir)
+    c.plot_2D_observations(months=[12], outdir=outdir)
+    c.plot_2D_num_obs(months=[12], outdir=outdir)
     #c.process_lazy_data()
     end_time = time.time()
     elapsed_time = end_time - start_time
diff --git a/run/run_viirs.py b/run/run_viirs.py
index 89ca6aa6..2436df48 100644
--- a/run/run_viirs.py
+++ b/run/run_viirs.py
@@ -31,10 +31,8 @@ if __name__ == "__main__":
     c.read_nc()
     c.process_data(monthly_avg=False, batch_size = 50)
     # c.yearly_average()
-    c.plot_2D_observations(months=[0], filename=filename_obs, outdir=outdir)
-    c.plot_2D_num_obs(months=[0], filename=filename_num_obs, outdir=outdir)
-    # c.process_lazy_data()
-    end_time = time.time()
+    c.plot_2D_observations(months=[0], outdir=outdir, display_fig=True)
+    end_time = time.time()
     elapsed_time = end_time - start_time
     print(f"Script executed in {elapsed_time:.2f} seconds.")
-- GitLab
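The move from parts[8] to parts[-6] in read_nc makes the timestamp parse independent of how many underscores appear in the indir path, since the split runs over the full file path. A sketch of the assumed S5P naming layout (the orbit and processing fields in this example path are illustrative):

```python
from datetime import datetime

# Splitting the full path on '_' shifts positive indices whenever the
# directory part contains underscores; indices counted from the end are stable.
path = ("/esarchive/obs/sentinel/tropomi/original_files/tropomi_OFFL_NO2/"
        "S5P_OFFL_L2__NO2____20230101T220249_20230101T234419_26942_03_020400_20230103T135225.nc")
parts = path.split('_')

sensing_start = datetime.strptime(parts[-6], "%Y%m%dT%H%M%S")
print(sensing_start)  # 2023-01-01 22:02:49

# parts[8] lands on an empty token here because "original_files" and
# "tropomi_OFFL_NO2" add three extra splits before the filename begins.
```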
From 555da65ee083b306bcb13712a053b505602a1294 Mon Sep 17 00:00:00 2001
From: Calum Meikle
Date: Fri, 7 Feb 2025 16:04:47 +0100
Subject: [PATCH 6/6] Updated some typings and docstring

---
 mapies/tropomi.py         | 4 +++-
 mapies/util/func_tools.py | 6 ++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/mapies/tropomi.py b/mapies/tropomi.py
index 9f50f1c4..26e3bef7 100644
--- a/mapies/tropomi.py
+++ b/mapies/tropomi.py
@@ -17,6 +17,8 @@ import os
 from mapies.util.func_tools import *
 #timeit, get_file_list, time_converter, frequency_int, error_estimation, time_domain_selection, geo_to_rot
 from mapies.grids.monarch import RotatedGrid, IrregularRotatedGrid, RegularGrid, regridding_function
+from numpy.typing import DTypeLike, NDArray
+
 
 def process_single_file(
     r,
@@ -179,7 +181,7 @@ class TROPOMI(MAPIES):
 
 
     @timeit
-    def process_data(self, monthly_avg=False, batch_size=100, geospatial_crop=None):
+    def process_data(self, monthly_avg=False, batch_size=100, geospatial_crop: NDArray | None = None):
         """
         Process the data for the specified year and months.
 
diff --git a/mapies/util/func_tools.py b/mapies/util/func_tools.py
index e30f091c..300f2ec6 100644
--- a/mapies/util/func_tools.py
+++ b/mapies/util/func_tools.py
@@ -114,7 +114,7 @@ def spatial_domain_selection(obs, lat, lon, time, spatial_bounds):
     """
     A function for cutting the data spatially
 
-    :param: spatial_bounds - the boundaries that you would like to crop te data by [[lon], [lat]] e.g. [[0, 20], [10, 12]]
+    :param spatial_bounds: the boundaries that you would like to crop the data by, [[lon], [lat]] e.g. [[0, 20], [10, 12]]
@@ -250,9 +250,11 @@ def geo_to_rot(lons, lats, centre_lon: float, centre_lat: float):
     return rlon, rlat
 
 def new_preprocess_vars(ds, obs_var, lat_var, lon_var, time_var, start_date, end_date):
     """
-    Preprocessing of the dataset
+    * Preprocessing of the dataset
+    * Flattening arrays
+    * Applying time domain selection
     """
-- GitLab
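For reference, the batch aggregation used by process_monthly_data throughout this series is a count-weighted average: each file contributes its gridded means multiplied by its per-cell counts, and the final division only touches cells that actually received data. A condensed, self-contained sketch of that logic with toy 1-D grids:

```python
import numpy as np

def combine(results):
    """Count-weighted combination of per-file (obs_agg, count_obs) pairs."""
    cumulative_obs = cumulative_count = None
    for obs_agg, count_obs in results:
        if cumulative_obs is None:
            # First file initialises the running sums
            cumulative_obs = obs_agg * count_obs
            cumulative_count = count_obs.copy()
        else:
            cumulative_obs += obs_agg * count_obs
            cumulative_count += count_obs

    # Divide only where counts are non-zero; empty cells stay at zero
    final_obs = np.zeros_like(cumulative_obs)
    valid = cumulative_count > 0
    final_obs[valid] = cumulative_obs[valid] / cumulative_count[valid]
    return final_obs, cumulative_count

file_a = (np.array([1.0, 2.0]), np.array([2, 0]))  # (gridded mean, counts)
file_b = (np.array([3.0, 4.0]), np.array([2, 1]))

final_obs, counts = combine([file_a, file_b])
print(final_obs)  # [2. 4.] -> (1*2 + 3*2)/4 and (4*1)/1
print(counts)     # [4 1]
```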