#------------------------------------------------------ # This use case shows the basic usage of multiStart(), the wrapper of Start() # for multiple datasets retrieval that have different selector requirement. # The usage is almost the same as Start(). To assign different selectors for # each dataset, use the same format as the pattern_dim: a list of lists, first # item 'name' is the dataset name and the second item '' is the selector. # # This use case wants to put HadGEM3 and MPI-ESM datasets together in one call. # The major differences between them are the spatial grids and calendar type. # HadGEM3 uses 360_day calendar and MPI-ESM uses standard calendar. For the grid # difference, the 'tramsform' parameters can deal with it; while for the calendar # difference, Start() has difficulty align the time steps. We cannot assign NAs # as the selector in Start(), and Start() coarces the values to the nearest found # values, producing the hidden errors. # # Therefore, we use multiStart() to insert NAs manually. Three days, 30th Dec., # 31st Dec., and 1st Jan. are requested. HadGEM3 with 360_day calendar doesn't # have 31st Dec., so we need to insert one NA at the 2nd time step. Be sure that # the length (NA included) of two time selectors are equal. # # Here are some limitations for now: # 1. The selector for each dataset can only be indices(), not values() or 'all'. # If the selector is the common one (e.g., 'sdate', 'lat', and 'lon' in this # use case), then it follows the rule of Start(). # 2. The reshape options are not all available. See ex1_13 for details. # 3. The checker of multiStart() is not comprehensive. Please carefully examine # the results you get. #------------------------------------------------------ #======================================================= # Define the paths and selectors #======================================================= # Define paths # HadGEM3 daily data (360_day) path_hadgem3 <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/', 'cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/', 'dcppA-hindcast/r1i1p1f2/day/$var$/gn/v20200101/', '$var$_day_HadGEM3-GC31-MM_dcppA-hindcast_s$sdate$-r1i1p1f2_gn_$fyear$.nc') ## tasmax_day_HadGEM3-GC31-MM_dcppA-hindcast_s2000-r1i1p1f2_gn_20001101-20101230.nc # MPI-ESM1.2-HR (standard) path_mpi_esm <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/mpi-esm1-2-hr/', 'cmip6-dcppA-hindcast_i1p1/DCPP/MPI-M/MPI-ESM1-2-HR/', 'dcppA-hindcast/r1i1p1f1/day/$var$/gn/v20200101/', '$var$_day_MPI-ESM1-2-HR_dcppA-hindcast_s$sdate$-r1i1p1f1_gn_$fyear$.nc') ## tasmax_day_MPI-ESM1-2-HR_dcppA-hindcast_s2000-r1i1p1f1_gn_20001101-20101231.nc # Define selectors for each dataset sdate <- '2000' fyear_hadgem3 <- paste0(sdate, '1101-', as.numeric(sdate) + 10, '1230') fyear_mpi_esm <- paste0(sdate, '1101-', as.numeric(sdate) + 10, '1231') # To align the time steps, insert an NA in hadgem3 time_hadgem3 <- indices(c(60, NA, 61)) #12/30, NA, 1/1 time_mpi_esm <- indices(60:62) #12/30, 12/31, 1/1 #======================================================= # retrieve = TRUE; interpolation #======================================================= res <- multiStart( dat = list(list(name = 'hadgem3', path = path_hadgem3), list(name = 'mpi_esm', path = path_mpi_esm)), var = 'tasmax', sdate = '2000', fyear = list(list(name = 'hadgem3', fyear = fyear_hadgem3), list(name = 'mpi_esm', fyear = fyear_mpi_esm)), time = list(list(name = 'hadgem3', time = time_hadgem3), list(name = 'mpi_esm', time = time_mpi_esm)), lat = values(list(-30, 30)), lon = values(list(0, 50)), transform = CDORemapper, transform_params = list(grid = 'r360x181', method = 'conservative', crop = c(0, 50, -30, 30)), transform_vars = c('lat', 'lon'), return_vars = list(lat = NULL, lon = NULL, time = 'sdate'), retrieve = TRUE) # Check the returned values. hadgem3 has NA in the 2nd time step. dim(res) # dat var sdate fyear time lat lon # 2 1 1 1 3 61 51 res[,1,1,1,,10,10] # [,1] [,2] [,3] #[1,] 292.5023 NA 292.0899 #[2,] 293.0914 293.4608 292.8224 #======================================================= # retrieve = FALSE; Compute() #======================================================= #NOTE: To show a clearer concept of multiStart usage, the data are not interpolated # here, but they should be. If interpolated, the chunking dimensions cannot be # lat and lon (or the results contain many NAs.) res <- multiStart( dat = list(list(name = 'hadgem3', path = path_hadgem3), list(name = 'mpi_esm', path = path_mpi_esm)), var = 'tasmax', sdate = '2000', fyear = list(list(name = 'hadgem3', fyear = fyear_hadgem3), list(name = 'mpi_esm', fyear = fyear_mpi_esm)), time = list(list(name = 'hadgem3', time = time_hadgem3), list(name = 'mpi_esm', time = time_mpi_esm)), lat = indices(3:5), lon = indices(1), return_vars = list(lat = 'dat', lon = 'dat', time = 'dat'), retrieve = FALSE) str(res) #List of 2 # $ hadgem3: # .. # .. # $ mpi_esm: # .. # .. str(attr(res[[1]], 'na_pos_record')) #List of 1 # $ hadgem3:List of 7 # ..$ dat : num 0 # ..$ var : num 0 # ..$ sdate: num 0 # ..$ fyear: num 0 # ..$ time : int 2 # ..$ lat : num 0 # ..$ lon : num 0 str(attr(res[[2]], 'na_pos_record')) #List of 1 # $ mpi_esm:List of 7 # ..$ dat : num 0 # ..$ var : num 0 # ..$ sdate: num 0 # ..$ fyear: num 0 # ..$ time : num 0 # ..$ lat : num 0 # ..$ lon : num 0 func_sum <- function(x, y) { return(x + y) } # 'time' and 'fyear' can also be the operational dimension step <- Step(func_sum, target_dims = list(hadgem3 = c('var', 'lon'), mpi_esm = c('var', 'lon')), output_dims = c('var', 'lon')) wf <- AddStep(res, step) # Option 1: run locally output <- Compute(wf, chunks = list(lat = 2)) output <- Compute(wf, chunks = list(lat = 2, time = 2)) # Option 2: run on Nord3 #-------------------user-defined--------------------- queue_host <- 'nord1' temp_dir <- '/gpfs/scratch/bsc32/bsc32734/startR_hpc/' ecflow_suite_dir <- '/home/Earth/aho/startR_local/' #---------------------------------------------------- output <- Compute(wf, chunks = list(lat = 2, time = 2), threads_load = 2, threads_compute = 4, cluster = list(queue_host = queue_host, queue_type = 'lsf', temp_dir = temp_dir, cores_per_job = 2, job_wallclock = '05:00', max_jobs = 4, extra_queue_params = list('#BSUB -q bsc_es'), bidirectional = FALSE, polling_period = 10 ), ecflow_suite_dir = ecflow_suite_dir, wait = TRUE ) dim(output$output1) # var lon time dat sdate fyear lat # 1 1 3 1 1 1 3 output$output1[1, 1, , 1, 1, 1, ] # [,1] [,2] [,3] #[1,] 503.0177 502.6464 503.0280 #[2,] NA NA NA #[3,] 507.7226 507.1748 507.9795