ex2_11_multiStart_insert_NA.R 7.54 KB
Newer Older
aho's avatar
aho committed
#------------------------------------------------------
# This use case shows the basic usage of multiStart(), the wrapper of Start()
# for multiple datasets retrieval that have different selector requirement.
# The usage is almost the same as Start(). To assign different selectors for 
# each dataset, use the same format as the pattern_dim: a list of lists, first
# item 'name' is the dataset name and the second item '<dim_name>' is the selector.
#
# This use case wants to put HadGEM3 and MPI-ESM datasets together in one call.
# The major differences between them are the spatial grids and calendar type.
# HadGEM3 uses 360_day calendar and MPI-ESM uses standard calendar. For the grid
# difference, the 'tramsform' parameters can deal with it; while for the calendar
# difference, Start() has difficulty align the time steps. We cannot assign NAs
# as the selector in Start(), and Start() coarces the values to the nearest found 
# values, producing the hidden errors.
#
# Therefore, we use multiStart() to insert NAs manually. Three days, 30th Dec.,
# 31st Dec., and 1st Jan. are requested. HadGEM3 with 360_day calendar doesn't 
# have 31st Dec., so we need to insert one NA at the 2nd time step. Be sure that
# the length (NA included) of two time selectors are equal.
#
# Here are some limitations for now:
# 1. The selector for each dataset can only be indices(), not values() or 'all'.
#    If the selector is the common one (e.g., 'sdate', 'lat', and 'lon' in this
#    use case), then it follows the rule of Start().
# 2. The reshape options are not all available. See ex1_13 for details.
# 3. The checker of multiStart() is not comprehensive. Please carefully examine
#    the results you get.
#------------------------------------------------------

#=======================================================
# Define the paths and selectors
#=======================================================

  # Define paths
  # HadGEM3 daily data (360_day)
  path_hadgem3 <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/',
                         'cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/',
                         'dcppA-hindcast/r1i1p1f2/day/$var$/gn/v20200101/',
                         '$var$_day_HadGEM3-GC31-MM_dcppA-hindcast_s$sdate$-r1i1p1f2_gn_$fyear$.nc')
  ## tasmax_day_HadGEM3-GC31-MM_dcppA-hindcast_s2000-r1i1p1f2_gn_20001101-20101230.nc

  # MPI-ESM1.2-HR (standard)
  path_mpi_esm <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/mpi-esm1-2-hr/',
                         'cmip6-dcppA-hindcast_i1p1/DCPP/MPI-M/MPI-ESM1-2-HR/',
                         'dcppA-hindcast/r1i1p1f1/day/$var$/gn/v20200101/',
                         '$var$_day_MPI-ESM1-2-HR_dcppA-hindcast_s$sdate$-r1i1p1f1_gn_$fyear$.nc')
  ## tasmax_day_MPI-ESM1-2-HR_dcppA-hindcast_s2000-r1i1p1f1_gn_20001101-20101231.nc

  # Define selectors for each dataset
  sdate <- '2000'
  fyear_hadgem3 <- paste0(sdate, '1101-', as.numeric(sdate) + 10, '1230')
  fyear_mpi_esm <- paste0(sdate, '1101-', as.numeric(sdate) + 10, '1231')
  # To align the time steps, insert an NA in hadgem3
  time_hadgem3 <- indices(c(60, NA, 61)) #12/30, NA, 1/1
  time_mpi_esm <- indices(60:62) #12/30, 12/31, 1/1


#=======================================================
# retrieve = TRUE; interpolation
#=======================================================

  res <- multiStart(
         dat = list(list(name = 'hadgem3', path = path_hadgem3),
                    list(name = 'mpi_esm', path = path_mpi_esm)),
         var = 'tasmax', 
         sdate = '2000',
         fyear = list(list(name = 'hadgem3', fyear = fyear_hadgem3),
                      list(name = 'mpi_esm', fyear = fyear_mpi_esm)),
         time = list(list(name = 'hadgem3', time = time_hadgem3),
                     list(name = 'mpi_esm', time = time_mpi_esm)),
         lat = values(list(-30, 30)), 
         lon = values(list(0, 50)),
         transform = CDORemapper,
         transform_params = list(grid = 'r360x181',
                                 method = 'conservative',
                                 crop = c(0, 50, -30, 30)),
         transform_vars = c('lat', 'lon'),
         return_vars = list(lat = NULL, lon = NULL, time = 'sdate'),
         retrieve = TRUE)

  # Check the returned values. hadgem3 has NA in the 2nd time step.
  dim(res)
  #  dat   var sdate fyear  time   lat   lon 
  #    2     1     1     1     3    61    51 
  res[,1,1,1,,10,10]
  #         [,1]     [,2]     [,3]
  #[1,] 292.5023       NA 292.0899
  #[2,] 293.0914 293.4608 292.8224



#=======================================================
# retrieve = FALSE; Compute()
#=======================================================
#NOTE: To show a clearer concept of multiStart usage, the data are not interpolated
#      here, but they should be. If interpolated, the chunking dimensions cannot be
#      lat and lon (or the results contain many NAs.)

res <- multiStart(
         dat = list(list(name = 'hadgem3', path = path_hadgem3),
                    list(name = 'mpi_esm', path = path_mpi_esm)),
         var = 'tasmax', sdate = '2000',
         fyear = list(list(name = 'hadgem3', fyear = fyear_hadgem3),
                      list(name = 'mpi_esm', fyear = fyear_mpi_esm)),
         time = list(list(name = 'hadgem3', time = time_hadgem3),
                     list(name = 'mpi_esm', time = time_mpi_esm)),
         lat = indices(3:5), lon = indices(1),
         return_vars = list(lat = 'dat', lon = 'dat', time = 'dat'),
         retrieve = FALSE)

str(res)
#List of 2
# $ hadgem3: 
#  ..
#  ..
# $ mpi_esm: 
#  ..
#  ..

str(attr(res[[1]], 'na_pos_record'))
#List of 1
# $ hadgem3:List of 7
#  ..$ dat  : num 0
#  ..$ var  : num 0
#  ..$ sdate: num 0
#  ..$ fyear: num 0
#  ..$ time : int 2
#  ..$ lat  : num 0
#  ..$ lon  : num 0

str(attr(res[[2]], 'na_pos_record'))
#List of 1
# $ mpi_esm:List of 7
#  ..$ dat  : num 0
#  ..$ var  : num 0
#  ..$ sdate: num 0
#  ..$ fyear: num 0
#  ..$ time : num 0
#  ..$ lat  : num 0
#  ..$ lon  : num 0


func_sum <- function(x, y) {
  return(x + y)
}

# 'time' and 'fyear' can also be the operational dimension
step <- Step(func_sum, target_dims = list(hadgem3 = c('var', 'lon'), 
                                          mpi_esm = c('var', 'lon')),
                       output_dims = c('var', 'lon'))

wf <- AddStep(res, step)

# Option 1: run locally
output <- Compute(wf, chunks = list(lat = 2))
output <- Compute(wf, chunks = list(lat = 2, time = 2))


# Option 2: run on Nord3
#-------------------user-defined---------------------
  queue_host <- 'nord1'
  temp_dir <- '/gpfs/scratch/bsc32/bsc32734/startR_hpc/'
  ecflow_suite_dir <- '/home/Earth/aho/startR_local/'
#----------------------------------------------------

 output <- Compute(wf,
                   chunks = list(lat = 2, time = 2),
                   threads_load = 2,
                   threads_compute = 4,
                   cluster = list(queue_host = queue_host,
                     queue_type = 'lsf',
                     temp_dir = temp_dir,
                     cores_per_job = 2,
                     job_wallclock = '05:00',
                     max_jobs = 4,
                     extra_queue_params = list('#BSUB -q bsc_es'),
                     bidirectional = FALSE,
                     polling_period = 10
                   ),
                   ecflow_suite_dir = ecflow_suite_dir,
                   wait = TRUE
                   ) 

dim(output$output1)
#  var   lon  time   dat sdate fyear   lat 
#    1     1     3     1     1     1     3 
output$output1[1, 1, , 1, 1, 1, ]
#         [,1]     [,2]     [,3]
#[1,] 503.0177 502.6464 503.0280
#[2,]       NA       NA       NA
#[3,] 507.7226 507.1748 507.9795