ex2_11_multiStart_insert_NA.R 8.41 KB
Newer Older
aho's avatar
aho committed
#------------------------------------------------------
# Author: An-Chi Ho
# Date: 29th March 2021
#------------------------------------------------------
aho's avatar
aho committed
# This use case shows the basic usage of multiStart(), the wrapper of Start()
# for multiple datasets retrieval that have different selector requirement.
# The usage is quite similar to Start() only with some differences. 
# - To assign different selectors for each dataset, use the same format as the 
#   pattern_dim: a list of lists, first item 'name' is the dataset name and the second 
#   item '<dim_name>' is the selector.
# - When defining the operation and workflow, notice that the considered data array
#   number is the dataset number. In this case, it is 2 (hadgem3 and mpi_esm). Even 
#   though multiStart() returns one object 'data', it actually contains the startR calls
#   for each dataset separately.
aho's avatar
aho committed
#
# This use case wants to put HadGEM3 and MPI-ESM datasets together in one call.
# The major differences between them are the spatial grids and calendar type.
# HadGEM3 uses 360_day calendar and MPI-ESM uses standard calendar. For the grid
# difference, the 'tramsform' parameters can deal with it; while for the calendar
# difference, Start() has difficulty align the time steps. We cannot assign NAs
# as the selector in Start(), and Start() coarces the values to the nearest found 
# values, producing the hidden errors.
#
# Therefore, we use multiStart() to insert NAs manually. Three days, 30th Dec.,
# 31st Dec., and 1st Jan. are requested. HadGEM3 with 360_day calendar doesn't 
# have 31st Dec., so we need to insert one NA at the 2nd time step. Be sure that
# the length (NA included) of two time selectors are equal.
#
# Here are some limitations for now:
# 1. The selector for each dataset can only be indices(), not values() or 'all'.
#    If the selector is the common one (e.g., 'sdate', 'lat', and 'lon' in this
#    use case), then it follows the rule of Start().
# 2. The reshape options are not all available. See ex1_13 for details.
# 3. The checker of multiStart() is not comprehensive. Please carefully examine
#    the results you get.
#------------------------------------------------------

#=======================================================
# Define the paths and selectors
#=======================================================

  # Define paths
  # HadGEM3 daily data (360_day)
  path_hadgem3 <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/',
                         'cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/',
                         'dcppA-hindcast/r1i1p1f2/day/$var$/gn/v20200101/',
                         '$var$_day_HadGEM3-GC31-MM_dcppA-hindcast_s$sdate$-r1i1p1f2_gn_$fyear$.nc')
  ## tasmax_day_HadGEM3-GC31-MM_dcppA-hindcast_s2000-r1i1p1f2_gn_20001101-20101230.nc

  # MPI-ESM1.2-HR (standard)
  path_mpi_esm <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/mpi-esm1-2-hr/',
                         'cmip6-dcppA-hindcast_i1p1/DCPP/MPI-M/MPI-ESM1-2-HR/',
                         'dcppA-hindcast/r1i1p1f1/day/$var$/gn/v20200101/',
                         '$var$_day_MPI-ESM1-2-HR_dcppA-hindcast_s$sdate$-r1i1p1f1_gn_$fyear$.nc')
  ## tasmax_day_MPI-ESM1-2-HR_dcppA-hindcast_s2000-r1i1p1f1_gn_20001101-20101231.nc

  # Define selectors for each dataset
  sdate <- '2000'
  fyear_hadgem3 <- paste0(sdate, '1101-', as.numeric(sdate) + 10, '1230')
  fyear_mpi_esm <- paste0(sdate, '1101-', as.numeric(sdate) + 10, '1231')
  # To align the time steps, insert an NA in hadgem3
  time_hadgem3 <- indices(c(60, NA, 61)) #12/30, NA, 1/1
  time_mpi_esm <- indices(60:62) #12/30, 12/31, 1/1


#=======================================================
# retrieve = TRUE; interpolation
#=======================================================

  res <- multiStart(
         dat = list(list(name = 'hadgem3', path = path_hadgem3),
                    list(name = 'mpi_esm', path = path_mpi_esm)),
         var = 'tasmax', 
         sdate = '2000',
         fyear = list(list(name = 'hadgem3', fyear = fyear_hadgem3),
                      list(name = 'mpi_esm', fyear = fyear_mpi_esm)),
         time = list(list(name = 'hadgem3', time = time_hadgem3),
                     list(name = 'mpi_esm', time = time_mpi_esm)),
         lat = values(list(-30, 30)), 
         lon = values(list(0, 50)),
         transform = CDORemapper,
         transform_params = list(grid = 'r360x181',
                                 method = 'conservative',
                                 crop = c(0, 50, -30, 30)),
         transform_vars = c('lat', 'lon'),
         return_vars = list(lat = NULL, lon = NULL, time = 'sdate'),
         retrieve = TRUE)

  # Check the returned values. hadgem3 has NA in the 2nd time step.
  dim(res)
  #  dat   var sdate fyear  time   lat   lon 
  #    2     1     1     1     3    61    51 
  res[,1,1,1,,10,10]
  #         [,1]     [,2]     [,3]
  #[1,] 292.5023       NA 292.0899
  #[2,] 293.0914 293.4608 292.8224



#=======================================================
# retrieve = FALSE; Compute()
#=======================================================
  data <- multiStart(
          dat = list(list(name = 'hadgem3', path = path_hadgem3),
                     list(name = 'mpi_esm', path = path_mpi_esm)),
          var = 'tasmax',
          sdate = '2000',
          fyear = list(list(name = 'hadgem3', fyear = fyear_hadgem3),
                       list(name = 'mpi_esm', fyear = fyear_mpi_esm)),
          time = list(list(name = 'hadgem3', time = time_hadgem3),
                      list(name = 'mpi_esm', time = time_mpi_esm)),
          lat = values(list(-30, 30)),
          lon = values(list(0, 50)),
          transform = CDORemapper,
          transform_params = list(grid = 'r360x181',
                                  method = 'conservative',
                                  crop = c(0, 50, -30, 30)),
          transform_vars = c('lat', 'lon'),
          return_vars = list(lat = NULL, lon = NULL, time = 'sdate'),
          retrieve = FALSE)

str(data)
aho's avatar
aho committed
#List of 2
aho's avatar
aho committed
# $ hadgem3: language Start ...
#  ..- attr(*, "Dimensions") ...
aho's avatar
aho committed
#  ..
aho's avatar
aho committed
#  ..- attr(*, "Variables") ...
aho's avatar
aho committed
#  ..
aho's avatar
aho committed
# $ mpi_esm: language Start ...
#  ..- attr(*, "Dimensions") ...
aho's avatar
aho committed
#  ..
aho's avatar
aho committed
#  ..- attr(*, "Variables") ...
str(attr(data[[1]], 'na_pos_record'))
aho's avatar
aho committed
#List of 1
# $ hadgem3:List of 7
#  ..$ dat  : num 0
#  ..$ var  : num 0
#  ..$ sdate: num 0
#  ..$ fyear: num 0
#  ..$ time : int 2
#  ..$ lat  : num 0
#  ..$ lon  : num 0

str(attr(data[[2]], 'na_pos_record'))
aho's avatar
aho committed
#List of 1
# $ mpi_esm:List of 7
#  ..$ dat  : num 0
#  ..$ var  : num 0
#  ..$ sdate: num 0
#  ..$ fyear: num 0
#  ..$ time : num 0
#  ..$ lat  : num 0
#  ..$ lon  : num 0

# Notice that even though multiStart() returns one object 'data', it actually
# contain two startR calls inside.
length(data)
# 2
names(data)
#[1] "hadgem3" "mpi_esm"

# Therefore, the operation should consider two datasets instead of one.
aho's avatar
aho committed
func_sum <- function(x, y) {
  return(x + y)
}

# The target_dims should be a list of 2. The dataset names can be random.
# 'time' and 'fyear' can also be the operational dimension.
aho's avatar
aho committed
step <- Step(func_sum, target_dims = list(hadgem3 = c('var'),
                                          mpi_esm = c('var')),
                       output_dims = c('var'))
# The inputs is still one object 'data', but it contains three calls implicitly.
wf <- AddStep(data, step)
aho's avatar
aho committed

# Option 1: run locally
aho's avatar
aho committed
output <- Compute(wf, chunks = list(time = 2))
aho's avatar
aho committed

# Option 2: run on Nord3
#-------------------user-defined---------------------
aho's avatar
aho committed
  queue_host <- 'nord3'
  temp_dir <- '/gpfs/scratch/bsc32/bsc32<id>/startR_hpc/'
  ecflow_suite_dir <- '/home/Earth/<use_id>/startR_local/'
aho's avatar
aho committed
#----------------------------------------------------
aho's avatar
aho committed
 output <- Compute(wf,
aho's avatar
aho committed
                   chunks = list(time = 2),
aho's avatar
aho committed
                   threads_load = 2,
                   threads_compute = 4,
                   cluster = list(queue_host = queue_host,
                     queue_type = 'lsf',
                     temp_dir = temp_dir,
                     cores_per_job = 2,
                     job_wallclock = '05:00',
                     max_jobs = 4,
                     extra_queue_params = list('#BSUB -q bsc_es'),
                     bidirectional = FALSE,
                     polling_period = 10
                   ),
                   ecflow_suite_dir = ecflow_suite_dir,
                   wait = TRUE
aho's avatar
aho committed

dim(output$output1)
aho's avatar
aho committed
#  var   time   dat sdate fyear   lat   lon 
#    1      3     1     1     1    61    51
output$output1[1, , 1, 1, 1, 1, 1]
[1] 584.9990       NA 587.8201