Newer
Older
#------------------------------------------------------
# Author: An-Chi Ho
# Date: 29th March 2021
#------------------------------------------------------
# This use case shows the basic usage of multiStart(), the wrapper of Start()
# for multiple datasets retrieval that have different selector requirement.
# The usage is quite similar to Start() only with some differences.
# - To assign different selectors for each dataset, use the same format as the
# pattern_dim: a list of lists, first item 'name' is the dataset name and the second
# item '<dim_name>' is the selector.
# - When defining the operation and workflow, notice that the considered data array
# number is the dataset number. In this case, it is 2 (hadgem3 and mpi_esm). Even
# though multiStart() returns one object 'data', it actually contains the startR calls
# for each dataset separately.
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#
# This use case wants to put HadGEM3 and MPI-ESM datasets together in one call.
# The major differences between them are the spatial grids and calendar type.
# HadGEM3 uses 360_day calendar and MPI-ESM uses standard calendar. For the grid
# difference, the 'tramsform' parameters can deal with it; while for the calendar
# difference, Start() has difficulty align the time steps. We cannot assign NAs
# as the selector in Start(), and Start() coarces the values to the nearest found
# values, producing the hidden errors.
#
# Therefore, we use multiStart() to insert NAs manually. Three days, 30th Dec.,
# 31st Dec., and 1st Jan. are requested. HadGEM3 with 360_day calendar doesn't
# have 31st Dec., so we need to insert one NA at the 2nd time step. Be sure that
# the length (NA included) of two time selectors are equal.
#
# Here are some limitations for now:
# 1. The selector for each dataset can only be indices(), not values() or 'all'.
# If the selector is the common one (e.g., 'sdate', 'lat', and 'lon' in this
# use case), then it follows the rule of Start().
# 2. The reshape options are not all available. See ex1_13 for details.
# 3. The checker of multiStart() is not comprehensive. Please carefully examine
# the results you get.
#------------------------------------------------------
#=======================================================
# Define the paths and selectors
#=======================================================
# Define paths
# HadGEM3 daily data (360_day)
path_hadgem3 <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/hadgem3-gc31-mm/',
'cmip6-dcppA-hindcast_i1p1/DCPP/MOHC/HadGEM3-GC31-MM/',
'dcppA-hindcast/r1i1p1f2/day/$var$/gn/v20200101/',
'$var$_day_HadGEM3-GC31-MM_dcppA-hindcast_s$sdate$-r1i1p1f2_gn_$fyear$.nc')
## tasmax_day_HadGEM3-GC31-MM_dcppA-hindcast_s2000-r1i1p1f2_gn_20001101-20101230.nc
# MPI-ESM1.2-HR (standard)
path_mpi_esm <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/mpi-esm1-2-hr/',
'cmip6-dcppA-hindcast_i1p1/DCPP/MPI-M/MPI-ESM1-2-HR/',
'dcppA-hindcast/r1i1p1f1/day/$var$/gn/v20200101/',
'$var$_day_MPI-ESM1-2-HR_dcppA-hindcast_s$sdate$-r1i1p1f1_gn_$fyear$.nc')
## tasmax_day_MPI-ESM1-2-HR_dcppA-hindcast_s2000-r1i1p1f1_gn_20001101-20101231.nc
# Define selectors for each dataset
sdate <- '2000'
fyear_hadgem3 <- paste0(sdate, '1101-', as.numeric(sdate) + 10, '1230')
fyear_mpi_esm <- paste0(sdate, '1101-', as.numeric(sdate) + 10, '1231')
# To align the time steps, insert an NA in hadgem3
time_hadgem3 <- indices(c(60, NA, 61)) #12/30, NA, 1/1
time_mpi_esm <- indices(60:62) #12/30, 12/31, 1/1
#=======================================================
# retrieve = TRUE; interpolation
#=======================================================
res <- multiStart(
dat = list(list(name = 'hadgem3', path = path_hadgem3),
list(name = 'mpi_esm', path = path_mpi_esm)),
var = 'tasmax',
sdate = '2000',
fyear = list(list(name = 'hadgem3', fyear = fyear_hadgem3),
list(name = 'mpi_esm', fyear = fyear_mpi_esm)),
time = list(list(name = 'hadgem3', time = time_hadgem3),
list(name = 'mpi_esm', time = time_mpi_esm)),
lat = values(list(-30, 30)),
lon = values(list(0, 50)),
transform = CDORemapper,
transform_params = list(grid = 'r360x181',
method = 'conservative',
crop = c(0, 50, -30, 30)),
transform_vars = c('lat', 'lon'),
return_vars = list(lat = NULL, lon = NULL, time = 'sdate'),
retrieve = TRUE)
# Check the returned values. hadgem3 has NA in the 2nd time step.
dim(res)
# dat var sdate fyear time lat lon
# 2 1 1 1 3 61 51
res[,1,1,1,,10,10]
# [,1] [,2] [,3]
#[1,] 292.5023 NA 292.0899
#[2,] 293.0914 293.4608 292.8224
#=======================================================
# retrieve = FALSE; Compute()
#=======================================================
data <- multiStart(
dat = list(list(name = 'hadgem3', path = path_hadgem3),
list(name = 'mpi_esm', path = path_mpi_esm)),
var = 'tasmax',
sdate = '2000',
fyear = list(list(name = 'hadgem3', fyear = fyear_hadgem3),
list(name = 'mpi_esm', fyear = fyear_mpi_esm)),
time = list(list(name = 'hadgem3', time = time_hadgem3),
list(name = 'mpi_esm', time = time_mpi_esm)),
lat = values(list(-30, 30)),
lon = values(list(0, 50)),
transform = CDORemapper,
transform_params = list(grid = 'r360x181',
method = 'conservative',
crop = c(0, 50, -30, 30)),
transform_vars = c('lat', 'lon'),
return_vars = list(lat = NULL, lon = NULL, time = 'sdate'),
retrieve = FALSE)
str(data)
# $ hadgem3: language Start ...
# ..- attr(*, "Dimensions") ...
# $ mpi_esm: language Start ...
# ..- attr(*, "Dimensions") ...
#List of 1
# $ hadgem3:List of 7
# ..$ dat : num 0
# ..$ var : num 0
# ..$ sdate: num 0
# ..$ fyear: num 0
# ..$ time : int 2
# ..$ lat : num 0
# ..$ lon : num 0
#List of 1
# $ mpi_esm:List of 7
# ..$ dat : num 0
# ..$ var : num 0
# ..$ sdate: num 0
# ..$ fyear: num 0
# ..$ time : num 0
# ..$ lat : num 0
# ..$ lon : num 0
# Notice that even though multiStart() returns one object 'data', it actually
# contain two startR calls inside.
length(data)
# 2
names(data)
#[1] "hadgem3" "mpi_esm"
# Therefore, the operation should consider two datasets instead of one.
func_sum <- function(x, y) {
return(x + y)
}
# The target_dims should be a list of 2. The dataset names can be random.
# 'time' and 'fyear' can also be the operational dimension.
step <- Step(func_sum, target_dims = list(hadgem3 = c('var'),
mpi_esm = c('var')),
output_dims = c('var'))
# The inputs is still one object 'data', but it contains three calls implicitly.
wf <- AddStep(data, step)
output <- Compute(wf, chunks = list(time = 2))
# Option 2: run on Nord3
#-------------------user-defined---------------------
queue_host <- 'nord3'
temp_dir <- '/gpfs/scratch/bsc32/bsc32<id>/startR_hpc/'
ecflow_suite_dir <- '/home/Earth/<use_id>/startR_local/'
#----------------------------------------------------
threads_load = 2,
threads_compute = 4,
cluster = list(queue_host = queue_host,
queue_type = 'lsf',
temp_dir = temp_dir,
cores_per_job = 2,
job_wallclock = '05:00',
max_jobs = 4,
extra_queue_params = list('#BSUB -q bsc_es'),
bidirectional = FALSE,
polling_period = 10
),
ecflow_suite_dir = ecflow_suite_dir,
wait = TRUE
# var time dat sdate fyear lat lon
# 1 3 1 1 1 61 51
output$output1[1, , 1, 1, 1, 1, 1]
[1] 584.9990 NA 587.8201