From a973596516a6f079dd43037d2fef29a97d8b8f45 Mon Sep 17 00:00:00 2001
From: aho
Date: Wed, 12 Jul 2023 11:25:11 +0200
Subject: [PATCH] Refine use case; provide another easy method to get chunk information

---
 inst/doc/faq.md                              |  9 +++-----
 inst/doc/usecase/ex2_14_margin_dim_indices.R | 17 +++++++++++++++
 inst/doc/usecase/ex2_5_rainFARM.R            | 22 ++++++++++++++++++--
 3 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/inst/doc/faq.md b/inst/doc/faq.md
index 1508742..ffe91a5 100644
--- a/inst/doc/faq.md
+++ b/inst/doc/faq.md
@@ -1001,13 +1001,10 @@ See [How-to-21](#21-retrieve-the-complete-data-when-the-dimension-length-varies-
 
 In the self-defined function in the startR workflow, the dimensions required for the computation are used as target dimensions, and the rest can be used to chunk the data into pieces.
 There is one situation in which some information about a dimension is needed in the function, but the computation does not depend on that dimension.
 In this case, we may still be able to chunk along this dimension while using its information in the function.
 This is a lifesaver if you have a complex case with no margin dimension left (see [How-to-25](#25-what-to-do-if-your-function-has-too-many-target-dimensions).)
 
-You just need to define a parameter in your function 'nchunks = chunk_indices' and use it in the function.
-The use case [RainFARM precipitation downscaling](https://earth.bsc.es/gitlab/es/startR/-/blob/develop-RainFARMCase/inst/doc/usecase/ex2_5_rainFARM.R) demonstrates an example that the start date dimension is used as chunking dimension,
-but we use its chunk number to know the start date value of each chunk.
-The first part of the function performs downscaling method, which requres longitude and latitude dimensions, so these two dimensions must be the target dimensions in the workflow.
-After that, the results are saved as netCDF file following esarchive convention. We need start date value here to decide the file name.
-As you can see, the sdate dimension is not required for the computation, so it is not necessary to be the target dimension. We can just use 'chunk_indices' to get the chunk number therefore get the corresponding start date value for the file name.
+We have two examples: (1) [ex2_5_RainFARM precipitation downscaling](inst/doc/usecase/ex2_5_rainFARM.R) shows how to get the start date of each chunk from the chunk number;
+(2) [ex2_14](inst/doc/usecase/ex2_14_margin_dim_indices.R) shows how to distinguish the variable in each chunk when "variable" is one of the chunking dimensions
+(__NOTE: In this case, it is easier to simply use the attributes to find out which variable it is. Check the use case for more details.__)
 
 There are many other possible applications of this parameter. Please share with us other use cases you may create.
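Below is a minimal sketch of the pattern described in the FAQ entry above: 'sdate' is used only as a chunking dimension, yet the self-defined function can still recover the start date of the current chunk. It assumes that 'chunk_indices' is the named vector of current chunk indices that startR exposes inside the function during chunked computation (see METHOD 2 in ex2_14 below), and that 'startdates' is an extra argument passed to the function through AddStep(), as done in ex2_5; 'func_sketch' itself is only illustrative and not part of the patch.

func_sketch <- function(x, startdates) {
  # x: [lat, lon]; 'sdate' is a chunking (margin) dimension, one start date per chunk
  # 'chunk_indices' is assumed to be provided by startR at run time
  sdate_this_chunk <- startdates[chunk_indices['sdate']]
  # Computation that only needs lat and lon, e.g., a simple spatial mean
  res <- mean(x)
  # 'sdate_this_chunk' can then be used, e.g., to build an output file name
  return(res)
}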
diff --git a/inst/doc/usecase/ex2_14_margin_dim_indices.R b/inst/doc/usecase/ex2_14_margin_dim_indices.R
index 747a4e3..4ce1caf 100644
--- a/inst/doc/usecase/ex2_14_margin_dim_indices.R
+++ b/inst/doc/usecase/ex2_14_margin_dim_indices.R
@@ -4,6 +4,10 @@
 # This use case shows you how to know the margin dimension indices in the self-defined function.
 # In this example, we chunk the data along dimensions 'var' and 'sdate'. We can get the indices
 # of each chunk, and when dimension 'var' is 2 (i.e., 'tas'), we convert the unit from K to degC.
+#
+# [UPDATE_12072023] This case can be much simpler: we can use the attributes to
+# identify which variable is in each chunk, because the attributes are chunked
+# along with the data.
 #------------------------------------------------------------------
 library(startR)
 
@@ -19,7 +23,19 @@
               return_vars = list(time = 'sdate', lon = NULL, lat = NULL),
               retrieve = FALSE)
 
+#---------------- METHOD 1 (RECOMMENDED) -----------------
+func <- function(x) {
+  # x: [lat, lon]
+  attrs_names <- names(attr(x, 'Variables')$common)
+  if ('tas' %in% attrs_names) x <- x - 273.15
+
+  res <- ClimProjDiags::WeightedMean(x, lat = c(attr(x, 'Variables')$common$lat),
+                                     lon = c(attr(x, 'Variables')$common$lon))
+  return(res)
+}
+
+#---------------- METHOD 2 -----------------
 #NOTE: 'chunk_indices', 'chunks', and 'start_call' are the variables from startR:::ByChunks
 func <- function(x) {
   # x: [lat, lon]
 
@@ -70,6 +86,7 @@
   return(res)
 }
 
+#--------------------------------------------------------
 step <- Step(func, target_dims = c('lat', 'lon'), output_dims = NULL,
              use_attributes = list("Variables"))
 
diff --git a/inst/doc/usecase/ex2_5_rainFARM.R b/inst/doc/usecase/ex2_5_rainFARM.R
index 8d315a0..8b58901 100644
--- a/inst/doc/usecase/ex2_5_rainFARM.R
+++ b/inst/doc/usecase/ex2_5_rainFARM.R
@@ -1,6 +1,19 @@
 # ------------------------------------------------------------------------------
 # Downscaling precipitation using RainFARM
 # ------------------------------------------------------------------------------
+# This use case demonstrates how the start date dimension can be used as the
+# chunking dimension while the chunk number is used to know the start date
+# value of each chunk.
+# The first part of the function performs the downscaling method, which requires
+# the longitude and latitude dimensions, so these two must be the target
+# dimensions in the workflow.
+# After that, the results are saved as a netCDF file following the esarchive
+# convention, and the start date value is needed to decide the file name.
+# As you can see, the sdate dimension is not required for the computation, so it
+# does not need to be a target dimension. We can simply use 'chunk_indices' to
+# get the chunk number and, from it, the corresponding start date value for the
+# file name.
+# ------------------------------------------------------------------------------
 # Note 1: The data could be first transformed with QuantileMapping from CSTools
 # Note 2: Extra parameters could be used to downscale the data: weights, slope...
 # See more information in:
@@ -74,15 +87,20 @@
 step <- Step(Chunk_RF,
              use_libraries = c('CSTools', 'ncdf4'),
              use_attributes = list(data = "Variables"))
 
-workflow <- AddStep(data, step, nf = 4,
-                    destination = "/esarchive/scratch/nperez/git/Flor/cstools/test_RF_start/",
+workflow <- AddStep(list(data = data), step, nf = 4,
+                    destination = "./test_RF_start/",
                     startdates = as.Date(sdates, format = "%Y%m%d"))
+
+#========= OPTION 1: Compute locally ============
 res <- Compute(workflow,
                chunks = list(sdate = 4),
                threads_load = 2,
                threads_compute = 4)
+
+#========= OPTION 2: Compute on Nord3 ============
+
 
 #-----------modify according to your personal info---------
   queue_host = 'nord3'  # your own host name for nord3v2
   temp_dir = '/gpfs/scratch/bsc32/bsc32339/startR_hpc/'
-- 
GitLab
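As a complement to ex2_5, below is a minimal sketch of the file-saving step described in its header comments: the chunk index along 'sdate' picks the corresponding start date, which is then used to name the output netCDF file. It again assumes the 'chunk_indices' variable exposed by startR and the 'startdates' argument passed through AddStep(); the function name, destination path, variable name, and units are illustrative only, not those of the real Chunk_RF function.

save_chunk_sketch <- function(data, startdates, destination = "./test_RF_start/") {
  # data: [lat, lon] downscaled field of the current chunk
  # 'chunk_indices' is assumed to be provided by startR at run time
  sdate <- startdates[chunk_indices['sdate']]
  file_name <- paste0(destination, "pr_", format(sdate, "%Y%m%d"), ".nc")
  # Define dimensions and the variable, then write the field with ncdf4
  lat_dim <- ncdf4::ncdim_def("lat", "degrees_north", seq_len(dim(data)[1]))
  lon_dim <- ncdf4::ncdim_def("lon", "degrees_east", seq_len(dim(data)[2]))
  var_def <- ncdf4::ncvar_def("pr", "mm/day", list(lat_dim, lon_dim))
  nc <- ncdf4::nc_create(file_name, var_def)
  ncdf4::ncvar_put(nc, var_def, data)
  ncdf4::nc_close(nc)
  invisible(file_name)
}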