diff --git a/DESCRIPTION b/DESCRIPTION
index fdd5843f05e6104ccfdbaacedf227edcc800f847..2d569fe9c1883931c7784607620b2181a04e6f5a 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: startR
 Title: Automatically Retrieve Multidimensional Distributed Data Sets
-Version: 2.2.2
+Version: 2.2.3
 Authors@R: c(
     person("Nicolau", "Manubens", , "nicolau.manubens@bsc.es", role = c("aut")),
     person("An-Chi", "Ho", , "an.ho@bsc.es", role = c("aut", "cre")),
@@ -42,3 +42,4 @@ BugReports: https://earth.bsc.es/gitlab/es/startR/-/issues
 SystemRequirements: cdo ecFlow
 Encoding: UTF-8
 RoxygenNote: 7.2.0
+Config/testthat/edition: 3
diff --git a/NEWS.md b/NEWS.md
index ce5eab1ab35a8ef394d2bd94c2297f86457e13f9..888fb729955e559437f574fd6ec013f573dedeef 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,7 @@
-# startR v2.2.1 (Release date: 2023-03-24)
+# startR v2.2.3 (Release date: 2023-06-06)
+- Bugfix in Start(): when using parameter `longitude = 'all'` with transform, a grid point was missing in some cases.
+
+# startR v2.2.2 (Release date: 2023-03-24)
 - Start(): Bugfix when the input parameters are assigned by a variable with NULL value and retrieve = F
 - NcDataReader(): Bugfix for wrong time attributes return when the unit is "month"
diff --git a/R/Start.R b/R/Start.R
index 54ebdb8159a5a274f6132964f3ba1bdc00d20bd1..702b77633cfda496eedf865bce1fb5b75b849c5a 100644
--- a/R/Start.R
+++ b/R/Start.R
@@ -2061,24 +2061,28 @@ Start <- function(..., # dim = indices/selectors,
           transform_crop_domain[[transform_var]] <-
             generate_transform_crop_domain_values(
               transform_crop_domain[[transform_var]],
-              picked_vars = picked_common_vars_ordered[[transform_var]])
+              picked_vars = picked_common_vars_ordered[[transform_var]],
+              transform_var)
         } else {
           transform_crop_domain[[transform_var]] <-
             generate_transform_crop_domain_values(
               transform_crop_domain[[transform_var]],
-              picked_vars = picked_common_vars[[transform_var]])
+              picked_vars = picked_common_vars[[transform_var]],
+              transform_var)
         }
       } else { # return_vars
         if (transform_var %in% names(dim_reorder_params)) {
           transform_crop_domain[[transform_var]] <-
             generate_transform_crop_domain_values(
               transform_crop_domain[[transform_var]],
-              picked_vars = picked_vars_ordered[[i]][[transform_var]])
+              picked_vars = picked_vars_ordered[[i]][[transform_var]],
+              transform_var)
         } else {
           transform_crop_domain[[transform_var]] <-
             generate_transform_crop_domain_values(
               transform_crop_domain[[transform_var]],
-              picked_vars = picked_vars[[i]][[transform_var]])
+              picked_vars = picked_vars[[i]][[transform_var]],
+              transform_var)
         }
       }
     } else if (is.atomic(transform_crop_domain[[transform_var]])) {
diff --git a/R/zzz.R b/R/zzz.R
index 381c9d3ac69ce95bc4d1b8aa6c68bde291e5437c..22805ff6e46751fbe4c991a08f5f5bb6a38220b1 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -565,9 +565,21 @@ generate_vars_to_transform <- function(vars_to_transform, picked_vars, transform
 }

 # Turn indices to values for transform_crop_domain
-generate_transform_crop_domain_values <- function(transform_crop_domain, picked_vars) {
+generate_transform_crop_domain_values <- function(transform_crop_domain,
+                                                  picked_vars,
+                                                  transform_var) {
   if (any(transform_crop_domain == 'all')) {
-    transform_crop_domain <- c(picked_vars[1], tail(picked_vars, 1))
+    if (transform_var %in% .KnownLatNames()) {
+      transform_crop_domain <- c(-90, 90)
+    } else if (transform_var %in% .KnownLonNames()) {
+      if (any(picked_vars > 180)) {
+        transform_crop_domain <- c(0, 360)
+      } else {
+        transform_crop_domain <- c(-180, 180)
+      }
+    } else {
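+      # For transform dimensions other than latitude/longitude, keep the
+      # former behaviour: crop from the first to the last picked values.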
+      transform_crop_domain <- c(picked_vars[1], tail(picked_vars, 1))
+    }
   } else { # indices()
     if (is.list(transform_crop_domain)) {
       transform_crop_domain <- picked_vars[unlist(transform_crop_domain)]
diff --git a/inst/doc/faq.md b/inst/doc/faq.md
index f92b798a94570169ac7a7f1c6bfdf00d3ebb90d1..1508742096218a5ca6ee05840b347855afbaaf03 100644
--- a/inst/doc/faq.md
+++ b/inst/doc/faq.md
@@ -30,6 +30,7 @@ This document intends to be the first reference for any doubts that you may have
   24. [Do both interpolation and chunking on spatial dimensions](#24-do-both-interpolation-and-chunking-on-spatial-dimensions)
   25. [What to do if your function has too many target dimensions](#25-what-to-do-if-your-function-has-too-many-target-dimensions)
   26. [Use merge_across_dims_narm to remove NAs](#26-use-merge_across_dims_narm-to-remove-nas)
+  27. [Utilize chunk number in the function](#27-utilize-chunk-number-in-the-function)

 2. **Something goes wrong...**

@@ -721,11 +722,12 @@ obs <- Start(dat = path.obs,
              retrieve = T)
 ```

-### 18. Use glob expression '*' to define the file path
-The standard way to define the file path for Start() is using tags (i.e., $TAG_NAME$).
-The glob expression, or wildcard, '*', can also be used in the path definition, while the rule is different from the common usage.
-Please note that **'*' can only be used to replace the common part of all the files**. For example, if all the required files have the folder 'EC-Earth-Consortium/' in their path, then this part can be substituted with '*/'.
+### 18. Use glob expression '*' to define the file path
+The standard way to define the file path for Start() is using tags (i.e., $TAG_NAME$).
+The glob expression, or wildcard, '*', can also be used in the path definition, though its behaviour differs from the common glob usage.
+
+Please note that **'*' can only be used to replace the common part of all the files**. For example, if all the required files have the folder 'EC-Earth-Consortium/' in their path, then this part can be substituted with '*/'.
 It can save some effort to define the long and uncritical path, and also make the script cleaner.
 However, if the part replaced by '\*' is not same among all the files, Start() will use **the first pattern it finds in the first file to substitute '*'**.

@@ -976,7 +978,10 @@ We provide some [use cases](inst/doc/usecase/ex2_12_transform_and_chunk.R) showi

 ### 25. What to do if your function has too many target dimensions

-Unfortunately, we don't have a perfect solution now before we have multiple steps feature. Talk to maintainers to see how to generate a workaround for your case.
+Ideally, a startR workflow uses the dimensions required for the key computations as target dimensions and chunks the data along the rest.
+If a complex analysis requires all the dimensions in the computation in one single step, there is no free (i.e., margin) dimension left to chunk the data.
+Unfortunately, we don't have a perfect solution for this until the multiple-steps feature is available.
+You may check [How-to-27](#27-utilize-chunk-number-in-the-function) to see if that solution applies to your case. If not, talk to the maintainers to work out a workaround for your case.

 ### 26. Use merge_across_dims_narm to remove NAs

@@ -990,9 +995,23 @@ This parameter tells Start() that it needs to look into each file to know the di
 A typical example is reading daily data and merging time dimension together.
 The 30-day months will have one NA at the end of time dimension, if `merge_across_dims_narm = T` and `largest_dims_length = T` are not used.
 Check usecase [ex1_16](/inst/doc/usecase/ex1_16_files_different_time_dim_length.R) for the example script.
 See [How-to-21](#21-retrieve-the-complete-data-when-the-dimension-length-varies-among-files) for more details of `largest_dims_length`.
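+A minimal sketch of such a call (the path, variable, and file dates here are hypothetical; adapt them to your files):
+
+```r
+data <- Start(dat = '/path/to/daily/$var$_$file_date$.nc',
+              var = 'tas',
+              file_date = c('199001', '199002'), # one file per month
+              time = 'all',                      # time length varies per file
+              time_across = 'file_date',
+              merge_across_dims = TRUE,
+              merge_across_dims_narm = TRUE,
+              largest_dims_length = TRUE,
+              return_vars = list(time = 'file_date'),
+              retrieve = TRUE)
+```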
-
+### 27. Utilize chunk number in the function
+In the self-defined function in a startR workflow, the dimensions required for the computations are used as target dimensions,
+and the rest can be used to chunk the data in pieces. However, in some situations a piece of information about a dimension is needed inside the function even though the computation does not depend on that dimension.
+In this case, we may still be able to chunk along this dimension while using its information in the function. This is a lifesaver if you have a complex case with no margin dimension left (see [How-to-25](#25-what-to-do-if-your-function-has-too-many-target-dimensions).)
+You just need to define a parameter 'nchunks = chunk_indices' in your function and use it inside (see the sketch at the end of this section).
+
+The use case [RainFARM precipitation downscaling](https://earth.bsc.es/gitlab/es/startR/-/blob/develop-RainFARMCase/inst/doc/usecase/ex2_5_rainFARM.R) demonstrates an example in which the start date dimension is used as the chunking dimension,
+while its chunk number is used to obtain the start date value of each chunk.
+The first part of the function performs the downscaling method, which requires the longitude and latitude dimensions, so these two dimensions must be the target dimensions in the workflow.
+After that, the results are saved as netCDF files following the esarchive convention. The start date value is needed here to determine the file name.
+As you can see, the sdate dimension is not required for the computation, so it does not need to be a target dimension. We can simply use 'chunk_indices' to get the chunk number and, from it, the corresponding start date value for the file name.
+
+There are many other possible applications of this parameter. Please share with us other use cases you may create.
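+
+A minimal sketch (the function, data object, and dimension names here are hypothetical):
+
+```r
+Chunk_fun <- function(x, startdates, nchunks = chunk_indices) {
+  # 'time' is the target dimension; 'sdate' is only a chunking dimension,
+  # but its chunk number tells us which start date this chunk holds
+  sdate_value <- startdates[nchunks['sdate']]
+  # ... use 'sdate_value', e.g., to build an output file name ...
+  return(x)
+}
+step <- Step(Chunk_fun, target_dims = 'time', output_dims = 'time')
+wf <- AddStep(data, step, startdates = startdates)
+res <- Compute(wf, chunks = list(sdate = 4))
+```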
+
+
 # Something goes wrong...

 ### 1. No space left on device
diff --git a/inst/doc/figures/Rotated_Coordinates.png b/inst/doc/figures/Rotated_Coordinates.png
index c5bc9cf55905972921a816e1bf10a1b8c1e9d8fb..e8a6e2ec30437a9a3978e578159ce288cf4f17aa 100644
Binary files a/inst/doc/figures/Rotated_Coordinates.png and b/inst/doc/figures/Rotated_Coordinates.png differ
diff --git a/inst/doc/figures/subseasonal_1.png b/inst/doc/figures/subseasonal_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..d5ca71a307dcf958543935bf93516cdb59ab27b9
Binary files /dev/null and b/inst/doc/figures/subseasonal_1.png differ
diff --git a/inst/doc/figures/subseasonal_2.png b/inst/doc/figures/subseasonal_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..6c71dc71cf1f54e90ccc0adc7d6dddfdfd29927e
Binary files /dev/null and b/inst/doc/figures/subseasonal_2.png differ
diff --git a/inst/doc/figures/subseasonal_3.png b/inst/doc/figures/subseasonal_3.png
new file mode 100644
index 0000000000000000000000000000000000000000..e5e3d8c2f67db15c38a3f0e1f48f2431af1cab88
Binary files /dev/null and b/inst/doc/figures/subseasonal_3.png differ
diff --git a/inst/doc/figures/subseasonal_4.png b/inst/doc/figures/subseasonal_4.png
new file mode 100644
index 0000000000000000000000000000000000000000..a9d1b051d7dfff9576316c1e95e57062d1752fa7
Binary files /dev/null and b/inst/doc/figures/subseasonal_4.png differ
diff --git a/inst/doc/figures/subseasonal_5.png b/inst/doc/figures/subseasonal_5.png
new file mode 100644
index 0000000000000000000000000000000000000000..1d339b017ca298561177358e997abbf8f3f1bef8
Binary files /dev/null and b/inst/doc/figures/subseasonal_5.png differ
diff --git a/inst/doc/usecase.md b/inst/doc/usecase.md
index c7748f6f2a8b57d82f2b723f18aeebfedc5b6ff0..47ee89e526d4a21718f7f9651ad436db76271d0f 100644
--- a/inst/doc/usecase.md
+++ b/inst/doc/usecase.md
@@ -37,51 +37,51 @@ for more explanation.
 The problem may occur when the dimension number of the splitted selector is more than two.
 If you are not familiar with the usage of these parameters, please see usecases ex1_2 and ex1_3 first, which are less complicated. You can also go to FAQ How-to-#17 for more explanation.

   8. [Loading tas and tos from Decadal Predictions performed with the EC-Earth model](inst/doc/usecase/ex1_8_tasandtos.R)
-   Some climate indices needs to be computed loading 'tas' (air temperature at 2m) over land and 'tos' (ocean surface temperature) over sea. Using **startR**, you can load these data in a unique **Start** call or with multiple calls separately for each variable.
+   Some climate indices need to be computed loading 'tas' (air temperature at 2m) over land and 'tos' (ocean surface temperature) over sea. Using **startR**, you can load these data in a single **Start** call or with multiple calls separately for each variable.

   9. [Use glob expression * to define the path](inst/doc/usecase/ex1_9_path_glob_permissive.R)
-   This script shows you how to use glob expression '*' and the parameter 'path_glob_permissive' of Start().
-You can also find information in [FAQ How-to-18](inst/doc/faq.md#18-use-glob-expression-to-define-the-file-path).
+   This script shows you how to use glob expression '*' and the parameter 'path_glob_permissive' of Start(). You can also find information in [FAQ How-to-18](inst/doc/faq.md#18-use-glob-expression-to-define-the-file-path).
  10. [Use 'metadata_dims' to retrieve complete variable metadata](inst/doc/usecase/ex1_10_metadata_dims.R)
-   This script tells you how to use the parameter 'metadata_dims' in Start() to get the complete variable metadata.
-You will see four difference cases and learn the rules.
-You can find more explanation in FAQ [How-to-20](inst/doc/faq.md#20-use-metadata_dims-to-retrieve-variable-metadata).
+   This script tells you how to use the parameter 'metadata_dims' in Start() to get the complete variable metadata. You will see four different cases and learn the rules. You can find more explanation in FAQ [How-to-20](inst/doc/faq.md#20-use-metadata_dims-to-retrieve-variable-metadata).

   11. [Three methods to load experimental files with different member and version](inst/doc/usecase/ex1_11_expid_member_version.R)
   This script shows three ways to load the data with different expid - member - version combination. It is useful for climate prediction of multiple experiments.

-  12. [Load and plot data in rotated coordintaes](inst/doc/usecase/ex1_12_rotated_coordinates.R)
-  This script shows how to load and plot data in rotated coordinates using **Monarch-dust** simulations.
+  12. [Load and plot data in rotated coordinates](inst/doc/usecase/ex1_12_rotated_coordinates.R)
+  This script shows how to load and plot data in rotated coordinates using **Monarch-dust** simulations.
+
-  13. [Use value array as selector to express dependency](inst/doc/usecase/ex1_13_implicit_dependency.R)
-  This script shows how to use a value array as the inner dimension selector to express
-dependency on a file dimension. By this means, we do not need to specify the *_across
-parameter and Start() can recognize this dependecy relationship.
+  13. [Use value array as selector to express dependency](inst/doc/usecase/ex1_13_implicit_dependency.R)
+  This script shows how to use a value array as the inner dimension selector to express dependency on a file dimension. By this means, we do not need to specify the *_across parameter and Start() can recognize this dependency relationship.

-  14. [Specify the dependency between file dimensions](inst/doc/usecase/ex1_14_file_dependency.R)
-  This script shows how to define the dependency between file dimensions. Note that ex1_13 is for
-the dependency between one inner dimension and one file dimension (i.e., the usage of *_across), while
-this use case is for two file dimensions (i.e., the usage of *_depends).
+  14. [Specify the dependency between file dimensions](inst/doc/usecase/ex1_14_file_dependency.R)
+  This script shows how to define the dependency between file dimensions. Note that ex1_13 is for the dependency between one inner dimension and one file dimension (i.e., the usage of *_across), while this use case is for two file dimensions (i.e., the usage of *_depends).

-  15. [Load irregular grid data](inst/doc/usecase/ex1_15_irregular_grid_CDORemap.R)
+  15. [Load irregular grid data](inst/doc/usecase/ex1_15_irregular_grid_CDORemap.R)
   This script shows how to use Start() to load irregular grid data , then regrid it by s2dv::CDORemap.

-  16. [Merge files with different time dimension length](inst/doc/usecase/ex1_16_files_different_time_dim_length.R)
-  This script shows how to use Start() to load files with different time dimension length
-and reshape the array without undesired NAs.
+  16. [Merge files with different time dimension length](inst/doc/usecase/ex1_16_files_different_time_dim_length.R)
+  This script shows how to use Start() to load files with different time dimension length and reshape the array without undesired NAs.

 2. **Execute computation (use `Compute()`)**
     1. [Function working on time dimension](inst/doc/usecase/ex2_1_timedim.R)
+
     2. [Function using attributes of the data](inst/doc/usecase/ex2_2_attr.R)
   Using attributes is only available in startR_v0.1.3 or above.
+
     3. [Use function CDORemap for interpolation](inst/doc/usecase/ex2_3_cdo.R)
   Using parameter `CDO_module` is only available in startR_v0.1.3 or above. Interpolate data by using `s2dv::CDORemap` in the workflow.
+
     4. [Use two functions in workflow](inst/doc/usecase/ex2_4_two_func.R)
-    5.
+
+    5. [RainFARM precipitation downscaling](https://earth.bsc.es/gitlab/es/startR/-/blob/develop-RainFARMCase/inst/doc/usecase/ex2_5_rainFARM.R)
+    This example shows how to apply a statistical downscaling function with startR and simultaneously save the data by chunks (e.g., chunking those dimensions which are not required for downscaling) in the esarchive format; mind the memory size if unnecessary dimensions are included. It is not recommended to save big outputs. Consider performing some analysis and then retrieving the result instead of saving the data. This is a simplified example of RainFARM; for more information, visit: https://www.medscope-project.eu/products/data/.
+Find more explanation of this use case in FAQ [How-to-27](inst/doc/faq.md#27-utilize-chunk-number-in-the-function).
+
     6. [Use external parameters in atomic function](inst/doc/usecase/ex2_6_ext_param_func.R)

     7. [Calculate the ensemble-adjusted Continuous Ranked Probability Score (CRPS)](inst/doc/usecase/ex2_7_seasonal_forecast_crps.R)
@@ -94,17 +94,21 @@ and reshape the array without undesired NAs.
 If you need to apply your analysis in a few gridpoints, you may want to consider use case 1.6, but if you need to load a lot of grid points, maybe this a better solution.

     10. [Apply an existing mask on data](inst/doc/usecase/ex2_10_existing_mask.R)
-  This use case shows you how to apply the existing mask file on your data.
-If you need to create the mask file on your own, go to ex2_9_mask.R.
+  This use case shows you how to apply the existing mask file on your data. If you need to create the mask file on your own, go to ex2_9_mask.R.

-    11. [Two datasets with different length of target dimensions](inst/doc/usecase/ex2_11_two_dat_inconsistent_target_dim.R)
-  This use case uses experimental and the corresponding observational data to calculate
-the temporal mean and spatial weighted mean. Notice that the spatial resolutions of the two
-datasets are different, but it still works because lat and lon are target dimensions.
+    11. [Two datasets with different length of target dimensions](inst/doc/usecase/ex2_11_two_dat_inconsistent_target_dim.R)
+  This use case uses experimental and the corresponding observational data to calculate the temporal mean and spatial weighted mean. Notice that the spatial resolutions of the two datasets are different, but it still works because lat and lon are target dimensions.
-    12. [Transform and chunk spatial dimensions](inst/doc/usecase/ex2_12_transform_and_chunk.R)
-  This use case provides an example of transforming and chunking latitude and longitude dimensions.
+    12. [Transform and chunk spatial dimensions](inst/doc/usecase/ex2_12_transform_and_chunk.R)
+  This use case provides an example of transforming and chunking latitude and longitude dimensions.

     13. [Load irregular grid data and interpolate it in the workflow](inst/doc/usecase/ex2_13_irregular_regrid.R)
   This script shows how to load irregular grid data by Start(), then regrid it by s2dv::CDORemap in the workflow. It is a solution before Start() can deal with irregular regridding directly.
+
+3. **Verification workflows**
+    1. [Weekly ECV Subseasonal Hindcast Verification](inst/doc/usecase/ex3_1_SubseasonalECVHindcast.md)
+    This is a practical case to compute monthly skill scores for the ECMWF/S2S-ENSForhc
+subseasonal hindcast using ERA5 as the reference dataset. The ECV is air temperature at surface level (tas).
+Note that since this case is practical, it is heavy and takes a long time to run.
+
diff --git a/inst/doc/usecase/ex2_5_rainFARM.R b/inst/doc/usecase/ex2_5_rainFARM.R
new file mode 100644
index 0000000000000000000000000000000000000000..8d315a03c1f47588f881b39513dbf6b76cf43b7e
--- /dev/null
+++ b/inst/doc/usecase/ex2_5_rainFARM.R
@@ -0,0 +1,109 @@
+# ------------------------------------------------------------------------------
+# Downscaling precipitation using RainFARM
+# ------------------------------------------------------------------------------
+# Note 1: The data could be first transformed with QuantileMapping from CSTools
+# Note 2: Extra parameters could be used to downscale the data: weights, slope...
+# See more information in:
+# https://cran.r-project.org/web/packages/CSTools/vignettes/RainFARM_vignette.html
+# ------------------------------------------------------------------------------
+# MOST IMPORTANT NOTE:
+# startR aims to return a result that fits in your local memory. This aim is
+# the opposite of downscaling, which increases the output size. Therefore, this
+# example saves the data to NetCDF files and provides the mean, which has a
+# reduced size.
+# Warning!!!!!! Use this example with caution to avoid saving undesired data.
+#-------------------------------------------------------------------------------
+
+# Load libraries and functions:
+library(startR)
+library(CSTools)
+library(ncdf4)
+library(s2dv)
+
+# Define the data:
+sdates <- paste0(1993:1996, '1101') # starting dates
+path <- "/esarchive/exp/ecmwf/system5c3s/daily/$var$_s0-24h/$var$_$sdate$.nc"
+data <- Start(dataset = path,
+              var = 'prlr',
+              sdate = sdates,
+              member = 1:3,
+              longitude = values(list(-10, 29)),
+              longitude_reorder = CircularSort(-180, 180),
+              latitude = values(list(18, 57)),
+              time = 'all',
+              return_vars = list(latitude = 'dataset', longitude = 'dataset',
+                                 time = NULL),
+              synonims = list(latitude = c('lat', 'latitude'),
+                              longitude = c('lon', 'longitude'),
+                              member = c('member', 'ensemble')),
+              retrieve = FALSE)
+
+# Define the function:
+Chunk_RF <- function(x, nf, destination, startdates, nchunks = chunk_indices) {
+  lon <- as.numeric(attributes(x)$Variables$dat1$longitude)
+  lat <- as.numeric(attributes(x)$Variables$dat1$latitude)
+  down_data <- RainFARM(x, lon = lon, lat = lat, drop_realization_dim = TRUE,
+                        nf, lon_dim = 'longitude', lat_dim = 'latitude', time_dim = 'time')
+  # detect the dates of forecast time for different start dates
+  time <- attributes(x)$Variables$common$time
+  dates <- lapply(startdates, function(x) {seq(as.Date(x, format = "%Y-%m-%d"),
+                                               x + length(time) - 1, 'day')})
+  dimname <- names(dim(down_data$data))
+  var_dims <- list(ncdim_def(name = 'lon', units = 'degrees',
+                             vals = as.vector(down_data$lon), longname = 'longitude'),
+                   ncdim_def(name = 'lat', units = 'degrees',
+                             vals = as.vector(down_data$lat), longname = 'latitude'),
+                   ncdim_def(name = 'ensemble', units = 'adim',
+                             vals = 1 : dim(down_data$data)[which(dimname == 'member')],
+                             longname = 'ensemble', create_dimvar = TRUE))
+  metadata_var <- list(units = 'm s-1',
+                       cdo_grid_name = paste0('r', length(lon), 'x', length(lat)),
+                       projection = 'none')
+  # startdates and dates depend on the chunk:
+  CSTools:::.saveExp(down_data$data, startdates = startdates[nchunks['sdate']],
+                     dates = dates[[nchunks['sdate']]], defined_dims = var_dims,
+                     varname = 'prlr', metadata_var = metadata_var,
+                     destination = destination)
+  down_data_mean <- s2dv::MeanDims(down_data$data, c('member', 'longitude', 'latitude'))
+  return(down_data_mean)
+}
+
+step <- Step(Chunk_RF,
+             target_dims = c('member', 'longitude', 'latitude', 'time'),
+             output_dims = 'time',
+             use_libraries = c('CSTools', 'ncdf4'),
+             use_attributes = list(data = "Variables"))
+
+workflow <- AddStep(data, step, nf = 4,
+                    destination = "/esarchive/scratch/nperez/git/Flor/cstools/test_RF_start/",
+                    startdates = as.Date(sdates, format = "%Y%m%d"))
+
+res <- Compute(workflow,
+               chunks = list(sdate = 4),
+               threads_load = 2,
+               threads_compute = 4)
+
+#-----------modify according to your personal info---------
+  queue_host = 'nord3' # your own host name for nord3v2
+  temp_dir = '/gpfs/scratch/bsc32/bsc32339/startR_hpc/'
+  ecflow_suite_dir = '/home/Earth/nperez/startR_local/' # your own local directory
+#------------------------------------------------------------
+
+res <- Compute(workflow,
+               chunks = list(sdate = 4),
+               threads_load = 1,
+               threads_compute = 1,
+               cluster = list(queue_host = queue_host,
+                              queue_type = 'slurm',
+                              cores_per_job = 16,
+                              r_module = 'R/4.1.2-foss-2019b',
+                              temp_dir = temp_dir,
+                              polling_period = 10,
+                              job_wallclock = '01:00:00',
+                              max_jobs = 4,
+                              bidirectional = FALSE),
+               ecflow_suite_dir = ecflow_suite_dir,
+               wait = TRUE)
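+
+# A quick sanity check before plotting: 'output1' keeps the 'time' output
+# dimension followed by the non-target dimensions (here: dataset, var, sdate).
+dim(res$output1)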
+
+# Visualize the temporal evolution of the result simultaneously for all sdates:
+matplot(1:215, res$output1[, 1, 1, ], type = "l")
diff --git a/inst/doc/usecase/ex3_1_SubseasonalECVHindcast.md b/inst/doc/usecase/ex3_1_SubseasonalECVHindcast.md
new file mode 100644
index 0000000000000000000000000000000000000000..85015130a5c60462493cb96dd168f410d7d995af
--- /dev/null
+++ b/inst/doc/usecase/ex3_1_SubseasonalECVHindcast.md
@@ -0,0 +1,275 @@
+# Weekly ECV Subseasonal Hindcast Verification
+
+This is a practical case to compute monthly skill scores for the ECMWF/S2S-ENSForhc subseasonal hindcast using ERA5 as the reference dataset. The ECV is air temperature at surface level (tas).
+Note that since this case is practical, it is heavy and takes a long time to run.
+
+Here, we aim to calculate the skill scores of the corresponding hindcast of 2016.
+
+To do this, we will detect the Mondays and Thursdays during the year 2016 (this is the *$sdate$*), which are the days of the week on which this model is initialized.
+However, we will analyze the 20 previous years (this is the *$syear$*) of those initializations.
+
+The following figure shows details of the subseasonal hindcast storage:
+
+
+
+After loading the startR package, the paths to the hindcast should be defined, including labels for $var$, $sdate$ and $syear$:
+
+```r
+library(startR)
+
+ecmwf_path <- paste0('/esarchive/exp/ecmwf/s2s-monthly_ensforhc/',
+                     'weekly_mean/$var$_f24h/$sdate$/$var$_$syear$.nc')
+```
+
+Now, create the sequence of start dates in 2016 following the scheme in this figure:
+
+
+
+```r
+forecast.year <- 2016
+# Mondays
+sdates.seq.mon <- format(seq(as.Date(paste(forecast.year, 01, 04, sep = '-')),
+                             as.Date(paste(forecast.year, 12, 31, sep='-')),
+                             by = 'weeks'), format = '%Y%m%d')
+# Thursdays (Monday+3days)
+sdates.seq.thu <- format(seq(as.Date(paste(forecast.year, 01, 04, sep = '-')) + 3,
+                             as.Date(paste(forecast.year, 12, 31, sep = '-')),
+                             by = 'weeks'), format = '%Y%m%d')
+# Join the dates in order
+sdates.seq <- c(sdates.seq.mon, sdates.seq.thu)
+ind <- order(as.Date(sdates.seq, format = '%Y%m%d')) # dates in order
+sdates.seq <- sdates.seq[ind]
+
+# Leap years, remove 29th of February:
+pos.bis <- which(sdates.seq == paste0(forecast.year,"0229"))
+if(length(pos.bis) != 0) sdates.seq <- sdates.seq[-pos.bis]
+
+exp <- Start(dat = ecmwf_path,
+             var = 'tas',
+             sdate = sdates.seq,
+             syear = 'all', # before hdate
+             time = 'all',
+             ensemble = "all",
+             latitude = indices(1:121),
+             longitude = indices(1:240),
+             syear_depends = 'sdate',
+             return_vars = list(latitude = NULL,
+                                longitude = NULL,
+                                time = c('sdate', 'syear')),
+             num_procs = 16,
+             retrieve = FALSE)
+```
+
+Now, we define the obs dates to load the reference dataset:
+
+_(NOTE: The time attributes of exp returned by Start() cannot be used for defining
+the obs dates because the time values saved in exp data lag one week behind.
+E.g., file for 19600104 has time values 1960-01-11, 1960-01-18, 1960-01-25, 1960-02-01.)_
+
+```r
+# Corresponding ERA5
+## Generate the syears from the sdates.seq
+syears <- t(sapply(sdates.seq, function(x) {
+                     year <- as.numeric(substr(x, 1, 4))
+                     syears <- paste0((year-20):(year-1), substr(x, 5, 8))
+                   }))
+names(dim(syears)) <- c('sdate', 'syear')
+
+## Generate the times from the syears
+Sys.setenv(TZ='UTC') # this helps to get UTC times
+times <- lapply(syears, function(x) {
+                  x <- as.Date(x, format = "%Y%m%d", tz = "UTC")
+                  x <- seq(x, x + 25, by = 'week')
+                  format(x, "%Y%m%d")
+                })
+times <- Reduce(c, times)
+dim(times) <- c(time = 4, sdate = 103, syear = 20)
+times <- s2dv::Reorder(times, c(2,3,1))
+
+obs_path <- paste0("/esarchive/recon/ecmwf/era5/weekly_mean/",
+                   "$var$_f1h-240x121/$var$_$file_date$.nc")
+
+obs <- Start(dat = obs_path,
+             var = 'tas',
+             file_date = times,
+             latitude = indices(1:121),
+             longitude = indices(1:240),
+             split_multiselected_dims = TRUE,
+             return_vars = list(latitude = NULL,
+                                longitude = NULL,
+                                time = 'file_date'),
+             num_procs = 16,
+             retrieve = FALSE)
+```
+
+The next step is to define our function to calculate scores:
+
+
+
+```r
+score_calc <- function(hindcast, reference, sdates.seq, scores = 'all') {
+
+  # Anomaly computation
+  reference <- s2dv::InsertDim(reference, pos = 3, len = 1, name = 'ensemble')
+  clim <- s2dv:::.Clim(hindcast, reference, time_dim = 'syear',
+                       memb_dim = 'ensemble', memb = FALSE)
+  hindcast <- s2dv::Ano(hindcast, clim$clim_exp)
+  reference <- s2dv::Ano(reference, clim$clim_obs)
+
+  # Create objects to store the outputs
+  Scores <- NULL
+  Skill_Scores <- NULL
+
+  for(month in 1:12) {
+    # take only data of 1 month (indices of each month in start_dates)
+    startdates <- which(as.integer(substr(sdates.seq, 5, 6)) == month)
+    hindcast_month <- ClimProjDiags::Subset(hindcast, 'sdate',
+                                            list(startdates))
+    reference_month <- ClimProjDiags::Subset(reference, 'sdate',
+                                             list(startdates))
+    # reshape dimension to be [sdate, ensemble]
+    dim(hindcast_month) <- c(sdate = prod(dim(hindcast_month)[c('sdate', 'syear')]),
+                             dim(hindcast_month)['ensemble'])
+    dim(reference_month) <- c(sdate = prod(dim(reference_month)[c('sdate', 'syear')]),
+                              dim(reference_month)['ensemble'])
+
+    if (any(c('fairrpss', 'frpss', 'all') %in% tolower(scores))) {
+      Scores <- c(Scores,
+                  unlist(easyVerification::veriApply("FairRpss",
+                           fcst = hindcast_month, obs = reference_month,
+                           prob = c(1/3, 2/3), tdim = 1, ensdim = 2)))
+    }
+    if (any(c('faircrpss', 'fcrpss', 'all') %in% tolower(scores))) {
+      Scores <- c(Scores,
+                  unlist(easyVerification::veriApply("FairCrpss",
+                           fcst = hindcast_month, obs = reference_month,
+                           tdim = 1, ensdim = 2)))
+    }
+    if (any(c('enscorr', 'corr', 'all') %in% tolower(scores))) {
+      fcst_mean <- rowMeans(hindcast_month)
+      Scores <- c(Scores,
+                  cor = cor(fcst_mean, reference_month,
+                            use = "complete.obs"),
+                  cor.pv = cor.test(fcst_mean, reference_month,
+                                    use = "complete.obs")$p.value)
+    }
+    if (any(c('bss10', 'fbss10', 'all') %in% scores)) {
+      Scores <- c(Scores,
+                  unlist(easyVerification::veriApply("FairRpss",
+                           fcst = hindcast_month, obs = reference_month,
+                           prob = (1/10), tdim = 1, ensdim = 2)))
+    }
+    if (any(c('bss90', 'fbss90', 'all') %in% scores)) {
+      Scores <- c(Scores,
+                  unlist(easyVerification::veriApply("FairRpss",
+                           fcst = hindcast_month,
+                           obs = reference_month,
+                           prob = (9/10), tdim = 1, ensdim = 2)))
+    }
+    Skill_Scores <- cbind(Skill_Scores, Scores)
+    Scores <- NULL
+  }
+  return(Skill_Scores)
+}
+```
+
+The workflow is created with Step() and AddStep():
+
+```r
+step <- Step(fun = score_calc,
+             target_dims = list(hindcast = c('sdate','syear','ensemble'),
+                                reference = c('sdate', 'syear')),
+             output_dims = list(c('scores', 'month')),
+             use_libraries = c('easyVerification',
+                               'SpecsVerification',
+                               's2dv', 'ClimProjDiags'))
+# Workflow:
+wf <- AddStep(list(exp, obs), step, sdates.seq = sdates.seq, scores = 'all')
+```
+
+Finally, execute the analysis, for instance, in Nord3v2:
+
+```r
+#-----------modify according to your personal info---------
+  queue_host = 'nord4' # your own host name for Nord3v2
+  temp_dir = '/gpfs/scratch/bsc32/bsc32339/startR_hpc/'
+  ecflow_suite_dir = '/home/Earth/nperez/startR_local/' # your own local directory
+#------------------------------------------------------------
+result <- Compute(wf,
+                  chunks = list(time = 4, longitude = 2, latitude = 2),
+                  threads_load = 2,
+                  threads_compute = 12,
+                  cluster = list(queue_host = queue_host,
+                                 queue_type = 'slurm',
+                                 cores_per_job = 12,
+                                 temp_dir = temp_dir,
+                                 polling_period = 10,
+                                 job_wallclock = '03:00:00',
+                                 max_jobs = 16,
+                                 bidirectional = FALSE),
+                  ecflow_suite_dir = ecflow_suite_dir,
+                  wait = TRUE)
+```
+*Notice that the execution of `Compute` may last ~2 hours per chunk. Consider setting `wait` to FALSE (see the [practical guide](https://earth.bsc.es/gitlab/es/startR/-/blob/master/inst/doc/practical_guide.md#collect-and-the-ec-flow-gui)).*
+
+If 'all' scores are requested, the order in the 'scores' dimension of the output will be:
+ - FairRpss_skillscore (1), FairRpss_sd (2),
+ - FairCrpss_skillscore (3), FairCrpss_sd (4),
+ - EnsCorr (5), EnsCorr_pv (6)
+ - FairBSS10 (7), FairBSS10_pv (8)
+ - FairBSS90 (9), FairBSS90_pv (10)
+
+If you choose a subset of 'scores', the output will follow the same order, omitting the non-requested scores. It is useful to display the dimension names to understand the order of the output before creating the plots. The significance can also be calculated using the standard deviation. A simplified method is shown here:
+
+```r
+dim(result$output1)
+library(multiApply) # to use Apply and calculate significance
+library(ClimProjDiags) # to use ArrayToList facilitating the use of PlotLayout
+library(s2dv) # to use PlotLayout combined with PlotEquiMap
+FRPSS.pv <- Apply(result$output1, target_dims = c('scores'),
+                  fun = function(x, my.pvalue) {
+                          (x[1] > 0) & (x[1] > x[2] * qnorm(1 - my.pvalue))},
+                  my.pvalue = 0.05, ncores = 4)$output1
+
+
+sig <- ArrayToList(FRPSS.pv[,1,1,1,,], 1, names = 'dots',
+                   level = 'sublist')
+vars <- ArrayToList(result$output1[1,,1,1,1,,], 1, names = '')
+PlotLayout(fun = PlotEquiMap,
+           plot_dims = c('latitude', 'longitude'),
+           var = vars,
+           lon = attributes(exp)$Variables$common$longitude,
+           lat = attributes(exp)$Variables$common$latitude,
+           brks = seq(-1, 1, 0.2),
+           filled.continents = FALSE, sizetit = NULL, dot_symbol = '/',
+           special_args = sig,
+           toptitle = 'S2S FairRPSS - Hindcast 2016 - sweek 1',
+           title_scale = 0.7,
+           titles = month.name, bar_scale = 0.8,
+           fileout = 'startR/inst/doc/figures/subseasonal_5.png')
+```
+
+
+
+This multipanel plot shows the monthly skill score FairRPSS corresponding to the first lead time (i.e., the first week of each month) of the 20-year hindcast of 2016. Blue (red) grid points correspond to low (high) values of the FairRPSS, where the predictability is low (high). The highest values are found in tropical regions. However, the predictability is different for each month.
+
+Given the high number of significant grid points, an alternative way to display this information could be to mask out the non-significant points, for instance:
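+
+```r
+# A possible sketch: set the non-significant grid points to NA before
+# plotting, reusing the subsets defined above
+vars_sig <- result$output1[1, , 1, 1, 1, , ]
+vars_sig[!FRPSS.pv[, 1, 1, 1, , ]] <- NA
+vars_sig <- ArrayToList(vars_sig, 1, names = '')
+```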
+
+*References*
+
+Wilks, D. S. (2011). Statistical methods in the atmospheric sciences. Elsevier/Academic Press.
+
+Vitart, F., Buizza, R., Alonso Balmaseda, M., Balsamo, G., Bidlot, J.-R., Bonet, A., Fuentes, M., Hofstadler, A., Molteni, F., & Palmer, T. N. (2008). The new VarEPS-monthly forecasting system: A first step towards seamless prediction. Quarterly Journal of the Royal Meteorological Society, 134(636), 1789–1799. https://doi.org/10.1002/qj.322
+
+*Credits to:*
+ - *Original code: Andrea Manrique*
+ - *Adaptation: Núria Pérez-Zanón*
+ - *Code improvement: An-Chi Ho*
diff --git a/tests/testthat/test-Start-transform-all.R b/tests/testthat/test-Start-transform-all.R
new file mode 100644
index 0000000000000000000000000000000000000000..8a9ca657f61e7cf32a1f55d4531cdc515c781479
--- /dev/null
+++ b/tests/testthat/test-Start-transform-all.R
@@ -0,0 +1,141 @@
+# This unit test uses 'all' to do the transformation and tests the output grid.
+# The results should be identical and consistent with the cdo results (up to precision differences).
+# The test contains three calls with different target grids:
+# two with 'r128x64' (from different original grids) and one with 'r100x50'.
+
+context("Transform test target grid: lon and lat = 'all'")
+
+#---------------------------------------------------------------
+# cdo is used to verify the data values
+# Test 1: original grid 'r360x180'
+library(easyNCDF)
+grid1 <- '/esarchive/exp/CMIP6/dcppA-hindcast/CanESM5/DCPP/CCCma/CanESM5/dcppA-hindcast/r1i1p2f1/Omon/tos/gr/v20190429/tos_Omon_CanESM5_dcppA-hindcast_s1980-r1i1p2f1_gr_198101-199012.nc' # 'r128x64'
+path <- '/esarchive/exp/CMIP6/dcppA-hindcast/CESM1-1-CAM5-CMIP5/DCPP/NCAR/CESM1-1-CAM5-CMIP5/dcppA-hindcast/r1i1p1f1/Omon/tos/gr/v20191016/tos_Omon_CESM1-1-CAM5-CMIP5_dcppA-hindcast_s2015-r1i1p1f1_gr_201511-202512.nc' # 'r360x180'
+
+file <- NcOpen(path)
+arr <- NcToArray(file,
+                 dim_indices = list(lat = 1:180, lon = 1:360, time = 1:2),
+                 vars_to_read = 'tos')
+lats <- NcToArray(file,
+                  dim_indices = list(lat = 1:180), vars_to_read = 'lat')
+lons <- NcToArray(file,
+                  dim_indices = list(lon = 1:360), vars_to_read = 'lon')
+NcClose(file)
+
+lons <- as.vector(lons)
+lats <- as.vector(lats)
+suppressWarnings(
+  arr1 <- s2dv::CDORemap(arr, lons = as.vector(lons), lats = as.vector(lats),
+                         grid = grid1, method = 'con', crop = FALSE)
+)
+suppressWarnings(
+  arr2 <- s2dv::CDORemap(arr, lons = as.vector(lons), lats = as.vector(lats),
+                         grid = 'r100x50', method = 'con', crop = FALSE)
+)
+
+#---------------------------------------------------------------
+# Test 2: Original grid 'r432x324'
+path <- '/esarchive/exp/CMIP6/dcppA-hindcast/HadGEM3-GC31-MM/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Amon/tas/gn/v20200417/tas_Amon_HadGEM3-GC31-MM_dcppA-hindcast_s2009-r1i1p1f2_gn_201501-201512.nc' # 'r432x324'
+file <- NcOpen(path)
+arr <- NcToArray(file,
+                 dim_indices = list(lat = 1:324, lon = 1:432, time = 1:2),
+                 vars_to_read = 'tas')
+lats <- NcToArray(file,
+                  dim_indices = list(lat = 1:324), vars_to_read = 'lat')
+lons <- NcToArray(file,
+                  dim_indices = list(lon = 1:432), vars_to_read = 'lon')
+NcClose(file)
+
+suppressWarnings(
+  arr3 <- s2dv::CDORemap(arr, lons = as.vector(lons), lats = as.vector(lats),
+                         grid = 'r128x64', method = 'con', crop = FALSE)
+)
+#---------------------------------------------------------------
+
+path1 <- '/esarchive/exp/CMIP6/dcppA-hindcast/CESM1-1-CAM5-CMIP5/DCPP/NCAR/CESM1-1-CAM5-CMIP5/dcppA-hindcast/r1i1p1f1/Omon/$var$/gr/v20191016/$var$_Omon_CESM1-1-CAM5-CMIP5_dcppA-hindcast_s$sdate$-r1i1p1f1_gr_$sdate$11-202512.nc' # 'r360x180'
+
+test_that("1. 'all'", {
+  suppressWarnings(
+    res1 <- Start(dat = path1,
+                  var = 'tos',
+                  lat = 'all',
+                  lon = 'all',
+                  sdate = '2015', fmonth = indices(1:2),
+                  transform = CDORemapper, transform_extra_cells = 2,
+                  transform_params = list(grid = grid1, method = 'con'),
+                  transform_vars = c('lon','lat'),
+                  synonims = list(lat = c('lat','latitude'),
+                                  lon = c('lon','longitude'), fmonth = c('fmonth','time')),
+                  return_vars = list(lat = NULL, lon = NULL),
+                  retrieve = TRUE)
+  )
+  suppressWarnings(
+    res2 <- Start(dat = path1,
+                  var = 'tos',
+                  lat = 'all',
+                  lon = 'all',
+                  lat_reorder = CircularSort(-90, 90),
+                  lon_reorder = CircularSort(-180, 180),
+                  sdate = '2015', fmonth = indices(1:2),
+                  transform = CDORemapper, transform_extra_cells = 2,
+                  transform_params = list(grid = 'r100x50', method = 'con'),
+                  transform_vars = c('lon','lat'),
+                  synonims = list(lat = c('lat','latitude'),
+                                  lon = c('lon','longitude'), fmonth = c('fmonth','time')),
+                  return_vars = list(lat = NULL, lon = NULL),
+                  retrieve = FALSE)
+  )
+  # res1
+  expect_equal(
+    as.vector(res1),
+    as.vector(arr1$data_array)
+  )
+  expect_equal(
+    as.vector(attributes(res1)$Variables$common$lon),
+    as.vector(arr1$lon)
+  )
+  # res2
+  expect_equal(
+    as.vector(attributes(res2)$Variables$common$lon + 180),
+    as.vector(arr2$lon)
+  )
+  expect_equal(
+    as.vector(attributes(res2)$Variables$common$lat),
+    as.vector(arr2$lat)
+  )
+})
+
+#---------------------------------------------------------------
+
+path2 <- '/esarchive/exp/CMIP6/dcppA-hindcast/HadGEM3-GC31-MM/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/r1i1p1f2/Amon/$var$/gn/v20200417/$var$_Amon_HadGEM3-GC31-MM_dcppA-hindcast_s$sdate$-r1i1p1f2_gn_$sdate$11-201512.nc' # 'r432x324'
+
+test_that("2. test path 2", {
+  suppressWarnings(
+    res3 <- Start(dat = path2,
+                  var = 'tas',
+                  lon = 'all',
+                  lat = 'all',
+                  lon_reorder = CircularSort(-180, 180),
+                  sdate = paste0(2015), fmonth = indices(1:2),
+                  transform = CDORemapper, transform_extra_cells = 2,
+                  transform_params = list(grid = "r128x64", method = 'con'),
+                  transform_vars = c('lon','lat'),
+                  synonims = list(lat = c('lat','latitude'),
+                                  lon = c('lon','longitude'), fmonth = c('fmonth','time')),
+                  return_vars = list(lat = NULL, lon = NULL, time = 'sdate'),
+                  retrieve = FALSE)
+  )
+  # res3
+  expect_equal(
+    as.vector(attributes(res3)$Variables$common$lon + 180),
+    as.vector(arr3$lon)
+  )
+  expect_equal(
+    as.vector(attributes(res3)$Variables$common$lat),
+    as.vector(arr3$lat)
+  )
+})