From 5c5b1b511692ce3d47572fef022a40be2c7c657f Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 10 Aug 2020 12:12:46 +0200 Subject: [PATCH 1/4] Bugfix for Start() parameter 'path_glob_permissive' and write the explanation in FAQ and Start() header. --- R/Start.R | 31 ++++++++++++++++++++++++++++--- inst/doc/faq.md | 10 ++++++++-- man/Start.Rd | 15 ++++++++++++--- 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/R/Start.R b/R/Start.R index 62d7e8c..927a67e 100644 --- a/R/Start.R +++ b/R/Start.R @@ -644,8 +644,8 @@ #'When specifying a path pattern for a dataset, it might contain shell glob #'experissions. For each dataset, the first file matching the path pattern is #'found, and the found file is used to work out fixed values for the glob -#'expressions that will be used for all the files of the dataset. However in -#'some cases the values of the shell glob expressions may not be constant for +#'expressions that will be used for all the files of the dataset. However, in +#'some cases, the values of the shell glob expressions may not be constant for #'all files in a dataset, and they need to be worked out for each file #'involved.\cr\cr #'For example, a path pattern could be as follows: \cr @@ -667,7 +667,16 @@ #' pattern with the original glob expressions in the 1st and 2nd levels (in the #' example, both asterisks would be preserved, thus would allow Start() #' to recognize files such as \cr -#' \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}). +#' \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).\cr\cr +#'Note that each glob expression can only represent one possibility (Start() +#'chooses the first). Because \code{*} is not a tag, it cannot +#'be a dimension of the output array. Therefore, only one possibility can be +#'adopted. 
For example, if \cr +#'\code{'/path/to/dataset/precipitation_*/19901101_*_foo.nc'}\cr +#'has two matches:\cr +#'\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr +#'\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr +#'only the first found file will be used. #'@param retrieve A logical value indicating whether to retrieve the data #' defined in the Start() call or to explore only its dimension lengths #' and names, and the values for the file and inner dimensions. The default @@ -1671,6 +1680,8 @@ Start <- function(..., # dim = indices/selectors, replace_values[[u_file_dim]] <- '*' depended_dim <- NULL depended_dim_values <- NA + +#NOTE: Here 'selectors' is always 1. Is it supposed to be like this? selectors <- dat_selectors[[u_file_dim]][[1]] if (u_file_dim %in% names(depending_file_dims)) { depended_dim <- depending_file_dims[[u_file_dim]] @@ -1772,6 +1783,20 @@ Start <- function(..., # dim = indices/selectors, sub_array_of_not_found_files[j] <- TRUE } else { file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values) + +#NOTE: After replacing tags, there is still * if path_glob_permissive is not FALSE. + if (grepl('\\*', file_path)) { + found_files <- Sys.glob(file_path) + file_path <- found_files[1] # choose only the first file. +#NOTE: Above line chooses only the first found file. Because * is not tags, which means +# it is not a dimension. So it cannot store more than one item. If use * to define +# the path, that * should only represent one possibility. + if (length(found_files) > 1) { + .warning("Using glob expression * to define the path, but more ", + "than one match is found. 
Choose the first match only.") + } + } + if (!(length(grep("^http", file_path)) > 0)) { if (grepl(file_path, '*', fixed = TRUE)) { file_path_full <- Sys.glob(file_path)[1] diff --git a/inst/doc/faq.md b/inst/doc/faq.md index 4ad94a0..a4971c7 100644 --- a/inst/doc/faq.md +++ b/inst/doc/faq.md @@ -685,12 +685,18 @@ The glob expression, or wildcard, '*', can also be used in the path definition, Please note that **'*' can only be used to replace the common part of all the files**. For example, if all the required files have the folder 'EC-Earth-Consortium/' in their path, then this part can be substituted with '*/'. It can save some effort to define the long and uncritical path, and also make the script cleaner. -However, if the part replaced by '*' is not same among all the files, Start() will use the first pattern it finds in the first file to substitute '*'. +However, if the part replaced by '*' is not same among all the files, Start() will use **the first pattern it finds in the first file to substitute '*'**. As a result, the rest files may not be found due to the wrong path pattern. For example, if the first file is under a folder named 'v20190302/' and the second file is under another one named 'v20190308/', and you define the path pattern as 'v*/', then Start() will use 'v20190302/' for both file paths. This is different from the common definition of glob expression that tries to expand to match all the existing patterns, so please be careful when using it. -There is a parameter 'path_glob_permissive' in Start(). If set it to TRUE, the '*' in the filename itself will remain (i.e., as the common definition), while the ones in the path to the filename will still be replaced by the pattern in the first found file. +There is a parameter 'path_glob_permissive' in Start() can be used to perserve the +functionality of '*'. 
It can be FALSE/TRUE or an integer indicating how many folder layers +in the path pattern, beginning from the end, the shell glob expressions to be preserved. + +The default value is FALSE (equal to 0), which means no '*' is preserved. +If set it to TRUE (equal to 1), the '*' in the filename will remain and represent different possiblities of the file path pattern. See more details in Start() parameter +'path_glob_permissive'. # Something goes wrong... diff --git a/man/Start.Rd b/man/Start.Rd index d69562c..9411940 100644 --- a/man/Start.Rd +++ b/man/Start.Rd @@ -333,8 +333,8 @@ is FALSE.} When specifying a path pattern for a dataset, it might contain shell glob experissions. For each dataset, the first file matching the path pattern is found, and the found file is used to work out fixed values for the glob -expressions that will be used for all the files of the dataset. However in -some cases the values of the shell glob expressions may not be constant for +expressions that will be used for all the files of the dataset. However, in +some cases, the values of the shell glob expressions may not be constant for all files in a dataset, and they need to be worked out for each file involved.\cr\cr For example, a path pattern could be as follows: \cr @@ -356,7 +356,16 @@ For example, a path pattern could be as follows: \cr pattern with the original glob expressions in the 1st and 2nd levels (in the example, both asterisks would be preserved, thus would allow Start() to recognize files such as \cr - \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).} + \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).\cr\cr +Note that each glob expression can only represent one possibility (Start() +chooses the first). Because \code{*} is not a tag, it cannot +be a dimension of the output array. Therefore, only one possibility can be +adopted. 
For example, if \cr +\code{'/path/to/dataset/precipitation_*/19901101_*_foo.nc'}\cr +has two matches:\cr +\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr +\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr +only the first found file will be used.} \item{retrieve}{A logical value indicating whether to retrieve the data defined in the Start() call or to explore only its dimension lengths -- GitLab From 97b9c5a4c0945edecc8777d89c5deb87c94a97fb Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 10 Aug 2020 12:18:40 +0200 Subject: [PATCH 2/4] Format fix for how-to-18 --- inst/doc/faq.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/inst/doc/faq.md b/inst/doc/faq.md index a4971c7..0cc5256 100644 --- a/inst/doc/faq.md +++ b/inst/doc/faq.md @@ -685,7 +685,7 @@ The glob expression, or wildcard, '*', can also be used in the path definition, Please note that **'*' can only be used to replace the common part of all the files**. For example, if all the required files have the folder 'EC-Earth-Consortium/' in their path, then this part can be substituted with '*/'. It can save some effort to define the long and uncritical path, and also make the script cleaner. -However, if the part replaced by '*' is not same among all the files, Start() will use **the first pattern it finds in the first file to substitute '*'**. +However, if the part replaced by '\*' is not same among all the files, Start() will use **the first pattern it finds in the first file to substitute '*'**. As a result, the rest files may not be found due to the wrong path pattern. For example, if the first file is under a folder named 'v20190302/' and the second file is under another one named 'v20190308/', and you define the path pattern as 'v*/', then Start() will use 'v20190302/' for both file paths. 
This is different from the common definition of glob expression that tries to expand to match all the existing patterns, so please be careful when using it. @@ -693,10 +693,9 @@ This is different from the common definition of glob expression that tries to ex There is a parameter 'path_glob_permissive' in Start() can be used to perserve the functionality of '*'. It can be FALSE/TRUE or an integer indicating how many folder layers in the path pattern, beginning from the end, the shell glob expressions to be preserved. - -The default value is FALSE (equal to 0), which means no '*' is preserved. -If set it to TRUE (equal to 1), the '*' in the filename will remain and represent different possiblities of the file path pattern. See more details in Start() parameter -'path_glob_permissive'. +The default value is FALSE (equal to 0), which means no '\*' is preserved. +If set it to TRUE (equal to 1), the '\*' in the filename will remain and represent different possiblities of the file path pattern. See more details in Start() parameter +'path\_glob\_permissive'. # Something goes wrong... -- GitLab From 995ec112c0f9f29fb67c3188bbc1801f699d718a Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 17 Aug 2020 17:51:25 +0200 Subject: [PATCH 3/4] Create unit test and use case for path_glob_permissive --- NEWS.md | 1 + inst/doc/faq.md | 2 +- inst/doc/usecase.md | 4 + inst/doc/usecase/ex1_9_path_glob_permissive.R | 114 ++++++++++++++++++ .../test-Start-path_glob_permissive.R | 90 ++++++++++++++ 5 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 inst/doc/usecase/ex1_9_path_glob_permissive.R create mode 100644 tests/testthat/test-Start-path_glob_permissive.R diff --git a/NEWS.md b/NEWS.md index 4ebe2d6..3577fbb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,7 @@ # startR v2.0.1 (Release date: 2020-08-) - Bugfix for metadata in the condition that reorder or transform is applied and 'return_vars' is NULL. - Bugfix for the missing first file case. 
It showed an error before when the first file is not found but now it works. +- Bugfix for the parameter 'path_glob_permissive' of Start(). # startR v2.0.0 (Release date: 2020-08-06) - Adopt Roxygen2 documentation format diff --git a/inst/doc/faq.md b/inst/doc/faq.md index 0cc5256..d7f10bf 100644 --- a/inst/doc/faq.md +++ b/inst/doc/faq.md @@ -695,7 +695,7 @@ functionality of '*'. It can be FALSE/TRUE or an integer indicating how many fol in the path pattern, beginning from the end, the shell glob expressions to be preserved. The default value is FALSE (equal to 0), which means no '\*' is preserved. If set it to TRUE (equal to 1), the '\*' in the filename will remain and represent different possiblities of the file path pattern. See more details in Start() parameter -'path\_glob\_permissive'. +'path\_glob\_permissive' and the use case [ex1_9](inst/doc/usecase/ex1_9_path_glob_permissive.R). # Something goes wrong... diff --git a/inst/doc/usecase.md b/inst/doc/usecase.md index 65dbd2d..e2ffdc0 100644 --- a/inst/doc/usecase.md +++ b/inst/doc/usecase.md @@ -38,6 +38,10 @@ The problem may occur when the dimension number of the splitted selector is more 8. [Loading tas and tos from Decadal Predictions performed with the EC-Earth model](inst/doc/usecase/ex1_8_tasandtos.R) Some climate indices needs to be computed loading 'tas' (air temperature at 2m) over land and 'tos' (ocean surface temperature) over sea. Using **startR**, you can load these data in a unique **Start** call or with multiple calls separately for each variable. + 9. [Use glob expression * to define the path](inst/doc/usecase/ex1_9_path_glob_permissive.R) + This script shows you how to use glob expression '*' and the parameter 'path_glob_permissive' of Start(). +You can also find information in [FAQ How-to-18](inst/doc/faq.md#18-use-glob-expression-to-define-the-file-path). + 2. **Execute computation (use `Compute()`)** 1. 
[Function working on time dimension](inst/doc/usecase/ex2_1_timedim.R) diff --git a/inst/doc/usecase/ex1_9_path_glob_permissive.R b/inst/doc/usecase/ex1_9_path_glob_permissive.R new file mode 100644 index 0000000..f52f7e0 --- /dev/null +++ b/inst/doc/usecase/ex1_9_path_glob_permissive.R @@ -0,0 +1,114 @@ +#--------------------------------------------------------------------- +# This script shows you how to use glob expression '*' and the parameter 'path_glob_permissive' of Start(). +# The regular way to define the path for Start() is using the tags, i.e., '$$'. +# However, we may want to use '*' to define the path under some conditions. +# Start() allows this usage but with some limitation. You can find more details in Start() documentation, the parameter 'path_glob_permissive'. +#--------------------------------------------------------------------- +library(startR) + +#----------------------------------------------------------------- +# Case 1: Use * to substitute a string which is common among files +#----------------------------------------------------------------- +## This is a lazy way to define the path. When there is a common string among all +## the files, you can use * instead of the string. +## However, it is not recommended to use because you may not notice when the file +## path is changed and Start() reads the unwanted files. The more clear-defined the +## path, the safer it is. + +## We want to read the two files, 1960 and 1961, from the repository. The full path +## definition is below. +repo_original <- paste0('/esarchive/exp/ecearth/a1st/diags/CMIP/EC-Earth-Consortium/', + 'EC-Earth3/historical/r7i1p1f1/Omon/$var$/gn/v20190302/', + '$var$_Omon_EC-Earth3_historical_r7i1p1f1_gn_$year$.nc') +years <- paste0(c(1960:1961), '01-', c(1960:1961), '12') + +## We know that there are some layers of the path are unique and identical between +## the two files. We can change those layers to *. 
+## Please note that if the string is different between files, * must be used with the +## parameter 'path_glob_permissive' (See Case 2 below). +repos <- paste0('/esarchive/exp/ecearth/a1st/diags/CMIP/*/', + '*/*/r7i1p1f1/Omon/$var$/*/v20190302/', + '$var$_Omon_*_*_r7i1p1f1_*_$year$.nc') + +data <- Start(dat = repos, + var = 'tosmean', + year = years, + time = indices(1), + region = indices(1), + return_vars = list(time = NULL, region = NULL), + retrieve = T) + + +#------------------------------------------------------------- +# Case 2: Use * to substitute a name which differs among files +#------------------------------------------------------------- +## We want to read two experiments together, while their member and version name are +## different. The data structure is like this: +## -- expid -- -- member -- -- version -- +## a1st r7i1p1f1 v20190302 +## a1sx r10i1p1f1 v20190308 +## +## We cannot simply substitute 'member' and 'version' with *, because Start() will only detect +## the first file path and use the string in the first file to represent the remaining files. +## Therefore, the remaining files cannot be found. That is, Start() will assume that a1st and a1sx +## both have member 'r7i1p1f1' and version 'v20190302'. +## Fortunately, the expid only has one member and one version each. We can use * combined +## with the parameter 'path_glob_permissive' to make Start() work. + +## The * from the end in order represents: 1. member 2. version 3. member +repos <- paste0('/esarchive/exp/ecearth/$expid$/diags/CMIP/EC-Earth-Consortium/', + 'EC-Earth3/historical/*/Omon/$var$/gn/v*/', + '$var$_Omon_EC-Earth3_historical_*_gn_$year$.nc') +years <- paste0(c(1960:1961), '01-', c(1960:1961), '12') + +## The parameter 'path_glob_permissive' can be assigned with TRUE/FALSE or an integer. +## TRUE equals 1, and FALSE equals 0. The number indicates how many layers, counting from +## the end, keep * preserved for all files. 
That is, Start() won't replace * with the string +## found in the first file. Start() will use * to look for the pattern for each file. +## So, since the three * in the above path are all different between a1st and a1sx, we need +## to preserve all of them. The furthest layer from the end is 6, so 'path_glob_permissive' +## should be 6. +data <- Start(dat = repos, + var = 'tosmean', + expid = c('a1st', 'a1sx'), + year = years, + time = indices(1), + region = indices(1), + path_glob_permissive = 6, + return_vars = list(time = NULL, region = NULL), + retrieve = T) + +## What if a certain expid has more than one 'member' or 'version'? For example, +## -- expid -- -- member -- -- version -- +## a1st r7i1p1f1 v20190302 +## v20200302 +## a1sx r10i1p1f1 v20190308 + +## In this case, Start() will only catch 'v20190302' for a1st. This is the main difference +## between * and a tag. A tag is a dimension of the output array, so it can hold as many +## values as you want. On the other hand, * is not a dimension, so it can only represent one +## possibility for one file path. + + +## You may think of the usage of '_depends', which also tells Start() that one tag (i.e., +## dimension) is dependent on another one. However, '_depends' can only be used when the +## dependency is between two tags. In this case, we can define the dependency between 'expid' +## and 'member', and use * for 'version'. +repos <- paste0('/esarchive/exp/ecearth/$expid$/diags/CMIP/EC-Earth-Consortium/', + 'EC-Earth3/historical/$member$/Omon/$var$/gn/v*/', + '$var$_Omon_EC-Earth3_historical_*_gn_$year$.nc') + +## You can see that the layers containing * are the last two. Therefore, 'path_glob_permissive' +## is 2 here. 
+data <- Start(dat = repos, + var = 'tosmean', + expid = c('a1st', 'a1sx'), + year = years, + member = indices(1), + member_depends = 'expid', + time = indices(1), + region = indices(1), + path_glob_permissive = 2, + return_vars = list(time = NULL, region = NULL), + retrieve = T) + diff --git a/tests/testthat/test-Start-path_glob_permissive.R b/tests/testthat/test-Start-path_glob_permissive.R new file mode 100644 index 0000000..9e14c02 --- /dev/null +++ b/tests/testthat/test-Start-path_glob_permissive.R @@ -0,0 +1,90 @@ +context("Start() path_glob_permissive check") + +test_that("1. expid/member/version", { + +years <- paste0(c(1960:1961), '01-', c(1960:1961), '12') + +# from the end, each *: 1. member 2. version 3. member +repos <- paste0('/esarchive/exp/ecearth/$expid$/diags/CMIP/EC-Earth-Consortium/', + 'EC-Earth3/historical/*/Omon/$var$/gn/v*/', + '$var$_Omon_EC-Earth3_historical_*_gn_$year$.nc') + +data <- Start(dat = repos, + var = 'tosmean', + expid = c('a1st', 'a1sx'), + year = years, + time = indices(1), + region = indices(1), + path_glob_permissive = 6, #TRUE, + return_vars = list(time = NULL, region = NULL), + retrieve = T) + + + expect_equal( + dim(data), + c(dat = 1, var = 1, expid = 2, year = 2, time = 1, region = 1) + ) + expect_equal( + dim(attr(data, 'Files')), + c(dat = 1, var = 1, expid = 2, year = 2) + ) + expect_equal( + attr(data, 'Files'), + array(c("/esarchive/exp/ecearth/a1st/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r7i1p1f1/Omon/tosmean/gn/v20190302/tosmean_Omon_EC-Earth3_historical_r7i1p1f1_gn_196001-196012.nc", + "/esarchive/exp/ecearth/a1sx/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r10i1p1f1/Omon/tosmean/gn/v20190308/tosmean_Omon_EC-Earth3_historical_r10i1p1f1_gn_196001-196012.nc", + "/esarchive/exp/ecearth/a1st/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r7i1p1f1/Omon/tosmean/gn/v20190302/tosmean_Omon_EC-Earth3_historical_r7i1p1f1_gn_196101-196112.nc", + 
"/esarchive/exp/ecearth/a1sx/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r10i1p1f1/Omon/tosmean/gn/v20190308/tosmean_Omon_EC-Earth3_historical_r10i1p1f1_gn_196101-196112.nc"), + dim = c(dat = 1, var = 1, expid = 2, year = 2)) + ) + expect_equal( + data[1, 1, , , 1, 1], + matrix(c(18.60422, 17.13862, 18.52348, 17.21780), 2, 2), + tolerance = 0.0001 + ) + + +# from the end, each *: 1. member 2. version +repos <- paste0('/esarchive/exp/ecearth/$expid$/diags/CMIP/EC-Earth-Consortium/', + 'EC-Earth3/historical/$member$/Omon/$var$/gn/v*/', + '$var$_Omon_EC-Earth3_historical_*_gn_$year$.nc') + +data <- Start(dat = repos, + var = 'tosmean', + expid = c('a1st', 'a1sx'), + year = years, + member = indices(1), + member_depends = 'expid', + time = indices(1), + region = indices(1), + path_glob_permissive = 2, #TRUE, + return_vars = list(time = NULL, region = NULL), + retrieve = T) + + + expect_equal( + dim(data), + c(dat = 1, var = 1, expid = 2, year = 2, member = 1, time = 1, region = 1) + ) + expect_equal( + dim(attr(data, 'Files')), + c(dat = 1, var = 1, expid = 2, year = 2, member = 1) + ) + expect_equal( + attr(data, 'Files'), + array(c("/esarchive/exp/ecearth/a1st/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r7i1p1f1/Omon/tosmean/gn/v20190302/tosmean_Omon_EC-Earth3_historical_r7i1p1f1_gn_196001-196012.nc", + "/esarchive/exp/ecearth/a1sx/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r10i1p1f1/Omon/tosmean/gn/v20190308/tosmean_Omon_EC-Earth3_historical_r10i1p1f1_gn_196001-196012.nc", + "/esarchive/exp/ecearth/a1st/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r7i1p1f1/Omon/tosmean/gn/v20190302/tosmean_Omon_EC-Earth3_historical_r7i1p1f1_gn_196101-196112.nc", + "/esarchive/exp/ecearth/a1sx/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r10i1p1f1/Omon/tosmean/gn/v20190308/tosmean_Omon_EC-Earth3_historical_r10i1p1f1_gn_196101-196112.nc"), + dim = c(dat = 1, var = 1, expid = 2, year = 2, member = 1)) + ) + expect_equal( + data[1, 1, , , 1, 1, 1], 
+ matrix(c(18.60422, 17.13862, 18.52348, 17.21780), 2, 2), + tolerance = 0.0001 + ) + expect_equal( + attr(data, 'FileSelectors')$dat1$member, + list(a1st = 'r7i1p1f1', a1sx = 'r10i1p1f1') + ) + +}) -- GitLab From 6096b98acd4c9ddfa7db6a74495407dbb98a0a12 Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 17 Aug 2020 17:53:41 +0200 Subject: [PATCH 4/4] Format fix --- inst/doc/usecase.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/doc/usecase.md b/inst/doc/usecase.md index e2ffdc0..a25f2a9 100644 --- a/inst/doc/usecase.md +++ b/inst/doc/usecase.md @@ -38,7 +38,7 @@ The problem may occur when the dimension number of the splitted selector is more 8. [Loading tas and tos from Decadal Predictions performed with the EC-Earth model](inst/doc/usecase/ex1_8_tasandtos.R) Some climate indices needs to be computed loading 'tas' (air temperature at 2m) over land and 'tos' (ocean surface temperature) over sea. Using **startR**, you can load these data in a unique **Start** call or with multiple calls separately for each variable. - 9. [Use glob expression * to define the path](inst/doc/usecase/ex1_9_path_glob_permissive.R) + 9. [Use glob expression * to define the path](inst/doc/usecase/ex1_9_path_glob_permissive.R) This script shows you how to use glob expression '*' and the parameter 'path_glob_permissive' of Start(). You can also find information in [FAQ How-to-18](inst/doc/faq.md#18-use-glob-expression-to-define-the-file-path). -- GitLab