diff --git a/NEWS.md b/NEWS.md index 2a122b2baca54eca0a10875bdeea01190d765e98..e364792c7eda21368ac68a31cf891841913eab79 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,7 @@ '1 data set, 1 variable'. For 1 data set case, all the variables should be listed under $common in the attributes; for more than 1 data set case, the variables should be listed under each $dat. - Bugfix for the missing first file case. It showed an error before when the first file is not found but now it works. +- Bugfix for the parameter 'path_glob_permissive' of Start(). # startR v2.0.0 (Release date: 2020-08-06) - Adopt Roxygen2 documentation format diff --git a/R/Start.R b/R/Start.R index 990d8e7b8823b44bbe9f4825eb80164e8a79b8c6..422372d630fd56c8dd8b90417ccc7347c290d1a5 100644 --- a/R/Start.R +++ b/R/Start.R @@ -644,8 +644,8 @@ #'When specifying a path pattern for a dataset, it might contain shell glob #'experissions. For each dataset, the first file matching the path pattern is #'found, and the found file is used to work out fixed values for the glob -#'expressions that will be used for all the files of the dataset. However in -#'some cases the values of the shell glob expressions may not be constant for +#'expressions that will be used for all the files of the dataset. However, in +#'some cases, the values of the shell glob expressions may not be constant for #'all files in a dataset, and they need to be worked out for each file #'involved.\cr\cr #'For example, a path pattern could be as follows: \cr @@ -667,7 +667,16 @@ #' pattern with the original glob expressions in the 1st and 2nd levels (in the #' example, both asterisks would be preserved, thus would allow Start() #' to recognize files such as \cr -#' \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}). +#' \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).\cr\cr +#'Note that each glob expression can only represent one possibility (Start() +#'chooses the first). 
This is because \code{*} is not a tag, which means it cannot +#'be a dimension of the output array. Therefore, only one possibility can be +#'adopted. For example, if \cr +#'\code{'/path/to/dataset/precipitation_*/19901101_*_foo.nc'}\cr +#'has two matches:\cr +#'\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr +#'\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr +#'only the first found file will be used. #'@param retrieve A logical value indicating whether to retrieve the data #' defined in the Start() call or to explore only its dimension lengths #' and names, and the values for the file and inner dimensions. The default @@ -1686,6 +1695,8 @@ Start <- function(..., # dim = indices/selectors, replace_values[[u_file_dim]] <- '*' depended_dim <- NULL depended_dim_values <- NA + +#NOTE: Here 'selectors' is always 1. Is it supposed to be like this? selectors <- dat_selectors[[u_file_dim]][[1]] if (u_file_dim %in% names(depending_file_dims)) { depended_dim <- depending_file_dims[[u_file_dim]] @@ -1787,6 +1798,20 @@ Start <- function(..., # dim = indices/selectors, sub_array_of_not_found_files[j] <- TRUE } else { file_path <- .ReplaceVariablesInString(dat[[i]][['path']], replace_values) + +#NOTE: After replacing tags, there is still * if path_glob_permissive is not FALSE. + if (grepl('\\*', file_path)) { + found_files <- Sys.glob(file_path) + file_path <- found_files[1] # choose only the first file. +#NOTE: Above line chooses only the first found file. Because * is not tags, which means +# it is not a dimension. So it cannot store more than one item. If use * to define +# the path, that * should only represent one possibility. + if (length(found_files) > 1) { + .warning("Using glob expression * to define the path, but more ", + "than one match is found. 
Choose the first match only.") + } + } + if (!(length(grep("^http", file_path)) > 0)) { if (grepl(file_path, '*', fixed = TRUE)) { file_path_full <- Sys.glob(file_path)[1] diff --git a/inst/doc/faq.md b/inst/doc/faq.md index 7121ca13e3526c3c9173e4b7a2ac3805f7d0293e..8f5c884c20ce531a3adf122743c191b1744fcb59 100644 --- a/inst/doc/faq.md +++ b/inst/doc/faq.md @@ -686,12 +686,17 @@ The glob expression, or wildcard, '*', can also be used in the path definition, Please note that **'*' can only be used to replace the common part of all the files**. For example, if all the required files have the folder 'EC-Earth-Consortium/' in their path, then this part can be substituted with '*/'. It can save some effort to define the long and uncritical path, and also make the script cleaner. -However, if the part replaced by '*' is not same among all the files, Start() will use the first pattern it finds in the first file to substitute '*'. +However, if the part replaced by '\*' is not the same among all the files, Start() will use **the first pattern it finds in the first file to substitute '*'**. As a result, the rest files may not be found due to the wrong path pattern. For example, if the first file is under a folder named 'v20190302/' and the second file is under another one named 'v20190308/', and you define the path pattern as 'v*/', then Start() will use 'v20190302/' for both file paths. This is different from the common definition of glob expression that tries to expand to match all the existing patterns, so please be careful when using it. -There is a parameter 'path_glob_permissive' in Start(). If set it to TRUE, the '*' in the filename itself will remain (i.e., as the common definition), while the ones in the path to the filename will still be replaced by the pattern in the first found file. +The parameter 'path_glob_permissive' of Start() can be used to preserve the +functionality of '*'. 
It can be FALSE/TRUE or an integer indicating in how many folder layers +of the path pattern, counting from the end, the shell glob expressions are to be preserved. +The default value is FALSE (equal to 0), which means no '\*' is preserved. +If it is set to TRUE (equal to 1), the '\*' in the filename will remain and represent different possibilities of the file path pattern. See more details in Start() parameter +'path\_glob\_permissive' and the use case [ex1_9](inst/doc/usecase/ex1_9_path_glob_permissive.R). ### 19. Use 'metadata_dims' to retrieve variable metadata In addition to retrieve the data value, Start() can retrieve the auxiliary data as well. diff --git a/inst/doc/usecase.md b/inst/doc/usecase.md index 1d3e8774d740f1f5277f5d000c13dc3c6468ffff..521200cc679975c2bf856a70b350b038efe7b4e6 100644 --- a/inst/doc/usecase.md +++ b/inst/doc/usecase.md @@ -38,11 +38,16 @@ The problem may occur when the dimension number of the splitted selector is more 8. [Loading tas and tos from Decadal Predictions performed with the EC-Earth model](inst/doc/usecase/ex1_8_tasandtos.R) Some climate indices needs to be computed loading 'tas' (air temperature at 2m) over land and 'tos' (ocean surface temperature) over sea. Using **startR**, you can load these data in a unique **Start** call or with multiple calls separately for each variable. + 9. [Use glob expression * to define the path](inst/doc/usecase/ex1_9_path_glob_permissive.R) + This script shows you how to use glob expression '*' and the parameter 'path_glob_permissive' of Start(). +You can also find information in [FAQ How-to-18](inst/doc/faq.md#18-use-glob-expression-to-define-the-file-path). + 10. [Use 'metadata_dims' to retrieve complete variable metadata](inst/doc/usecase/ex1_10_metadata_dims.R) This script tells you how to use the parameter 'metadata_dims' in Start() to get the complete variable metadata. You will see four difference cases and learn the rules. 
You can find more explanation in FAQ [How-to-19](inst/doc/faq.md#19-use-metadata_dims-to-retrieve-variable-metadata). + 2. **Execute computation (use `Compute()`)** 1. [Function working on time dimension](inst/doc/usecase/ex2_1_timedim.R) 2. [Function using attributes of the data](inst/doc/usecase/ex2_2_attr.R) diff --git a/inst/doc/usecase/ex1_9_path_glob_permissive.R b/inst/doc/usecase/ex1_9_path_glob_permissive.R new file mode 100644 index 0000000000000000000000000000000000000000..f52f7e0ee9f942fb57273aac63efec255f02e87b --- /dev/null +++ b/inst/doc/usecase/ex1_9_path_glob_permissive.R @@ -0,0 +1,114 @@ +#--------------------------------------------------------------------- +# This script shows you how to use glob expression '*' and the parameter 'path_glob_permissive' of Start(). +# The regular way to define the path for Start() is using the tags, i.e., '$$'. +# However, we may want to use '*' to define the path under some conditions. +# Start() allows this usage but with some limitation. You can find more details in Start() documentation, the parameter 'path_glob_permissive'. +#--------------------------------------------------------------------- +library(startR) + +#----------------------------------------------------------------- +# Case 1: Use * to substitute a string which is common among files +#----------------------------------------------------------------- +## This is a lazy way to define the path. When there is a common string among all +## the files, you can use * instead of the string. +## However, it is not recommended to use because you may not notice when the file +## path is changed and Start() reads the unwanted files. The more clear-defined the +## path, the safer it is. + +## We want to read the two files, 1960 and 1961, from the repository. The full path +## definition is below. 
+repo_original <- paste0('/esarchive/exp/ecearth/a1st/diags/CMIP/EC-Earth-Consortium/', + 'EC-Earth3/historical/r7i1p1f1/Omon/$var$/gn/v20190302/', + '$var$_Omon_EC-Earth3_historical_r7i1p1f1_gn_$year$.nc') +years <- paste0(c(1960:1961), '01-', c(1960:1961), '12') + +## We know that some layers of the path are unique and identical between +## the two files. We can change those layers to *. +## Please note that if the string is different between files, * must be used with the +## parameter 'path_glob_permissive' (See Case 2 below). +repos <- paste0('/esarchive/exp/ecearth/a1st/diags/CMIP/*/', + '*/*/r7i1p1f1/Omon/$var$/*/v20190302/', + '$var$_Omon_*_*_r7i1p1f1_*_$year$.nc') + +data <- Start(dat = repos, + var = 'tosmean', + year = years, + time = indices(1), + region = indices(1), + return_vars = list(time = NULL, region = NULL), + retrieve = T) + + +#------------------------------------------------------------- +# Case 2: Use * to substitute a name which differs from files +#------------------------------------------------------------- +## We want to read two experiments together, while their member and version name are +## different. The data structure is like this: +## -- expid -- -- member -- -- version -- +## a1st r7i1p1f1 v20190302 +## a1sx r10i1p1f1 v20190308 +## +## We cannot simply substitute 'member' and 'version' with *, because Start() will only detect +## the first file path and use the string in the first file to represent the rest files. +## Therefore, the rest files cannot be found. That is, Start() will regard a1st and a1sx +## both have member 'r7i1p1f1' and version 'v20190302'. +## Fortunately, the expid only has one member and one version each. We can use * combined +## with the parameter 'path_glob_permissive' to make Start() work. + +## The * from the end in order represents: 1. member 2. version 3. 
member +repos <- paste0('/esarchive/exp/ecearth/$expid$/diags/CMIP/EC-Earth-Consortium/', + 'EC-Earth3/historical/*/Omon/$var$/gn/v*/', + '$var$_Omon_EC-Earth3_historical_*_gn_$year$.nc') +years <- paste0(c(1960:1961), '01-', c(1960:1961), '12') + +## The parameter 'path_glob_permissive' can be assigned with TRUE/FALSE or an integer. +## TRUE equals to 1, and FALSE equals to 0. The number means that how many layers from +## the end to preserve * for all files. That is, Start() won't replace * with the string +## found in the first file. Start() will use * to look for the pattern for each file. +## So, since the three * in above path are all different between a1st and a1sx, we need +## to preserve all of them. The furthest layer from the end is 6, so 'path_glob_permissive' +## should be 6. +data <- Start(dat = repos, + var = 'tosmean', + expid = c('a1st', 'a1sx'), + year = years, + time = indices(1), + region = indices(1), + path_glob_permissive = 6, + return_vars = list(time = NULL, region = NULL), + retrieve = T) + +## What if certain expid has more than one 'member' or 'version'? For example, +## -- expid -- -- member -- -- version -- +## a1st r7i1p1f1 v20190302 +## v20200302 +## a1sx r10i1p1f1 v20190308 + +## In this case, Start() will only catch 'v20190302' for a1st. This is the main difference +## between * and tag. Tag is the dimension of the output array, so it can save as many +## value as you want. On the other hand, * is not a dimension, so it can only represent one +## possibility for one file path. + + +## You may think of the usage of '_depends', which also tells Start() that one tag (i.e., +## dimension) is dependent on another one. However, '_depends' can only be used when the +## dependency is between two. In this case, we can define the dependency between 'expid' +## and 'member', and use * for 'version'. 
+repos <- paste0('/esarchive/exp/ecearth/$expid$/diags/CMIP/EC-Earth-Consortium/', + 'EC-Earth3/historical/$member$/Omon/$var$/gn/v*/', + '$var$_Omon_EC-Earth3_historical_*_gn_$year$.nc') + +## You can see that the layers contain * are the last two. Therefore, 'path_glob_permissive' +## is 2 here. +data <- Start(dat = repos, + var = 'tosmean', + expid = c('a1st', 'a1sx'), + year = years, + member = indices(1), + member_depends = 'expid', + time = indices(1), + region = indices(1), + path_glob_permissive = 2, + return_vars = list(time = NULL, region = NULL), + retrieve = T) + diff --git a/man/Start.Rd b/man/Start.Rd index f80f3bbd3b64184c990a9cc269200cb08d67e942..c41c9619f4489bb9f6655b4e3203cfb4616c532d 100644 --- a/man/Start.Rd +++ b/man/Start.Rd @@ -333,8 +333,8 @@ is FALSE.} When specifying a path pattern for a dataset, it might contain shell glob experissions. For each dataset, the first file matching the path pattern is found, and the found file is used to work out fixed values for the glob -expressions that will be used for all the files of the dataset. However in -some cases the values of the shell glob expressions may not be constant for +expressions that will be used for all the files of the dataset. However, in +some cases, the values of the shell glob expressions may not be constant for all files in a dataset, and they need to be worked out for each file involved.\cr\cr For example, a path pattern could be as follows: \cr @@ -356,7 +356,16 @@ For example, a path pattern could be as follows: \cr pattern with the original glob expressions in the 1st and 2nd levels (in the example, both asterisks would be preserved, thus would allow Start() to recognize files such as \cr - \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).} + \code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'}).\cr\cr +Note that each glob expression can only represent one possibility (Start() +chooses the first). 
This is because \code{*} is not a tag, which means it cannot +be a dimension of the output array. Therefore, only one possibility can be +adopted. For example, if \cr +\code{'/path/to/dataset/precipitation_*/19901101_*_foo.nc'}\cr +has two matches:\cr +\code{'/path/to/dataset/precipitation_xxx/19901101_yyy_foo.nc'} and\cr +\code{'/path/to/dataset/precipitation_zzz/19901101_yyy_foo.nc'},\cr +only the first found file will be used.} \item{retrieve}{A logical value indicating whether to retrieve the data defined in the Start() call or to explore only its dimension lengths diff --git a/tests/testthat/test-Start-path_glob_permissive.R b/tests/testthat/test-Start-path_glob_permissive.R new file mode 100644 index 0000000000000000000000000000000000000000..9e14c0252df03d7a1b9503a12fc2b219cc2baeac --- /dev/null +++ b/tests/testthat/test-Start-path_glob_permissive.R @@ -0,0 +1,90 @@ +context("Start() path_glob_permissive check") + +test_that("1. expid/member/version", { + +years <- paste0(c(1960:1961), '01-', c(1960:1961), '12') + +# from the end, each *: 1. member 2. version 3. 
member +repos <- paste0('/esarchive/exp/ecearth/$expid$/diags/CMIP/EC-Earth-Consortium/', + 'EC-Earth3/historical/*/Omon/$var$/gn/v*/', + '$var$_Omon_EC-Earth3_historical_*_gn_$year$.nc') + +data <- Start(dat = repos, + var = 'tosmean', + expid = c('a1st', 'a1sx'), + year = years, + time = indices(1), + region = indices(1), + path_glob_permissive = 6, #TRUE, + return_vars = list(time = NULL, region = NULL), + retrieve = T) + + + expect_equal( + dim(data), + c(dat = 1, var = 1, expid = 2, year = 2, time = 1, region = 1) + ) + expect_equal( + dim(attr(data, 'Files')), + c(dat = 1, var = 1, expid = 2, year = 2) + ) + expect_equal( + attr(data, 'Files'), + array(c("/esarchive/exp/ecearth/a1st/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r7i1p1f1/Omon/tosmean/gn/v20190302/tosmean_Omon_EC-Earth3_historical_r7i1p1f1_gn_196001-196012.nc", + "/esarchive/exp/ecearth/a1sx/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r10i1p1f1/Omon/tosmean/gn/v20190308/tosmean_Omon_EC-Earth3_historical_r10i1p1f1_gn_196001-196012.nc", + "/esarchive/exp/ecearth/a1st/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r7i1p1f1/Omon/tosmean/gn/v20190302/tosmean_Omon_EC-Earth3_historical_r7i1p1f1_gn_196101-196112.nc", + "/esarchive/exp/ecearth/a1sx/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r10i1p1f1/Omon/tosmean/gn/v20190308/tosmean_Omon_EC-Earth3_historical_r10i1p1f1_gn_196101-196112.nc"), + dim = c(dat = 1, var = 1, expid = 2, year = 2)) + ) + expect_equal( + data[1, 1, , , 1, 1], + matrix(c(18.60422, 17.13862, 18.52348, 17.21780), 2, 2), + tolerance = 0.0001 + ) + + +# from the end, each *: 1. member 2. 
version +repos <- paste0('/esarchive/exp/ecearth/$expid$/diags/CMIP/EC-Earth-Consortium/', + 'EC-Earth3/historical/$member$/Omon/$var$/gn/v*/', + '$var$_Omon_EC-Earth3_historical_*_gn_$year$.nc') + +data <- Start(dat = repos, + var = 'tosmean', + expid = c('a1st', 'a1sx'), + year = years, + member = indices(1), + member_depends = 'expid', + time = indices(1), + region = indices(1), + path_glob_permissive = 2, #TRUE, + return_vars = list(time = NULL, region = NULL), + retrieve = T) + + + expect_equal( + dim(data), + c(dat = 1, var = 1, expid = 2, year = 2, member = 1, time = 1, region = 1) + ) + expect_equal( + dim(attr(data, 'Files')), + c(dat = 1, var = 1, expid = 2, year = 2, member = 1) + ) + expect_equal( + attr(data, 'Files'), + array(c("/esarchive/exp/ecearth/a1st/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r7i1p1f1/Omon/tosmean/gn/v20190302/tosmean_Omon_EC-Earth3_historical_r7i1p1f1_gn_196001-196012.nc", + "/esarchive/exp/ecearth/a1sx/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r10i1p1f1/Omon/tosmean/gn/v20190308/tosmean_Omon_EC-Earth3_historical_r10i1p1f1_gn_196001-196012.nc", + "/esarchive/exp/ecearth/a1st/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r7i1p1f1/Omon/tosmean/gn/v20190302/tosmean_Omon_EC-Earth3_historical_r7i1p1f1_gn_196101-196112.nc", + "/esarchive/exp/ecearth/a1sx/diags/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r10i1p1f1/Omon/tosmean/gn/v20190308/tosmean_Omon_EC-Earth3_historical_r10i1p1f1_gn_196101-196112.nc"), + dim = c(dat = 1, var = 1, expid = 2, year = 2, member = 1)) + ) + expect_equal( + data[1, 1, , , 1, 1, 1], + matrix(c(18.60422, 17.13862, 18.52348, 17.21780), 2, 2), + tolerance = 0.0001 + ) + expect_equal( + attr(data, 'FileSelectors')$dat1$member, + list(a1st = 'r7i1p1f1', a1sx = 'r10i1p1f1') + ) + +})