From f6b2f3cadbee689f84cfa6e5eba0e0953a1aae1a Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 13 Jul 2020 22:48:14 +0200 Subject: [PATCH 1/5] Bugfix for first file missing case. The error happened when retrieving metadata. --- R/Start.R | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/R/Start.R b/R/Start.R index b23e0eb..54236e1 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3104,6 +3104,9 @@ Start <- function(..., # dim = indices/selectors, .message(progress_message, appendLF = FALSE) } } + +# NOTE: In .LoadDataFile(), metadata is saved in metadata_folder/1 or /2 etc. Before here, +# the path name is created in work_pieces but the path hasn't been built yet. if (num_procs == 1) { found_files <- lapply(work_pieces, .LoadDataFile, shared_matrix_pointer = shared_matrix_pointer, @@ -3264,7 +3267,13 @@ Start <- function(..., # dim = indices/selectors, # Load metadata and remove the metadata folder if (!is.null(metadata_dims)) { loaded_metadata_files <- list.files(metadata_folder) - loaded_metadata <- lapply(paste0(metadata_folder, '/', loaded_metadata_files), readRDS) + + if (!identical(loaded_metadata_files, character(0))) { # old code + loaded_metadata <- lapply(paste0(metadata_folder, '/', loaded_metadata_files), readRDS) + } else { + loaded_metadata <- NULL + } + unlink(metadata_folder, recursive = TRUE) return_metadata <- vector('list', length = prod(dim(array_of_metadata_flags)[metadata_dims])) return_metadata[as.numeric(loaded_metadata_files)] <- loaded_metadata @@ -3332,7 +3341,18 @@ Start <- function(..., # dim = indices/selectors, if (!silent) { .message("Successfully retrieved data.") } - var_backup <- attr(data_array, 'Variables')[[1]] + + if (all(sapply(attr(data_array, 'Variables'), is.null))) { + var_backup <- NULL + .warning(paste0("Metadata cannot be retrieved. The reason may be the ", + "non-existence of the first file. 
Use parameter 'metadata_dims'", + " to assign to file dimensions along which to return metadata, ", + "or check the existence of the first file.")) + } else { + zxc <- unlist(lapply(attr(data_array, 'Variables'), is.null)) + var_backup <- attr(data_array, 'Variables')[!zxc][[1]] + } + attr(data_array, 'Variables') <- NULL attributes(data_array) <- c(attributes(data_array), list(Variables = c(list(common = c(picked_common_vars, var_backup)), -- GitLab From 9e449d9ca60ffc892e1540fc6da490edc58111a3 Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 14 Jul 2020 09:59:17 +0200 Subject: [PATCH 2/5] Add unit test for first file missing case --- .../testthat/test-Start-first_file_missing.R | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 tests/testthat/test-Start-first_file_missing.R diff --git a/tests/testthat/test-Start-first_file_missing.R b/tests/testthat/test-Start-first_file_missing.R new file mode 100644 index 0000000..5f7a5a6 --- /dev/null +++ b/tests/testthat/test-Start-first_file_missing.R @@ -0,0 +1,123 @@ +context("Start() retrieves files that the first file is missing") + +# When some of the files are missing, Start() still can retrieve the data and +# put NA in those missing positions. However, when the first file is missing, +# Start() returned error before because of failing to find metadata. The bug is +# fixed now, so even the first file does not exist, Start() will still retrieve +# the data. The parameter 'metadata_dims' can also be used in this case. + +file <- "/esarchive/exp/ncep/cfs-v2/weekly_mean/s2s/$var$_f24h/$var$_$file_date$.nc" +var <- 'tas' +sdates1 <- c('20130611') #exists +sdates2 <- c('20130618') #does not exist +sdates3 <- c("20130611", "20130618") #1st exists, 2nd missing +sdates4 <- c("20130618", "20130611") #1st missing, 2nd exists + + +test_that("1. 
first file missing, no assign parameter 'metadata_dims'", { + +data <- Start(dat = file, + var = var, + file_date = sdates4, + time = indices(1:4), + latitude = values(list(20, 30)), + latitude_reorder = Sort(decreasing = TRUE), + longitude = values(list(-20, -10)), + longitude_reorder = CircularSort(-180, 180), + ensemble = indices(1), + synonims = list(latitude = c('lat', 'latitude'), + longitude = c('lon', 'longitude')), + return_vars = list(latitude = 'dat', + longitude = 'dat', + time = 'file_date'), + #metadata_dims = c('file_date'), + retrieve = T) + + expect_equal( + dim(data), + c(dat = 1, var = 1, file_date = 2, time = 4, latitude = 11, longitude = 11, ensemble = 1) + ) + expect_equal( + data[1, 1, , 1, 1, 1, 1], + c(NA, 293.9133), + tolerance = 0.0001 + ) + expect_equal( + length(data[is.na(data)]), + 484 + ) + expect_equal( + names(attr(data, 'Variables')$common), + 'time' + ) + +}) + +test_that("2. Use parameter 'metadata_dims'", { + +data <- Start(dat = file, + var = var, + file_date = sdates4, + time = indices(1:4), + latitude = values(list(20, 30)), + latitude_reorder = Sort(decreasing = TRUE), + longitude = values(list(-20, -10)), + longitude_reorder = CircularSort(-180, 180), + ensemble = indices(1), + synonims = list(latitude = c('lat', 'latitude'), + longitude = c('lon', 'longitude')), + return_vars = list(latitude = 'dat', + longitude = 'dat', + time = 'file_date'), + metadata_dims = c('file_date'), + retrieve = T) + + expect_equal( + dim(data), + c(dat = 1, var = 1, file_date = 2, time = 4, latitude = 11, longitude = 11, ensemble = 1) + ) + expect_equal( + data[1, 1, , 1, 1, 1, 1], + c(NA, 293.9133), + tolerance = 0.0001 + ) + expect_equal( + length(data[is.na(data)]), + 484 + ) + expect_equal( + names(attr(data, 'Variables')$common), + c('time', 'tas') + ) +}) + +test_that("3. 
Use parameter 'metadata_dims', all common attributes", { + +data <- Start(dat = file, + var = var, + file_date = sdates4, + time = indices(1:4), + latitude = values(list(20, 30)), + latitude_reorder = Sort(decreasing = TRUE), + longitude = values(list(-20, -10)), + longitude_reorder = CircularSort(-180, 180), + ensemble = indices(1), + synonims = list(latitude = c('lat', 'latitude'), + longitude = c('lon', 'longitude')), + return_vars = list(latitude = NULL, + longitude = NULL, + time = 'file_date'), + metadata_dims = c('file_date'), + retrieve = T) + + expect_equal( + names(attr(data, 'Variables')$common), + c('latitude', 'longitude', 'time', 'tas') + ) + expect_equal( + as.vector(attr(data, 'NotFoundFiles')), + c("/esarchive/exp/ncep/cfs-v2/weekly_mean/s2s/tas_f24h/tas_20130618.nc", NA) + ) + +}) + -- GitLab From a67438fb088dca54de29e3e9c61e490a4623b656 Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 14 Jul 2020 10:05:22 +0200 Subject: [PATCH 3/5] Add the fix in NEWS.md --- NEWS.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/NEWS.md b/NEWS.md index baf436e..8db9ba4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# startR v1.0.4 (Release date: 2020-) +- Bugfix for non-existent first file. Start() could not retrieve the data if the first file +is missing because the metadata is not found. The fix allows Start() to retrieve data without +metadata. To get the metadata from the following files, assign the parameter 'metadata_dims' +in Start(). + # startR v1.0.3 (Release date: 2020-06-19) - Bugfix for requiring the repetitive values from a single file when using 'merge_across_dims' and 'split_multiselected_dims'. 
The value positions were not -- GitLab From d62bed35db732ba2be004a633e31d8ce9aa587ce Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 14 Jul 2020 15:22:40 +0200 Subject: [PATCH 4/5] Add FAQ for first-file-missing case --- inst/doc/faq.md | 79 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/inst/doc/faq.md b/inst/doc/faq.md index 4ad94a0..5572da5 100644 --- a/inst/doc/faq.md +++ b/inst/doc/faq.md @@ -22,6 +22,7 @@ This document intends to be the first reference for any doubts that you may have 16. [Use parameter 'return_vars' in Start()](#16-use-parameter-return_vars-in-start) 17. [Use parameter 'split_multiselected_dims' in Start()](#17-use-parameter-split_multiselected_dims-in-start) 18. [Use glob expression '*' to define the file path](#18-use-glob-expression-to-define-the-file-path) + 19. [Get metadata when the first file does not exist](#19-get-metadata-when-the-first-file-does-not-exist) 2. **Something goes wrong...** @@ -692,6 +693,84 @@ This is different from the common definition of glob expression that tries to ex There is a parameter 'path_glob_permissive' in Start(). If set it to TRUE, the '*' in the filename itself will remain (i.e., as the common definition), while the ones in the path to the filename will still be replaced by the pattern in the first found file. +### 19. Get metadata when the first file does not exist +Start() can retrieve the data even if some of the files do not exist. The returned array will be filled with NA at the positions of missing files. +However, Start() retrieves the metadata from the first file of each data set by default. When the first file does not exist, the metadata cannot be found, so the returned array will lack the metadata of the variable. +In this case, Start() shows a warning: `Metadata cannot be retrieved. The reason may be the non-existence of the first file. 
Use parameter 'metadata_dims' to assign to file dimensions along which to return metadata, or check the existence of the first file.` + +To get the metadata, we can ensure the first file exists, or use the parameter 'metadata_dims' in Start(). + +(1) Ensure the first file exists +If the first file exists, you have no problem with metadata. You can check manually, or use a script like the one below: +```r + first.exists <- FALSE + n <- 1 + while(!first.exists) { + if(!file.exists(dir[n])) { + n <- n + 1 + } else { + first.exists <- TRUE + if (n > 1) { + init.year <- init.year + (n - 1) + all.years <- paste(strtoi(init.year):strtoi(end.year), sep = "") + warning(paste0("NEW INIT YEAR: ", init.year)) + } + } + } +``` + +(2) Use parameter 'metadata_dims' +This parameter expects to receive a vector of character strings with the names of the file dimensions for which to return metadata. +Start() by default returns the auxiliary data read for only the first file of each data set in the pattern dimension. +However, it can be configured to return the metadata for all the files along any set of file dimensions. The following example uses `metadata_dims = 'file_date'`, so even if the first file is missing, Start() can find the metadata in the second file. + +```r +file <- "/esarchive/exp/ncep/cfs-v2/weekly_mean/s2s/$var$_f24h/$var$_$file_date$.nc" +var <- 'tas' +sdates4 <- c("20130618", "20130611") #1st missing, 2nd exists + +dat1 <- Start(dat = file, + var = var, + file_date = sdates4, + time = indices(1:4), + latitude = values(list(20, 30)), + latitude_reorder = Sort(decreasing = TRUE), + longitude = values(list(-20, -10)), + longitude_reorder = CircularSort(-180, 180), + ensemble = indices(1), + synonims = list(latitude = c('lat', 'latitude'), + longitude = c('lon', 'longitude')), + return_vars = list(latitude = 'dat', + longitude = 'dat', + time = 'file_date'), + retrieve = T) + +# Check the attributes. 
There is no 'tas' metadata +names(attr(data, 'Variables')$common) +[1] "time" + +dat1 <- Start(dat = file, + var = var, + file_date = sdates4, + time = indices(1:4), + latitude = values(list(20, 30)), + latitude_reorder = Sort(decreasing = TRUE), + longitude = values(list(-20, -10)), + longitude_reorder = CircularSort(-180, 180), + ensemble = indices(1), + metadata_dims = 'file_date', + synonims = list(latitude = c('lat', 'latitude'), + longitude = c('lon', 'longitude')), + return_vars = list(latitude = 'dat', + longitude = 'dat', + time = 'file_date'), + retrieve = T) + +# Check the attributes. 'tas' metadata exists +names(attr(data, 'Variables')$common) +[1] "time" "tas" +``` + # Something goes wrong... -- GitLab From 2e52084c632d7fface3b326a1803e42f9ff6beea Mon Sep 17 00:00:00 2001 From: aho Date: Thu, 20 Aug 2020 19:44:16 +0200 Subject: [PATCH 5/5] Remove trash file --- R/.nfs0000000000216b150000002c | Bin 16384 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 R/.nfs0000000000216b150000002c diff --git a/R/.nfs0000000000216b150000002c b/R/.nfs0000000000216b150000002c deleted file mode 100644 index 279efb3cd9209d7600c28d21cf0410000c31bc30..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16384 zcmeHNTaX-886Lzdf`XL#=!v1&ovPW{kSM{VAlckZG&gqxXptCuy3fpUD#Ik&_idRcZExe#&K#dBb1|fJ?6g3!4M8E&^e`aS+HsAvfR@qbe zH@p4y`Okm8|JuFVS##U2Jz{`ZZxhqam(S=_iVpx+tx-S=xe(AqMwclWsn#B zb!*qX;JqiHaM~2Ob#{7odXBzn&%WS_8mt+p8F=X!xT>*k?G0;4-|Dxn64$=r)|XCo zt*K_9W}s%EW}s%EW}s%EW}s%EX5fFGfg;=7xD!^sj4i#H|G&)p|KI#Mz}Ii_@aGxd z$k#vS;s0VBuNCa)q=#Q%{5^dAYBs>xah~y=j0Yb6PsX=1{y7i-2jg29f5yYlF@7WC zYnTwX|KAz6?@4e^*TbJgO7*Tc^;{$9pUd-&fNzn$?LFR#9F`~Q{k+Zg|dhyTUf z|BQz}%lJNCKk@Q${eNcs5aaV6{wK!!jGyxGXBdA!;~QTwuK$mWJOAC|;eTM<#nqD@ z{(Ht<{JiCr;+I{c|4v1mimyzv(sO z`=4ffGvgof@Sij8{QFf8{~6q++U;R{ntr@5ps2Qjk zs2Qjk_>pMY$KoW)`8gXEAH5>or-Hc4SFgNU>yZ83u# z`pQ6$GscKi7owxf6!&Nm2$RWBbyYaJMO*Az(N^eLf(SuhVL_3rUQgyQVL=srdy=g> 
z$;v{)5_iZp?hWnLfX*yr8krV0&Sj?aLg=o@KntBX1vQ0AKm*7!B2p%cgMp0N;s6c- zK4oUl4uv`+T|pcVh>1j}s?cM0Y~MPy1C)vdogeMeH&7q+WFpf7=hkss+}@X|H5ux% zAP*wdOJG+R>O3M(uxpGgiprwu?ktz{N|%*6hD@`b=;hKFXWeX>rd9>8k)yX=NECUH z8te&UZ~&ELu{DgGCB>c~6SFcZK?W=n;-D&Ia@>AvX)jf5?JF~Tdk7Mott*ocdc_gl zp_@u0#R8o>AGI)u!!owcvF913n!>E|4mp`t!jMT5mpQoFzh$l^cF*l-A*NEfsku=E zh^nwD`W4>;LA=JXjA?&kQMo9?(#Ap2r$D9H40W1Hn-H6X>4TO?Afk<=)djMFQo;=2 z{={r*wqS+|`{IM8EQ#X^5UQZV88R;k!oEu7^v?ZT>2@*(M~5GhGF2gIw9$$cleiG6 zyxKd6Y1hu!1!zME78ISNgQ>8*4+9GUtN~Q~2?Y3Ltl<;{oHc`#<}%UP7TQeNK|tBO zj}&FE)fEK=5|OT>Mj$D{Vi0Q_t~zR@5VllP%tp~^@QN%yF+&7f`+hiCAxyy%3@HWCOD5N}4 z>Aa5T5etr;Nz){C7g?E0)FDZCk`b{Es3vM6lGKD{PI)F~ae6}}3e}}(p(sx%cVw!| zUcXJ=q4gSza*@a)u>0b+1PVnkAE-F!#Ih~+4Qo$p%zLgbLy$0QWmicz%%I)aTU)G29c9kNX^3NHm&7AtT-eZiE- zm8{G&jpM_1s<8nSqKr+UfrP{f0&QF&Q%9$04?7Dgqs$PZNV9w@qR_(@(9~6n&bPL$ zz}2+VE=w_G9qjD~^X{HhVi`6dBM0apsD#6F;n+~{^6eh|!`daAaji^@jO`f}A553{M|s!@)48)&FI4ay_#uGSr-HS*u1FxC&lE zd_o~R#zo~waM4|XI2h_k9=GwZG!?6(sQ7{DBh0DPW4hq32~Czx4q_YFD--TzT{;+# zSr=DUq3v&>RJMGe{@O4Zp!~*Sf%e8JZN{QLBZAS0P|A9kIDAh-8KTl&MmMiG=H$*$ z`_n=}<$z|I%BKiYjY$!~o7-uETLf~8=Eo*^mq#`p^WZJf&4WZz0TmeEu()au!O){( za@^bK%2rXC*-;CwMlXO7+-}^QNil%>U=+$5>BQG0Si~T9jFMI%=VccYqLOJiXd_6f zI6>1xq1Hk!Vywj|W+N@==&BRim4DluWBebXd` zk)Q+Es8!GM@F`%P(Te!hOt1SYln1_m9{kFQ6e?eBVlrYc`igr zvo#j#x=O|o*`~Y52GLi&KGu^(v0SwKVDgt!h!0cJ6985QbVM|;YGSGiAm&$!1t`S$ zghWAcun~aA8tyCTf6&^o-D$*pdWx7qln#uwVYv`e2}%c0xK}TrZF~PaXg8~^~3f(XcHF+5cpMr zG5Fo)B)u6toGBT~JMO>>er_54%J7CJ+Py>EEOyN7p4(0;@Wyj; Yl~evA8k*wz>%}UA7ldii6vrF?0e|(b%7 -- GitLab