diff --git a/NEWS.md b/NEWS.md
index 827fa583c8fbe4a0f58725b978af023fa2edf963..f1befcf65525b445a4f41b1bb9822043b1aeb9d1 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,11 @@
 # startR v1.0.2 (Release date: 2020-)
-- Bugfix for longitude transformation when the required grid point across the borders. The bug apprears at v1.0.0 and v1.0.1.
+- Bugfix for longitude transformation when the required grid points are across the borders. The bug appears in v1.0.0 and v1.0.1.
+- Add a new parameter 'merge_across_dims_narm' to Start(). If it is TRUE,
+the additional NAs in the across dimension will be removed. It is useful when
+a continuous time series is required, or when parameter 'split_multiselected_dims' is
+TRUE and the expected dimensions are supposed to have no NAs.
+- Bugfix for the possible mixed-dimension problem when 'split_multiselected_dims' and
+'merge_across_dims' are both used.
 
 # startR v1.0.1 (Release date: 2020-04-21)
 - Bugfix for global longitude across the borders.
diff --git a/R/Start.R b/R/Start.R
index 156ee53f905394ce7627e2e1da1f6f162e895175..02d2b4c5edb27dec589d80296386b9143ec531dc 100644
--- a/R/Start.R
+++ b/R/Start.R
@@ -20,6 +20,7 @@ Start <- function(..., # dim = indices/selectors,
                   metadata_dims = NULL,
                   selector_checker = SelectorChecker,
                   merge_across_dims = FALSE,
+                  merge_across_dims_narm = FALSE,
                   split_multiselected_dims = FALSE,
                   path_glob_permissive = FALSE,
                   retrieve = FALSE,
@@ -132,6 +133,17 @@ Start <- function(..., # dim = indices/selectors,
     stop("Parameter 'merge_across_dims' must be TRUE or FALSE.")
   }
 
+  # Check merge_across_dims_narm
+  if (!is.logical(merge_across_dims_narm)) {
+    stop("Parameter 'merge_across_dims_narm' must be TRUE or FALSE.")
+  }
+  if (!merge_across_dims && merge_across_dims_narm) {
+    merge_across_dims_narm <- FALSE
+    warning(paste0("Parameter 'merge_across_dims_narm' can only be TRUE when ",
+                   "'merge_across_dims' is TRUE. Setting 'merge_across_dims_narm'",
+                   " to FALSE."))
+  }
+
   # Leave alone the dimension parameters in the variable dim_params
   if (length(c(var_params_ind, dim_reorder_params_ind,
                tolerance_params_ind, depends_params_ind,
                across_params_ind)) > 0) {
@@ -2406,6 +2418,8 @@ print("-> THE INNER DIMENSION GOES ACROSS A FILE DIMENSION.")
               }
             } else {
               if (inner_dim %in% names(dim(sub_array_of_values))) {
+# NOTE: Put the across-inner-dim at the 1st position.
+# POSSIBLE PROBLEM: only the inner dim is reordered here; the remaining dims may not be in the same order as in sub_array_of_selectors below.
                 inner_dim_pos_in_sub_array <- which(names(dim(sub_array_of_values)) == inner_dim)
                 if (inner_dim_pos_in_sub_array != 1) {
                   new_sub_array_order <- (1:length(dim(sub_array_of_values)))[-inner_dim_pos_in_sub_array]
@@ -2414,6 +2428,9 @@ print("-> THE INNER DIMENSION GOES ACROSS A FILE DIMENSION.")
               }
             }
           }
+
+# NOTE: Put the across-inner-dim at the 1st position.
+# POSSIBLE PROBLEM: only the inner dim is reordered here; the remaining dims may not be in the same order as in sub_array_of_values above.
           inner_dim_pos_in_sub_array <- which(names(dim(sub_array_of_selectors)) == inner_dim)
           if (inner_dim_pos_in_sub_array != 1) {
             new_sub_array_order <- (1:length(dim(sub_array_of_selectors)))[-inner_dim_pos_in_sub_array]
@@ -2771,6 +2788,11 @@ print("-> PROCEEDING TO CROP VARIABLES")
                                        1:length(split_dims))
               }
               old_dim_pos <- which(names(final_dims_fake) == names(dim_params)[dim_param])
+
+# NOTE: Three steps to create the new dims:
+# 1st: put in the dims before split_dim;
+# 2nd: replace the old_dim with split_dims;
+# 3rd: put in the dims after split_dim.
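+# Example of the three steps (hypothetical dims): if final_dims_fake is
+# c(dat = 1, time = 12, lat = 64) and 'time' is split into
+# split_dims = c(sdate = 4, time = 3), the code below builds
+# c(dat = 1, sdate = 4, time = 3, lat = 64).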
              new_dims <- c()
              if (old_dim_pos > 1) {
                new_dims <- c(new_dims, final_dims_fake[1:(old_dim_pos - 1)])
@@ -2784,6 +2806,28 @@ print("-> PROCEEDING TO CROP VARIABLES")
              }
            }
          }
+        if (merge_across_dims_narm) {
+          # When only merge_across_dims is used, the 'time' dim length needs to be adjusted.
+          across_inner_dim <- inner_dims_across_files[[1]]  #TODO: more than one?
+          across_file_dim <- names(inner_dims_across_files)  #TODO: more than one?
+          # Get the length of each inner_dim ('time') along each file_dim ('file_date')
+          length_inner_across_dim <- lapply(dat[[i]][['selectors']][[across_inner_dim]][['fri']], length)
+
+          if (!split_multiselected_dims) {
+            final_dims_fake_name <- names(final_dims_fake)
+            pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim)
+            new_length_inner_dim <- sum(unlist(length_inner_across_dim))
+            if (pos_across_inner_dim != length(final_dims_fake)) {
+              final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)],
+                                   new_length_inner_dim,
+                                   final_dims_fake[(pos_across_inner_dim + 1):length(final_dims_fake)])
+            } else {
+              final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)],
+                                   new_length_inner_dim)
+            }
+            names(final_dims_fake) <- final_dims_fake_name
+          }
+        }
 
        if (!silent) {
          .message("Detected dimension sizes:")
@@ -2807,6 +2851,32 @@ print("-> PROCEEDING TO CROP VARIABLES")
                   indent = 2)
        }
 
+# NOTE: If split_multiselected_dims + merge_across_dims, the dim order may need to be changed.
+# The inner_dim needs to be the first dim among the split dims.
+# We cannot guarantee that the rest of the dims keep the same order...
+# We assume users provide the across inner and file dims in the same order.
+        if (split_multiselected_dims && merge_across_dims) {
+          # TODO: More than one split?
+          inner_dim_pos_in_split_dims <- which(names(all_split_dims[[1]]) == inner_dims_across_files)
+          # If inner_dim is not the first, change the order!
+          if (inner_dim_pos_in_split_dims != 1) {
+            split_dims <- c(split_dims[inner_dim_pos_in_split_dims],
+                            split_dims[1:length(split_dims)][-inner_dim_pos_in_split_dims])
+            split_dims_pos <- which(!is.na(match(names(final_dims_fake), names(split_dims))))
+            # Save the current final_dims_fake to reorder back later
+            final_dims_fake_output <- final_dims_fake
+            new_dims <- c()
+            if (split_dims_pos[1] != 1) {
+              new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)])
+            }
+            new_dims <- c(new_dims, split_dims)
+            if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) {
+              new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)])
+            }
+            final_dims_fake <- new_dims
+          }
+        }
+
  # The following several lines will only be run if retrieve = TRUE
  if (retrieve) {
@@ -3055,7 +3125,61 @@ print("-> WORK PIECES BUILT")
        }
      }
#print("P")
-    data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake)
+
+# NOTE: If merge_across_dims = TRUE, there might be additional NAs due to
+# unequal inner_dim ('time') lengths across file_dim ('file_date').
+# If merge_across_dims_narm = TRUE, the additional lines below remove these NAs.
+# TODO: For now it assumes there is only one '_across'. Add a for loop for the more-than-one case.
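+# Illustration (hypothetical sizes): with time_across over 30-day and 31-day
+# monthly files, the stored array is [time = 31, file_date = n]; for a 30-day
+# month only positions 1:30 hold data, so position 31 is NA. The block below
+# marks exactly those trailing blank positions and drops them.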
+    if (merge_across_dims_narm) {
+
+      # Get the length of these two dimensions in final_dims
+      length_inner_across_store_dims <- final_dims[across_inner_dim]
+      length_file_across_store_dims <- final_dims[across_file_dim]
+
+      # Create a logical array for merge_across_dims
+      logi_array <- array(rep(FALSE,
+                              length_file_across_store_dims * length_inner_across_store_dims),
+                          dim = c(length_inner_across_store_dims, length_file_across_store_dims))
+      for (i in 1:length_file_across_store_dims) {  # e.g., 1:4
+        logi_array[1:length_inner_across_dim[[i]], i] <- TRUE
+      }
+
+      # First, get the data array with the final_dims dimensions
+      data_array_final_dims <- array(bigmemory::as.matrix(data_array), dim = final_dims)
+
+      # Change the NAs that come from the additional blank positions to -9999, then remove these -9999.
+      # NOTE: -9999 is used as a sentinel; it assumes the data themselves do not contain -9999.
+      func_remove_blank <- function(data_array, logi_array) {
+        # dim(data_array) = [time, file_date]
+        # dim(logi_array) = [time, file_date]
+        # Change the blank positions from NA to -9999
+        data_array[which(!logi_array)] <- -9999
+        return(data_array)
+      }
+      data_array_final_dims <- multiApply::Apply(data_array_final_dims,
+                                                 target_dims = c(across_inner_dim, across_file_dim),  #c('time', 'file_date')
+                                                 output_dims = c(across_inner_dim, across_file_dim),
+                                                 fun = func_remove_blank,
+                                                 logi_array = logi_array)$output1
+      ## Reorder back to the correct dims
+      tmp <- match(names(final_dims), names(dim(data_array_final_dims)))
+      data_array_final_dims <- .aperm2(data_array_final_dims, tmp)
+      data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999]  # becomes a vector
+
+      data_array <- array(data_array_tmp, dim = final_dims_fake)
+
+    } else {  # merge_across_dims_narm = FALSE (original behavior)
+      data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake)
+    }
+
+# NOTE: If split_multiselected_dims + merge_across_dims, the dimension order may have been changed above.
+# To get the dim order the user asked for, we need to reorder the array again.
+    if (split_multiselected_dims && merge_across_dims) {
+      if (inner_dim_pos_in_split_dims != 1) {
+        correct_order <- match(names(final_dims_fake_output), names(final_dims_fake))
+        data_array <- .aperm2(data_array, correct_order)
+      }
+    }
+
      gc()

      # Load metadata and remove the metadata folder
@@ -3097,6 +3221,10 @@ print("-> WORK PIECES BUILT")
    }
  } # End if (retrieve)

+  # Change final_dims_fake back because retrieve = FALSE will use it for attributes later
+  if (exists("final_dims_fake_output")) {
+    final_dims_fake <- final_dims_fake_output
+  }
  # Replace the vars and common vars by the transformed vars and common vars
  for (i in 1:length(dat)) {
    if (length(names(transformed_vars[[i]])) > 0) {
diff --git a/R/Utils.R b/R/Utils.R
index 0d2b2fde7ed7d014c4344e9f1813f219d863fe1d..bdf8fda10a1d150113636b867a1de66d4627413c 100644
--- a/R/Utils.R
+++ b/R/Utils.R
@@ -30,10 +30,12 @@ chunk <- function(chunk, n_chunks, selectors) {
  if (length(chunk) != length(n_chunks)) {
    stop("Wrong chunk specification.")
  }
-  if (!is.null(attr(selectors, 'values'))) {
-    stop("Multidimensional chunking only available when selector ",
-         "values provided.")
-  }
+# NOTE: 1. Should this check belong to the condition above, not to the multidimensional selector case?
+#       2. It was '!is.null' before, but it should probably be 'is.null' (?)
+#  if (!is.null(attr(selectors, 'values'))) {
+#    stop("Multidimensional chunking only available when selector ",
+#         "values provided.")
+#  }
  if (is.null(dim(selectors))) {
    stop("Multidimensional chunking only available when multidimensional ",
         "selector values provided.")
diff --git a/inst/doc/faq.md b/inst/doc/faq.md
index cb15d9df352598de332af9c35e930290c2172af5..66df1e143ed50359f54bfb81da39edae336c2248 100644
--- a/inst/doc/faq.md
+++ b/inst/doc/faq.md
@@ -5,7 +5,7 @@ This document intends to be the first reference for any doubts that you may have
## Index
1. **How to**
   1. [Choose the number of chunks/jobs/cores in Compute()](#1-choose-the-number-of-chunksjobscores-in-compute)
-   2. [Merge/Reorder dimension in Start() (using parameter 'xxx_across' and 'merge_across_dims')](#2-mergereorder-dimension-in-start-using-parameter-xxx_across-and-merge_across_dims)
+   2. [Indicate dependent dimension and use merge parameters in Start()](#2-indicate-dependent-dimension-and-use-merge-parameters-in-start)
   3. [Use self-defined function in Compute()](#3-use-self-defined-function-in-compute)
   4. [Use package function in Compute()](#4-use-package-function-in-compute)
   5. [Do interpolation in Start() (using parameter 'transform')](#5-do-interpolation-in-start-using-parameter-transform)
@@ -20,6 +20,7 @@ This document intends to be the first reference for any doubts that you may have
  14. [Find the error log when jobs are launched on Power9](#14-find-the-error-log-when-jobs-are-launched-on-power9)
  15. [Specify extra function arguments in the workflow](#15-specify-extra-function-arguments-in-the-workflow)
  16. [Use parameter 'return_vars' in Start()](#16-use-parameter-return_vars-in-start)
+  17. [Use parameter 'split_multiselected_dims' in Start()](#17-use-parameter-split_multiselected_dims-in-start)

2. **Something goes wrong...**

@@ -38,22 +39,46 @@ Divide data into chunks according to the size of machine memory module (Power9 i
Find more details in practical_guide.md [How to choose the number of chunks, jobs and cores](inst/doc/practical_guide.md#how-to-choose-the-number-of-chunks-jobs-and-cores)

-### 2. Merge/Reorder dimension in Start() (using parameter 'xxx_across' and 'merge_across_dims')
-The parameter `'xxx_across = yyy'` indicates that the inner dimension 'xxx' is continuous along the file dimension 'yyy'. A common example is 'time_across = chunk', when the experiment runs through many years and the result is saved in several chunk files. Find more details in startR documentation.
+### 2. Indicate dependent dimension and use merge parameters in Start()
+The parameter `'xxx_across = yyy'` indicates that the inner dimension 'xxx' is continuous along the file dimension 'yyy'.
+A common example is 'time_across = chunk', when the experiment runs through many years
+and the result is saved in several chunk files.

-If you define this parameter, you can specify 'xxx' with the indices throughout the whole 'yyy' files, not only within one file. See Example 1 below, 'time = indices(1:24)' is available when 'time_across = chunk' is specified. If not, 'time' can only be 12 for most.
+If you indicate this dependent relation, you can specify 'xxx' with the indices
+throughout the whole 'yyy' files, instead of only within one file. See Example 1 below:
+'time = indices(1:24)' is available when 'time_across = chunk' is specified. If not, 'time' can be 12 at most.

-One example making advantage of 'xxx_across' is extracting an climate event across years, like El Niño.
If the event starts from Nov 2014 to May 2016 (19 months in total), simply specify 'time = indices(11:29)' (Example 2)
+One example taking advantage of 'xxx_across' is extracting a climate event across years, like El Niño.
+If the event starts from Nov 2014 to May 2016 (19 months in total), simply specify 'time = indices(11:29)' (Example 2).

-The thing you should bear in mind when using this parameter is the returned data structure. First, **the length of the return xxx dimension is the length of the longest xxx in all files**. Take the El Niño above as an example. The first chunk has 2 months, the second chunk has 12 months, and the third chunk has 5 months. Therefore, the length of time dimension will be 12, and the length of chunk dimension will be 3.
+The thing you should bear in mind when using this parameter is the returned data structure.
+First, **the length of the returned xxx dimension is the length of the longest xxx among all files**.
+Take the El Niño above as an example. The first chunk has 2 months, the second chunk has 12 months,
+and the third chunk has 5 months. Therefore, the length of the time dimension will be 12, and the length of the chunk dimension will be 3.

-Second, the way Start() store data is **put data at the left-most position**. Take the El Niño (Example 2) above as an example again. The first chunk has only 2 months, so position 1 and 2 have values (which are Nov and Dec 2014). The second chunk has 12 months, so all positions have values (Jan to Dec 2015), while position 3 to 12 will be NA. The third chunk has 5 months, so position 1 to 5 have values (which are Jan to May 2016), while position 6 to 12 will be NA.
+Second, the way Start() stores data is to **put the data at the left-most positions**.
+Take the El Niño (Example 2) above as an example again. The first chunk has only 2 months,
+so positions 1 and 2 have values (which are Nov and Dec 2014) while positions 3 to 12 will be NA.
+The second chunk has 12 months, so all positions have values (Jan to Dec 2015).
+The third chunk has 5 months, so positions 1 to 5 have values (which are Jan to May 2016), while positions 6 to 12 will be NA.

-It seems more reasonable to put NA at position 1 to 10 in first chunk (Jan to Oct 2014) and and position 6 to 12 in the third chunk (June to Dec 2016). But if the data is not continuous or picked irregularly , it is hard to judge the correct NA position (see Example 3).
+It seems more reasonable to put NAs at positions 1 to 10 in the first chunk (Jan to Oct 2014)
+and positions 6 to 12 in the third chunk (June to Dec 2016). But if the data are not continuous or are picked irregularly,
+it is hard to judge the correct NA positions (see Example 3).

-Since Start() is very flexible with any possible way to read-in data, it is difficult to include all the possibilities and make the output data structure reasonable all the time. Therefore, it is recommended to understand the way Start() rolls first, then you know what you should expect from the output and will not get confused with what it returns to you.
+Since Start() is very flexible with any possible way to read in data, it is difficult to cover
+all the possibilities and make the output data structure reasonable all the time.
+Therefore, it is recommended to understand the way Start() works first;
+then you know what to expect from the output and will not get confused by what it returns to you.

-As for parameter 'merge_across_dims', it decides whether to connect all 'xxx' together along 'yyy' or not. See Example 1.
If 'merge_across_dims = TRUE', the chunk dimension will disappear. 'merge_across_dims' simply attaches data one after another, so the NA values (if exist) will be the same places as the unmerged one (see Example 2).
+If you want to connect 'xxx' along 'yyy', the parameters 'merge_across_dims' and 'merge_across_dims_narm' can help you achieve it.
+See Example 1. If 'merge_across_dims = TRUE', the chunk dimension will disappear.
+'merge_across_dims' simply attaches the data one after another, so the NA values (if any) will be in the same places as in the unmerged output (see Example 2).
+
+If you want to remove those additional NAs, you can use 'merge_across_dims_narm = TRUE';
+then the NAs will be removed when the data are merged into one dimension (see Example 2).
+
+You can find more use cases at [ex1_2_exp_obs_attr.R](inst/doc/usecase/ex1_2_exp_obs_attr.R) and [ex1_3_attr_loadin.R](inst/doc/usecase/ex1_3_attr_loadin.R).

Example 1

@@ -134,6 +159,26 @@ data <- Start(dat = repos,
[11,] 297.9998 296.8002       NA
[12,] 299.4571 299.0254       NA

+# use merge parameters
+
+data <- Start(dat = repos,
+              var = 'tos',
+              memb = 'r24i1p1f1',
+              time = indices(4:27),  # Apr 1957 to Mar 1959
+              chunk = c('195701-195712', '195801-195812', '195901-195912'),
+              lat = 'all',
+              lon = 'all',
+              time_across = 'chunk',
+              merge_across_dims = TRUE,
+              merge_across_dims_narm = TRUE,
+              return_vars = list(lat = NULL, lon = NULL),
+              retrieve = TRUE)
+
+data[1,1,1,,100,100]
+ [1] 300.7398 299.6569 298.3954 297.1931 295.9608 295.4735 295.8538 297.9998
+ [9] 299.4571 300.7659 301.8241 301.6472 301.0621 299.1324 297.4028 296.1619
+[17] 295.2794 295.0474 295.4571 296.8002 299.0254 301.7128 301.4781 301.3807
+
```

Example 3: Read in three winters (DJF)

@@ -541,8 +586,73 @@ List of 1
 $ longitude: num [1:37(1d)] 10 10.3 10.6 10.8 11.1 ...
```

+### 17. Use parameter 'split_multiselected_dims' in Start()
+The selectors can be not only vectors but also multidimensional arrays. For instance, the 'time' dimension
+can be assigned a two-dimensional array `[sdate = 12, time = 31]`, which is the daily data of 12 months.
+You may want to have both 'sdate' and 'time' in the output dimensions, even though 'sdate' is not explicitly specified in Start().
+The parameter 'split_multiselected_dims' is for this goal. This situation is common when the experimental data attributes are used to retrieve the corresponding observational data.
+
+The following script is part of the use case [ex1_2_exp_obs_attr.R](inst/doc/usecase/ex1_2_exp_obs_attr.R).
+The time selector for the observational data comes from the experimental data above (omitted here). The selector has two dimensions.
+Notice that the dimension name, which is 'time' here, must also be one of the dimension names in the selector.
+
+The result dimensions include 'sdate' because it is split from 'time'. Meanwhile,
+'date' disappears because 'merge_across_dims = TRUE' (see more explanation at [How-to-#2](#2-indicate-dependent-dimension-and-use-merge-parameters-in-start)).
+
+```r
+# use the time attributes from the experimental data
+dates <- attr(exp, 'Variables')$common$time
+dim(dates)
+#sdate  time
+#    4     3
+
+obs <- Start(dat = repos_obs,
+             var = 'tas',
+             date = unique(format(dates, '%Y%m')),
+             time = values(dates),  #dim: [sdate = 4, time = 3]
+             time_across = 'date',
+             lat = 'all',
+             lon = 'all',
+             merge_across_dims = TRUE,
+             split_multiselected_dims = TRUE,
+             synonims = list(lat = c('lat', 'latitude'),
+                             lon = c('lon', 'longitude')),
+             return_vars = list(lon = NULL,
+                                lat = NULL,
+                                time = 'date'),
+             retrieve = FALSE)
+
+print(attr(obs, 'Dimensions'))
+#  dat   var sdate  time   lat   lon
+#    1     1     4     3   256   512
+```
+
+Notice that when 'split_multiselected_dims' and 'merge_across_dims' are both used,
+and the split selector has more than two dimensions, a potential
+mixed-dimension problem might occur. The following script is part of
+the use case [ex1_7_split_merge.R](inst/doc/usecase/ex1_7_split_merge.R).
+
+It is important to check that **the order of file_date is in line with the order of the dates dimensions!**
+Apart from 'time', which is explicitly specified in Start(), the vector should list 'sdate' first, then 'syear'.
+As you can see below, the first element '199607' is sdate = 1, and the second element '199612' is sdate = 2. If the order is wrong, you will still get
+returned data, but with mixed dimensions. Because 'sdate' and 'syear' are only
+implied in the given selectors, Start() cannot check whether the orders of file_date and dates are consistent.
+
+```r
+dates <- attr(hcst, 'Variables')$common$time
+dim(dates)
+#sdate syear  time
+#    2     3    12
+
+file_date <- sort(unique(gsub('-', '',
+                              sapply(as.character(dates), substr, 1, 7))))
+print(file_date)
+#[1] "199607" "199612" "199701" "199707" "199712" "199801" "199807" "199812"
+#[9] "199901"
+```
+

-## Something goes wrong...
+# Something goes wrong...

### 1. No space left on device
diff --git a/inst/doc/usecase.md b/inst/doc/usecase.md
index 35130e00a43345e730c26596da7c9a0e0e47c4d6..68bdbf3b8d6279eccbab5e3deaa6dcf47168c84a 100644
--- a/inst/doc/usecase.md
+++ b/inst/doc/usecase.md
@@ -17,7 +17,8 @@ In this document, you can link to the example scripts for various demands. For t
**'merge_across_dims'**, and **'split_multiselected_dims'**.
  3. [Use experimental data attribute to load in oberservational data](inst/doc/usecase/ex1_3_attr_loadin.R)
-     Load the experimental data first (with `retrieve = FALSE`), then retreive its dates and time attributes to use in the observational data load-in. It also shows how to use parameters `xxx_tolerance`, `xxx_across`, `merge_across_dims`, `split_multiselected_dims`.
+     Like ex1_2, it shows how to retrieve the experimental data and observational data
+in a comparable structure. It also shows how to use the parameters `xxx_tolerance`, `xxx_across`, `merge_across_dims`, `merge_across_dims_narm`, and `split_multiselected_dims`.
  4. [Checking impact of start date order in the number of members](inst/doc/usecase/ex1_4_variable_nmember.R)
     Mixing start dates of different months can lead to load different number of members, check the code provided and the [FAQ 10](/inst/doc/faq.md).
@@ -30,6 +31,9 @@ for more explanation.
  6. [Loading gridpoint data](inst/doc/usecase/ex1_6_gridpoint_data.R)
     **Start** can be used to load single point data by providing a vector of longitudes and latitudes. This use case also ilustrates how to reformat it to get a 'gridpoint' dimension.
+  7. 
[Use split and merge parameters together](inst/doc/usecase/ex1_7_split_merge.R)
+     This use case shows the points to note when the parameters 'split_multiselected_dims' and 'merge_across_dims' are both used.
+A problem may occur when the split selector has more than two dimensions. If you are not familiar with the usage of these parameters, please see use cases ex1_2 and ex1_3 first, which are less complicated. You can also go to FAQ How-to-#17 for more explanation.

2. **Execute computation (use `Compute()`)**
diff --git a/inst/doc/usecase/ex1_2_exp_obs_attr.R b/inst/doc/usecase/ex1_2_exp_obs_attr.R
index de1926e426d74ed719950c8ea785a1f2c458c47b..5a906f3ac2f4eecfb71eab374537290ee184c228 100644
--- a/inst/doc/usecase/ex1_2_exp_obs_attr.R
+++ b/inst/doc/usecase/ex1_2_exp_obs_attr.R
@@ -38,8 +38,8 @@ lats <- attr(exp, 'Variables')$common$lat
# The 'time' attribute is dependent on 'sdate'. You can see the dimension below.
dates <- attr(exp, 'Variables')$common$time
# dim(dates)
-#sdate ftime
-#    4     3
+#sdate  time
+#    4     3

#-------------------------------------------
diff --git a/inst/doc/usecase/ex1_3_attr_loadin.R b/inst/doc/usecase/ex1_3_attr_loadin.R
index ce37db5bc03fd3f301c280e2278e98bf511c6161..3dca648fa83851059679bb8356b28d7355e70c84 100644
--- a/inst/doc/usecase/ex1_3_attr_loadin.R
+++ b/inst/doc/usecase/ex1_3_attr_loadin.R
@@ -1,9 +1,22 @@
+#---------------------------------------------------------------------
+# This use case shows how to load experimental and observational data in a
+# consistent way.
+
+# First, load the experimental data. Then, use the time attributes of the
+# experimental data to define the selectors for the observational data.
+
+# You can see how to use the parameters '*_across', 'merge_across_dims', 'merge_across_dims_narm',
+# and 'split_multiselected_dims' to create the same dimension structure.
+#---------------------------------------------------------------------
+
+# experimental data
+# May to December 1994, monthly files with 6-hourly frequency
 repos <- paste0('/esarchive/exp/ecmwf/system4_m1/6hourly/',
                 '$var$/$var$_$sdate$.nc')
-
+ sdate <- sapply(1994, function(x) paste0(x, sprintf('%02d', 5:12), '01'))
+
 system4 <- Start(dat = repos,
                  var = 'tas',
-                 sdate = sapply(1994, function(x) paste0(x, sprintf('%02d', 5:12), '01')),
+                 sdate = sdate,
                  time = indices(seq(1, 124, 4)),  #first time step per day
                  ensemble = 'all',
                  latitude = indices(1:10),
@@ -20,38 +33,112 @@
# ------- retrieve the attributes for obs load-in ----------
dates <- attr(system4, 'Variables')$common$time
-#> dim(dates)
+# dim(dates)
#sdate  time
#    8    31
+
+# NOTE: Even though June, September, and November only have 30 days, it reads
+#       31 days for each month. Therefore, the last day of these months is the
+#       first day of the following month. There is no NA in this dates array.
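+# For example, the 31st 'time' entry of June (sdate = 2) is the 1st of July
+# (cf. the 'bonus' section below):
+# dates[2, 31]
+# [1] "1994-07-01 CEST"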
+
+# Manually retrieve the observation dates in the required format
 dates_file <- sort(unique(gsub('-', '',
                                sapply(as.character(dates), substr, 1, 7))))
-#> dates_file
+# dates_file
#[1] "199405" "199406" "199407" "199408" "199409" "199410" "199411" "199412"

# -----------------------------------------------------------
+# observational data
+# May to December 1994, monthly files with 6-hourly frequency
 repos_obs <- paste0('/esarchive/recon/ecmwf/erainterim/6hourly/',
                     '$var$/$var$_$file_date$.nc')

- erai <- Start(dat = repos_obs,
-               var = 'tas',
-               file_date = dates_file,
-               time = values(dates), #use dates from exp
-               latitude = indices(1:10),
-               longitude = indices(1:10),
-               time_var = 'time',
-               #because time is assigned by 'values', set the tolerance to avoid too distinct match
-               time_tolerance = as.difftime(1, units = 'hours'),
-               #time is defined by dates, which dimension is [sdate = 8, time = 200]
-               time_across = 'file_date',
-               return_vars = list(latitude = NULL,
-                                  longitude = NULL,
-                                  time = 'file_date'),
-               #combine time and file_date
-               merge_across_dims = TRUE,
-               #split time, because it is two-dimensional
-               split_multiselected_dims = TRUE)
+ erai <- Start(dat = repos_obs,
+               var = 'tas',
+               file_date = dates_file,
+               time = values(dates),
+               latitude = indices(1:10),
+               longitude = indices(1:10),
+               time_var = 'time',
+               #because time is assigned by 'values', set the tolerance to avoid matching too-distant values
+               time_tolerance = as.difftime(1, units = 'hours'),
+               #time values are across all the files
+               time_across = 'file_date',
+               #combine the time and file_date dims
+               merge_across_dims = TRUE,
+               #exclude the additional NAs generated by merge_across_dims
+               merge_across_dims_narm = TRUE,
+               #split the time dim, because it is two-dimensional
+               split_multiselected_dims = TRUE,
+               return_vars = list(latitude = NULL,
+                                  longitude = NULL,
+                                  time = 'file_date'),
+               retrieve = TRUE)
+
+# NOTE: 'merge_across_dims_narm = TRUE' is necessary because the observational
+#       data have unequal time lengths (30-day and 31-day months).
+#       If the NAs were not removed, unwanted NAs would exist and make the
+#       values misplaced in the array. See the 'bonus' below for more explanation.

#------- Check erai -----------
- attr(erai, 'Dimensions')
+dim(erai)
#  dat   var sdate  time latitude longitude
#    1     1     8    31       10        10
+
+erai[1, 1, 1, , 1, 1]
+# [1] 255.0120 256.8095 254.3654 254.6059 257.0551 255.5087 256.8167 257.9717
+# [9] 258.7491 259.2942 259.6682 260.7215 260.0988 261.2605 263.3590 265.6683
+#[17] 262.4813 262.6136 263.0591 262.8377 261.7276 263.9910 264.7755 266.0213
+#[25] 268.5927 267.8699 268.9210 269.4702 267.6735 267.9255 268.2216
+
+erai[1, 1, , 1, 2, 2]
+#[1] 254.5793 269.6221 274.5021 274.0269 269.5855 253.7458 243.9750 244.2415
+
+# NOTE: You will see that the observation array is consistent with the
+#       experimental one: the last day in 30-day months is the first day of
+#       the following month.
+erai[1, 1, 3, 1, 1, 1]  # 1st July
+#[1] 274.6019
+erai[1, 1, 2, 31, 1, 1]  # 1st July also, since June only has 30 days
+#[1] 274.6019
#------------------------------
+
+# The experimental and observational data are comparable, with the same structure.
+
+
+# //////////////////"BONUS"//////////////////////
+# Here is something more to show the usage of parameter 'merge_across_dims_narm'.
+# If the last day of 30-day months should be NA instead of the first day of the
+# following month, the NAs need to be kept in the array. In this case,
+# 'merge_across_dims_narm' should be FALSE.
+
+ dates <- attr(system4, 'Variables')$common$time
+ dates[2, 31]
+#[1] "1994-07-01 CEST"
+ dates[2, 31] <- NA  # Jun
+ dates[5, 31] <- NA  # Sep
+ dates[7, 31] <- NA  # Nov
+
+ erai <- Start(dat = repos_obs,
+               var = 'tas',
+               file_date = dates_file,
+               time = values(dates),
+               latitude = indices(1:10),
+               longitude = indices(1:10),
+               time_var = 'time',
+               time_across = 'file_date',
+               merge_across_dims = TRUE,
+               #keep the NAs of the last day in 30-day months
+               merge_across_dims_narm = FALSE,
+               split_multiselected_dims = TRUE,
+               return_vars = list(latitude = NULL,
+                                  longitude = NULL,
+                                  time = 'file_date'),
+               retrieve = TRUE)
+
+#------- Check erai -----------
+erai[1, 1, 2, , 1, 1]  # June
+# [1] 269.9410 269.6855 268.7380 268.5008 270.3236 271.5151 270.5046 270.1686
+# [9] 270.5395 272.0379 272.5489 271.1494 270.7764 270.5678 272.0331 273.7856
+#[17] 273.9849 274.5904 273.4369 273.8404 274.4068 274.2292 274.7375 275.5104
+#[25] 275.4324 274.9408 274.8679 276.5602 275.0995 274.6409       NA
#------------------------------
diff --git a/inst/doc/usecase/ex1_7_split_merge.R b/inst/doc/usecase/ex1_7_split_merge.R
new file mode 100644
index 0000000000000000000000000000000000000000..e72cceefb202eddd0dd11a90293c89d70b2c5bc2
--- /dev/null
+++ b/inst/doc/usecase/ex1_7_split_merge.R
@@ -0,0 +1,114 @@
+#---------------------------------------------------------------------
+# This use case shows the points to note when the parameters
+# 'split_multiselected_dims' and 'merge_across_dims' are both used.
+# A problem may occur when the split selector has more than two
+# dimensions.
+
+# If you're not familiar with the usage of these parameters, please see use cases ex1_2 and ex1_3 first, which are less complicated.
+# See FAQ How-to-#17 for more explanation.
+#---------------------------------------------------------------------
+
+library(startR)
+
+var_name <- 'sfcWind'
+
+# experimental data
+path.exp <- '/esarchive/exp/ecmwf/s2s-monthly_ensforhc/daily/$var$_f24h/$sdate$/$var$_$syear$.nc'
+
+hcst <- Start(dat = path.exp,
+              var = var_name,
+              sdate = c('20160704', '20161222'),
+              syear = indices(1:3),  #2016:2018
+              syear_depends = 'sdate',
+              time = indices(1:12),  #4th-15th Jul; 22nd Dec-2nd Jan
+              latitude = indices(1:10),
+              longitude = indices(1:10),
+              ensemble = 'all',
+              return_vars = list(latitude = 'dat',
+                                 longitude = 'dat',
+                                 time = c('sdate', 'syear')  #time depends on both sdate and syear
+                                 ),
+              retrieve = F)
+
+#-----------------------------------------------------------------------
+# The time attribute is going to be used in retrieving the observational data.
+# Because 'time' depends on both 'sdate' and 'syear', the dimension number
+# is 3.
+
+dates <- attr(hcst, 'Variables')$common$time
+dim(dates)
+#sdate syear  time
+#    2     3    12
+#-----------------------------------------------------------------------
+
+#-----------------------------------------------------------------------
+# These two lines should NOT be used (they are kept commented out on purpose)!
+# They show the potential problem when using 'split_multiselected_dims' and
+# 'merge_across_dims'. If 'dates' is reordered with 'syear' ahead of 'sdate',
+# while 'file_date' below remains the same, the result will have mixed dimensions.
+
+# library(s2dv)
+# dates <- Reorder(dates, c('syear', 'time', 'sdate'))
+#-----------------------------------------------------------------------
+
+#-----------------------------------------------------------------------
+# Use the 'dates' generated above to get the required files.
+# It is important to check that the order of file_date is in line with the
+# dates dimensions! Apart from 'time', the vector should list 'sdate' first,
+# then 'syear'. As you can see below, the first element '199607' is sdate = 1,
+# and the second element '199612' is sdate = 2.
+
+file_date <- sort(unique(gsub('-', '',
+                              sapply(as.character(dates), substr, 1, 7))))
+print(file_date)
+#[1] "199607" "199612" "199701" "199707" "199712" "199801" "199807" "199812"
+#[9] "199901"
+#-----------------------------------------------------------------------
+
+# observational data
+path.obs <- '/esarchive/recon/ecmwf/era5/1hourly/$var$/$var$_$file_date$.nc'
+
+obs <- Start(dat = path.obs,
+             var = var_name,
+             file_date = file_date,  # a vector
+             latitude = indices(1:10),
+             longitude = indices(1:10),
+             time = values(dates),  # a 3-dim array (sdate, syear, time)
+             time_across = 'file_date',
+             merge_across_dims = TRUE,
+             merge_across_dims_narm = TRUE,
+             split_multiselected_dims = TRUE,
+             synonims = list(latitude = c('lat', 'latitude'),
+                             longitude = c('lon', 'longitude')),
+             return_vars = list(latitude = 'dat',
+                                longitude = 'dat',
+                                time = 'file_date'),
+             retrieve = T)
+
+# check obs data
+dim(obs)
+#      dat       var  latitude longitude     sdate     syear      time
+#        1         1        10        10         2         3        12
+obs[1, 1, 1, 1, 2, 1:2, ]
+#         [,1]     [,2]     [,3]      [,4]     [,5]     [,6]     [,7]     [,8]
+#[1,] 4.402223 2.657466 7.296539 10.263944 6.367464 5.433421 3.021327 7.498292
+#[2,] 6.802123 7.110264 7.584915  4.255134 2.047740 3.619044 5.648496 8.322672
+#          [,9]    [,10]    [,11]    [,12]
+#[1,] 15.321060 1.131008 6.326981 5.301850
+#[2,]  7.942419 7.594263 6.189313 7.627579
+
+# check with ncdf4
+library(ncdf4)
+file199612 <- nc_open('/esarchive/recon/ecmwf/era5/1hourly/sfcWind/sfcWind_199612.nc')
+wind199612 <- ncvar_get(file199612, 'sfcWind')
+file199701 <- nc_open('/esarchive/recon/ecmwf/era5/1hourly/sfcWind/sfcWind_199701.nc')
+wind199701 <- ncvar_get(file199701, 'sfcWind')
+
+data_wanted_199612 <- seq(506, 722, 24)
+wind199612[1, 1, data_wanted_199612]
+# [1]  4.402223  2.657466  7.296539 10.263944  6.367464  5.433421  3.021327
+# [8]  7.498292 15.321060  1.131008
+data_wanted_199701 <- seq(2, 26, 24)
+wind199701[1, 1, data_wanted_199701]
+#[1] 6.326981 5.301850
+
diff --git a/tests/testthat/test-Start-split-merge.R b/tests/testthat/test-Start-split-merge.R
new file mode 100644
index 0000000000000000000000000000000000000000..a88be8ba073eb06adfc8793b13f3638ae2bf71cc
--- /dev/null
+++ b/tests/testthat/test-Start-split-merge.R
@@ -0,0 +1,144 @@
+context("Start() split + merge dim and value check")
+
+var_name <- 'sfcWind'
+path.exp <- '/esarchive/exp/ecmwf/s2s-monthly_ensforhc/daily/$var$_f24h/$sdate$/$var$_$syear$.nc'
+
+hcst <- Start(dat = path.exp,
+              var = var_name,
+              sdate = c('20160704', '20161222'),
+              syear = indices(1:3),  #2016:2018
+              syear_depends = 'sdate',
+              time = indices(1:12),  #4th-15th Jul; 22nd Dec-2nd Jan
+              latitude = indices(1:10),
+              longitude = indices(1:10),
+              ensemble = 'all',
+              return_vars = list(latitude = 'dat',
+                                 longitude = 'dat',
+                                 time = c('sdate', 'syear')),
+              retrieve = F,
+              silent = TRUE)
+dates <- attr(hcst, 'Variables')$common$time
+file_date <- sort(unique(gsub('-', '',
+                              sapply(as.character(dates), substr, 1, 7))))
+
+path.obs <- '/esarchive/recon/ecmwf/era5/1hourly/$var$/$var$_$file_date$.nc'
+
+
+test_that("1. 
split + merge + narm", {
+
+obs <- Start(dat = path.obs,
+             var = var_name,
+             file_date = file_date,  # a vector
+             latitude = indices(1:10),
+             longitude = indices(1:10),
+             time = values(dates),  # a 3-dim array (sdate, syear, time)
+             time_across = 'file_date',
+             merge_across_dims = TRUE,
+             merge_across_dims_narm = TRUE,
+             split_multiselected_dims = TRUE,
+             synonims = list(latitude = c('lat', 'latitude'),
+                             longitude = c('lon', 'longitude')),
+             return_vars = list(latitude = 'dat',
+                                longitude = 'dat',
+                                time = 'file_date'),
+             retrieve = T)
+
+  expect_equal(
+    dim(obs),
+    c(dat = 1, var = 1, latitude = 10, longitude = 10, sdate = 2, syear = 3, time = 12)
+  )
+  expect_equal(
+    obs[1, 1, 1, 1, 2, 2, 1:3],
+    c(6.802123, 7.110264, 7.584915),
+    tolerance = 0.0001
+  )
+  expect_equal(
+    mean(obs),
+    5.751444,
+    tolerance = 0.0001
+  )
+  expect_equal(
+    length(obs[which(is.na(obs))]),
+    0
+  )
+})
+
+
+test_that("2. no split + merge + narm", {
+
+obs <- Start(dat = path.obs,
+             var = var_name,
+             file_date = file_date,  # a vector
+             latitude = indices(1:10),
+             longitude = indices(1:10),
+             time = values(dates),  # a 3-dim array (sdate, syear, time)
+             time_across = 'file_date',
+             merge_across_dims = TRUE,
+             merge_across_dims_narm = TRUE,
+             split_multiselected_dims = FALSE,
+             synonims = list(latitude = c('lat', 'latitude'),
+                             longitude = c('lon', 'longitude')),
+             return_vars = list(latitude = 'dat',
+                                longitude = 'dat',
+                                time = 'file_date'),
+             retrieve = T)
+
+  expect_equal(
+    dim(obs),
+    c(dat = 1, var = 1, latitude = 10, longitude = 10, time = 72)
+  )
+  expect_equal(
+    obs[1, 1, 1, 1, 10:13],
+    c(4.076760, 7.644944, 4.589063, 4.402223),
+    tolerance = 0.0001
+  )
+  expect_equal(
+    mean(obs),
+    5.751444,
+    tolerance = 0.0001
+  )
+  expect_equal(
+    length(obs[which(is.na(obs))]),
+    0
+  )
+})
+
+
+test_that("3. no split + merge + no narm", {
+
+obs <- Start(dat = path.obs,
+             var = var_name,
+             file_date = file_date,  # a vector
+             latitude = indices(1:10),
+             longitude = indices(1:10),
+             time = values(dates),  # a 3-dim array (sdate, syear, time)
+             time_across = 'file_date',
+             merge_across_dims = TRUE,
+             merge_across_dims_narm = FALSE,
+             split_multiselected_dims = FALSE,
+             synonims = list(latitude = c('lat', 'latitude'),
+                             longitude = c('lon', 'longitude')),
+             return_vars = list(latitude = 'dat',
+                                longitude = 'dat',
+                                time = 'file_date'),
+             retrieve = T)
+
+  expect_equal(
+    dim(obs),
+    c(dat = 1, var = 1, latitude = 10, longitude = 10, time = 108)
+  )
+  expect_equal(
+    obs[1, 1, 1, 1, 10:13],
+    c(4.076760, 7.644944, 4.589063, 4.402223),
+    tolerance = 0.0001
+  )
+  expect_equal(
+    mean(obs, na.rm = T),
+    5.751444,
+    tolerance = 0.0001
+  )
+  expect_equal(
+    length(obs[which(is.na(obs))]),
+    3600
+  )
+})
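+
+# NOTE (a sanity check of the figures above, added for clarity): without narm,
+# the merged 'time' dim keeps 9 file_dates x 12 steps = 108 positions, while
+# only 72 of them hold data (2 sdate x 3 syear x 12 time, cf. test 2). The
+# remaining 108 - 72 = 36 all-NA time steps x 10 latitudes x 10 longitudes
+# give the 3600 NAs expected in test 3.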