From d1c94c1664cded5765b6f76448b6dfbd0cf71f1f Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 27 Apr 2020 17:43:12 +0200 Subject: [PATCH 01/23] Temporary fix for merge_across_dims & split_multiselected_dims removing NAs. --- R/Start.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/Start.R b/R/Start.R index 156ee53..8c76087 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3055,7 +3055,10 @@ print("-> WORK PIECES BUILT") } } #print("P") - data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake) +# NOTE: The three lines here are temporary solutions!! + data_array_tmp <- array(bigmemory::as.matrix(data_array)) + data_array_tmp <- data_array_tmp[which(!is.na(data_array_tmp))] + data_array <- array(data_array_tmp, dim = final_dims_fake) gc() # Load metadata and remove the metadata folder -- GitLab From 7c4d0b9998130b277910992d7ec90df734e5e8fc Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 29 Apr 2020 19:04:51 +0200 Subject: [PATCH 02/23] Add new parameter 'merge_across_dims_narm' and develop codes for the case 'merge_across_dims_narm = TRUE'. --- R/Start.R | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 4 deletions(-) diff --git a/R/Start.R b/R/Start.R index 8c76087..d5b8ecb 100644 --- a/R/Start.R +++ b/R/Start.R @@ -20,6 +20,7 @@ Start <- function(..., # dim = indices/selectors, metadata_dims = NULL, selector_checker = SelectorChecker, merge_across_dims = FALSE, + merge_across_dims_narm = FALSE, split_multiselected_dims = FALSE, path_glob_permissive = FALSE, retrieve = FALSE, @@ -132,6 +133,17 @@ Start <- function(..., # dim = indices/selectors, stop("Parameter 'merge_across_dims' must be TRUE or FALSE.") } + # Check merge_across_dims_narm + if (!is.logical(merge_across_dims_narm)) { + stop("Parameter 'merge_across_dims_narm' must be TRUE or FALSE.") + } + if (!merge_across_dims & merge_across_dims_narm) { + merge_across_dims_narm <- FALSE + warning(paste0("Parameter 'merge_across_dims_narm' can only be TRUE when ", + "'merge_across_dims' is TRUE. Set 'merge_across_dims_narm'", + " to FALSE.")) + } + # Leave alone the dimension parameters in the variable dim_params if (length(c(var_params_ind, dim_reorder_params_ind, tolerance_params_ind, depends_params_ind, across_params_ind)) > 0) { @@ -3055,10 +3067,74 @@ print("-> WORK PIECES BUILT") } } #print("P") -# NOTE: The three lines here are temporary solutions!! - data_array_tmp <- array(bigmemory::as.matrix(data_array)) - data_array_tmp <- data_array_tmp[which(!is.na(data_array_tmp))] - data_array <- array(data_array_tmp, dim = final_dims_fake) + +# NOTE: If merge_across_dims = TRUE, there might be additional NAs due to +# unequal inner_dim ('time') length across file_dim ('file_date'). +# If merge_across_dims_narm = TRUE, add additional lines to remove these NAs. +# TODO: Now it assumes that only one '_across'. Add a for loop for more-than-one case. + if (merge_across_dims_narm) { + + across_inner_dim <- inner_dims_across_files[[1]] #TODO: more than one? + across_file_dim <- names(inner_dims_across_files) #TODO: more than one? + # Get the length of each inner_dim ('time') along each file_dim ('file_date') + wp_inner_across_dim <- lapply(lapply(work_pieces, '[[', 'first_round_indices'), + '[[', across_inner_dim) + length_inner_across_dim <- lapply(wp_inner_across_dim, length) + + # Get the length of these two dimensions in final_dims + length_inner_across_store_dims <- final_dims[across_inner_dim] + length_file_across_store_dims <- final_dims[across_file_dim] + + # Create a logical array for merge_across_dims + logi_array <- array(rep(FALSE, + length_file_across_store_dims * length_inner_across_store_dims), + dim = c(length_inner_across_store_dims, length_file_across_store_dims)) + for (i in 1:length_file_across_store_dims) { #1:4 + logi_array[1:length_inner_across_dim[[i]], i] <- TRUE + } + + # First, get the data array with final_dims dimension + data_array_final_dims <- array(bigmemory::as.matrix(data_array), dim = final_dims) + + # Change the NA derived from additional spaces to -9999, then remove these -9999 + func_remove_blank <- function(data_array, logi_array) { + # dim(data_array) = [time, file_date] + # dim(logi_array) = [time, file_date] + # Change the blank spaces from NA to -9999 + data_array[which(!logi_array)] <- -9999 + return(data_array) + } + data_array_final_dims <- Apply(data_array_final_dims, + target_dims = c(across_inner_dim, across_file_dim), #c('time', 'file_date') + output_dims = c(across_inner_dim, across_file_dim), + fun = func_remove_blank, + logi_array = logi_array)$output1 + ## reorder back to the correct dim + tmp <- match(names(final_dims), names(dim(data_array_final_dims))) + data_array_final_dims <- s2dverification:::.aperm2(data_array_final_dims, tmp) + data_array_tmp <- data_array_final_dims[which(data_array_final_dims != -9999)] # become a vector + + if (!split_multiselected_dims) { + # only merge_across_dims -> the 'time' dim length needs to be adjusted + final_dims_fake_name <- names(final_dims_fake) + pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim) + new_length_inner_dim <- sum(unlist(length_inner_across_dim)) + if (pos_across_inner_dim != length(final_dims_fake)) { + final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], + new_length_inner_dim, + final_dims_fake[(pos_across_inner_dim + 1):length(final_dims_fake)]) + } else { + final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], + new_length_inner_dim) + } + names(final_dims_fake) <- final_dims_fake_name + } + data_array <- array(data_array_tmp, dim = final_dims_fake) + + } else { # merge_across_dims_narm = F (old version) + data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake) + } + gc() # Load metadata and remove the metadata folder -- GitLab From 5e763ea1655bb1ce3c4fe832293a46cc77c80cab Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 29 Apr 2020 19:30:09 +0200 Subject: [PATCH 03/23] Add multiApply in front of Apply() --- R/Start.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/Start.R b/R/Start.R index d5b8ecb..9939b51 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3104,7 +3104,7 @@ print("-> WORK PIECES BUILT") data_array[which(!logi_array)] <- -9999 return(data_array) } - data_array_final_dims <- Apply(data_array_final_dims, + data_array_final_dims <- multiApply::Apply(data_array_final_dims, target_dims = c(across_inner_dim, across_file_dim), #c('time', 'file_date') output_dims = c(across_inner_dim, across_file_dim), fun = func_remove_blank, -- GitLab From e5be2fe01ee3efb51f51def56ad55f20d379dfd8 Mon Sep 17 00:00:00 2001 From: aho Date: Thu, 30 Apr 2020 20:33:01 +0200 Subject: [PATCH 04/23] Fix to remain NAs, not removed along with -9999 --- R/Start.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/Start.R b/R/Start.R index 9939b51..1a50174 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3112,7 +3112,7 @@ print("-> WORK PIECES BUILT") ## reorder back to the correct dim tmp <- match(names(final_dims), names(dim(data_array_final_dims))) data_array_final_dims <- s2dverification:::.aperm2(data_array_final_dims, tmp) - data_array_tmp <- data_array_final_dims[which(data_array_final_dims != -9999)] # become a vector + data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector if (!split_multiselected_dims) { # only merge_across_dims -> the 'time' dim length needs to be adjusted -- GitLab From 097d080bf33225bc12412e49ef24037e72de9e67 Mon Sep 17 00:00:00 2001 From: aho Date: Thu, 30 Apr 2020 22:06:05 +0200 Subject: [PATCH 05/23] Move adjusted final_dims_fake toward, so the printed detected dimensions will be correct. --- R/Start.R | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/R/Start.R b/R/Start.R index 1a50174..fac7504 100644 --- a/R/Start.R +++ b/R/Start.R @@ -2796,6 +2796,27 @@ print("-> PROCEEDING TO CROP VARIABLES") } } } + if (merge_across_dims & !split_multiselected_dims) { + # only merge_across_dims -> the 'time' dim length needs to be adjusted + across_inner_dim <- inner_dims_across_files[[1]] #TODO: more than one? + across_file_dim <- names(inner_dims_across_files) #TODO: more than one? + # Get the length of each inner_dim ('time') along each file_dim ('file_date') length_inner_across_dim <- lapply(dat[[i]][['selectors']][[across_inner_dim]][['fri']], length) +print('length_inner_across_dim') +print(length_inner_across_dim) + + final_dims_fake_name <- names(final_dims_fake) + pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim) + new_length_inner_dim <- sum(unlist(length_inner_across_dim)) + if (pos_across_inner_dim != length(final_dims_fake)) { + final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], + new_length_inner_dim, + final_dims_fake[(pos_across_inner_dim + 1):length(final_dims_fake)]) + } else { + final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], + new_length_inner_dim) + } + names(final_dims_fake) <- final_dims_fake_name + } if (!silent) { .message("Detected dimension sizes:") @@ -3074,13 +3095,6 @@ print("-> WORK PIECES BUILT") # TODO: Now it assumes that only one '_across'. Add a for loop for more-than-one case. if (merge_across_dims_narm) { - across_inner_dim <- inner_dims_across_files[[1]] #TODO: more than one? - across_file_dim <- names(inner_dims_across_files) #TODO: more than one? - # Get the length of each inner_dim ('time') along each file_dim ('file_date') - wp_inner_across_dim <- lapply(lapply(work_pieces, '[[', 'first_round_indices'), - '[[', across_inner_dim) - length_inner_across_dim <- lapply(wp_inner_across_dim, length) - # Get the length of these two dimensions in final_dims length_inner_across_store_dims <- final_dims[across_inner_dim] length_file_across_store_dims <- final_dims[across_file_dim] @@ -3114,21 +3128,6 @@ print("-> WORK PIECES BUILT") data_array_final_dims <- s2dverification:::.aperm2(data_array_final_dims, tmp) data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector - if (!split_multiselected_dims) { - # only merge_across_dims -> the 'time' dim length needs to be adjusted - final_dims_fake_name <- names(final_dims_fake) - pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim) - new_length_inner_dim <- sum(unlist(length_inner_across_dim)) - if (pos_across_inner_dim != length(final_dims_fake)) { - final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], - new_length_inner_dim, - final_dims_fake[(pos_across_inner_dim + 1):length(final_dims_fake)]) - } else { - final_dims_fake <- c(final_dims_fake[1:(pos_across_inner_dim - 1)], - new_length_inner_dim) - } - names(final_dims_fake) <- final_dims_fake_name - } data_array <- array(data_array_tmp, dim = final_dims_fake) } else { # merge_across_dims_narm = F (old version) -- GitLab From 48acc384184151787b7b1d40178ce63c3ccb3754 Mon Sep 17 00:00:00 2001 From: aho Date: Thu, 30 Apr 2020 22:08:19 +0200 Subject: [PATCH 06/23] Typo fix. --- R/Start.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/Start.R b/R/Start.R index fac7504..8d56e09 100644 --- a/R/Start.R +++ b/R/Start.R @@ -2800,9 +2800,8 @@ print("-> PROCEEDING TO CROP VARIABLES") # only merge_across_dims -> the 'time' dim length needs to be adjusted across_inner_dim <- inner_dims_across_files[[1]] #TODO: more than one? across_file_dim <- names(inner_dims_across_files) #TODO: more than one? - # Get the length of each inner_dim ('time') along each file_dim ('file_date') length_inner_across_dim <- lapply(dat[[i]][['selectors']][[across_inner_dim]][['fri']], length) -print('length_inner_across_dim') -print(length_inner_across_dim) + # Get the length of each inner_dim ('time') along each file_dim ('file_date') +length_inner_across_dim <- lapply(dat[[i]][['selectors']][[across_inner_dim]][['fri']], length) final_dims_fake_name <- names(final_dims_fake) pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim) -- GitLab From cde7e1e5d214d99992cc90959b1d1605ac977f57 Mon Sep 17 00:00:00 2001 From: aho Date: Thu, 30 Apr 2020 22:16:41 +0200 Subject: [PATCH 07/23] Change the condition of final_dims_fake adjustment from merge_across_dims to merge_across_dims_narm --- R/Start.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/Start.R b/R/Start.R index 8d56e09..0e17136 100644 --- a/R/Start.R +++ b/R/Start.R @@ -2796,7 +2796,7 @@ print("-> PROCEEDING TO CROP VARIABLES") } } } - if (merge_across_dims & !split_multiselected_dims) { + if (merge_across_dims_narm & !split_multiselected_dims) { # only merge_across_dims -> the 'time' dim length needs to be adjusted across_inner_dim <- inner_dims_across_files[[1]] #TODO: more than one? across_file_dim <- names(inner_dims_across_files) #TODO: more than one? -- GitLab From 6c7be03438ce7ecb47f1c6d708ce0955e7685be4 Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 4 May 2020 10:39:26 +0200 Subject: [PATCH 08/23] Update NEW.md. Add new parameter 'merge_across_dims_narm' in Start(). --- NEWS.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 827fa58..84dad0f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # startR v1.0.2 (Release date: 2020-) -- Bugfix for longitude transformation when the required grid point across the borders. The bug apprears at v1.0.0 and v1.0.1. +- Bugfix for longitude transformation when the required grid point across the borders. The bug apprears at v1.0.0 and v1.0.1. +- Add one new parameter 'merge_across_dims_narm' in Start(). If it is TRUE, +the additional NAs in the across dimension will be removed. It is useful when +a continuous time series is required, or parameter 'split_multiselected_dims' is +TRUE and expected dimensions are supposed to have no NAs. # startR v1.0.1 (Release date: 2020-04-21) - Bugfix for global longitude across the borders. -- GitLab From 1621f614aae4f3555bf41813af3d90f5026e8fb8 Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 4 May 2020 23:10:06 +0200 Subject: [PATCH 09/23] Fix mixed dim problem when merge_across_dims and split_multiselected_dims are both used. --- R/Start.R | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/R/Start.R b/R/Start.R index 0e17136..bfd7a21 100644 --- a/R/Start.R +++ b/R/Start.R @@ -2418,6 +2418,8 @@ print("-> THE INNER DIMENSION GOES ACROSS A FILE DIMENSION.") } } else { if (inner_dim %in% names(dim(sub_array_of_values))) { +# NOTE: Put across-inner-dim at the 1st position. +# POSSIBLE PROB!! Only organize inner dim, the rest dims may not in the same order as sub_array_of_selectors below. inner_dim_pos_in_sub_array <- which(names(dim(sub_array_of_values)) == inner_dim) if (inner_dim_pos_in_sub_array != 1) { new_sub_array_order <- (1:length(dim(sub_array_of_values)))[-inner_dim_pos_in_sub_array] @@ -2426,6 +2428,9 @@ print("-> THE INNER DIMENSION GOES ACROSS A FILE DIMENSION.") } } } + +# NOTE: Put across-inner-dim at the 1st position. +# POSSIBLE PROB!! Only organize inner dim, the rest dims may not in the same order as sub_array_of_values above. inner_dim_pos_in_sub_array <- which(names(dim(sub_array_of_selectors)) == inner_dim) if (inner_dim_pos_in_sub_array != 1) { new_sub_array_order <- (1:length(dim(sub_array_of_selectors)))[-inner_dim_pos_in_sub_array] @@ -2783,10 +2788,28 @@ print("-> PROCEEDING TO CROP VARIABLES") 1:length(split_dims)) } old_dim_pos <- which(names(final_dims_fake) == names(dim_params)[dim_param]) + +# NOTE: Three steps to create new dims. +# 1st: Put in the dims before split_dim. +# 2nd: Replace the old_dim with split_dims. +# 3rd: Put in the dims after split_dim. new_dims <- c() if (old_dim_pos > 1) { new_dims <- c(new_dims, final_dims_fake[1:(old_dim_pos - 1)]) } + +# NOTE: If merge_across_dims, the dim order is changed. Put inner_dim the first. +# Cannot control the rest dims are in the same order or not... +# Suppose users put the same order of across inner and file dims. + if (merge_across_dims) { +# TODO: More than one split? + inner_dim_pos_in_split_dims <- which(names(split_dims) == inner_dims_across_files) + if (inner_dim_pos_in_split_dims != 1) { + split_dims <- c(split_dims[inner_dim_pos_in_split_dims], + split_dims[1:length(split_dims)][-inner_dim_pos_in_split_dims]) + } + } + new_dims <- c(new_dims, split_dims) if (old_dim_pos < length(final_dims_fake)) { new_dims <- c(new_dims, final_dims_fake[(old_dim_pos + 1):length(final_dims_fake)]) @@ -2796,13 +2819,14 @@ print("-> PROCEEDING TO CROP VARIABLES") } } } - if (merge_across_dims_narm & !split_multiselected_dims) { + if (merge_across_dims_narm) { # only merge_across_dims -> the 'time' dim length needs to be adjusted across_inner_dim <- inner_dims_across_files[[1]] #TODO: more than one? across_file_dim <- names(inner_dims_across_files) #TODO: more than one? # Get the length of each inner_dim ('time') along each file_dim ('file_date') -length_inner_across_dim <- lapply(dat[[i]][['selectors']][[across_inner_dim]][['fri']], length) + length_inner_across_dim <- lapply(dat[[i]][['selectors']][[across_inner_dim]][['fri']], length) + if (!split_multiselected_dims) { final_dims_fake_name <- names(final_dims_fake) pos_across_inner_dim <- which(final_dims_fake_name == across_inner_dim) new_length_inner_dim <- sum(unlist(length_inner_across_dim)) @@ -2816,6 +2840,7 @@ length_inner_across_dim <- lapply(dat[[i]][['selectors']][[across_inner_dim]][[' } names(final_dims_fake) <- final_dims_fake_name } + } if (!silent) { .message("Detected dimension sizes:") @@ -3124,7 +3149,7 @@ print("-> WORK PIECES BUILT") logi_array = logi_array)$output1 ## reorder back to the correct dim tmp <- match(names(final_dims), names(dim(data_array_final_dims))) - data_array_final_dims <- s2dverification:::.aperm2(data_array_final_dims, tmp) + data_array_final_dims <- .aperm2(data_array_final_dims, tmp) data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector data_array <- array(data_array_tmp, dim = final_dims_fake) -- GitLab From 13a508dd38b6289381345236e79f2caf8dd6b057 Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 5 May 2020 15:57:05 +0200 Subject: [PATCH 10/23] Add new param 'merge_across_dims_narm' and add value checks. --- inst/doc/usecase/ex1_3_attr_loadin.R | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/inst/doc/usecase/ex1_3_attr_loadin.R b/inst/doc/usecase/ex1_3_attr_loadin.R index ce37db5..f1970ec 100644 --- a/inst/doc/usecase/ex1_3_attr_loadin.R +++ b/inst/doc/usecase/ex1_3_attr_loadin.R @@ -40,18 +40,31 @@ time_var = 'time', #because time is assigned by 'values', set the tolerance to avoid too distinct match time_tolerance = as.difftime(1, units = 'hours'), - #time is defined by dates, which dimension is [sdate = 8, time = 200] + #time is defined by dates, which dimension is [sdate = 8, time = 31] time_across = 'file_date', return_vars = list(latitude = NULL, longitude = NULL, time = 'file_date'), #combine time and file_date merge_across_dims = TRUE, + #exclude the additional NAs generated by merge_across_dims + merge_across_dims_narm = TRUE, #split time, because it is two-dimensional - split_multiselected_dims = TRUE) + split_multiselected_dims = TRUE, + retrieve = TRUE) #------- Check erai ----------- - attr(erai, 'Dimensions') +dim(erai) # dat var sdate time latitude longitude # 1 1 8 31 10 10 + +erai[1, 1, 1, , 1, 1] +# [1] 255.0120 256.8095 254.3654 254.6059 257.0551 255.5087 256.8167 257.9717 +# [9] 258.7491 259.2942 259.6682 260.7215 260.0988 261.2605 263.3590 265.6683 +#[17] 262.4813 262.6136 263.0591 262.8377 261.7276 263.9910 264.7755 266.0213 +#[25] 268.5927 267.8699 268.9210 269.4702 267.6735 267.9255 268.2216 + +erai[1, 1, , 1, 2, 2] +#[1] 254.5793 269.6221 274.5021 274.0269 269.5855 253.7458 243.9750 244.2415 + #------------------------------ -- GitLab From bc6196ea1cf7c68d64c71eb10b48b7b93670053f Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 5 May 2020 15:59:20 +0200 Subject: [PATCH 11/23] Move adjusted split dim order part to after 'Detect dimension' print. Then reorder dimension back to the user-required order after getting array. --- R/Start.R | 48 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/R/Start.R b/R/Start.R index bfd7a21..0be57a0 100644 --- a/R/Start.R +++ b/R/Start.R @@ -2797,19 +2797,6 @@ print("-> PROCEEDING TO CROP VARIABLES") if (old_dim_pos > 1) { new_dims <- c(new_dims, final_dims_fake[1:(old_dim_pos - 1)]) } - -# NOTE: If merge_across_dims, the dim order is changed. Put inner_dim the first. -# Cannot control the rest dims are in the same order or not... -# Suppose users put the same order of across inner and file dims. - if (merge_across_dims) { -# TODO: More than one split? - inner_dim_pos_in_split_dims <- which(names(split_dims) == inner_dims_across_files) - if (inner_dim_pos_in_split_dims != 1) { - split_dims <- c(split_dims[inner_dim_pos_in_split_dims], - split_dims[1:length(split_dims)][-inner_dim_pos_in_split_dims]) - } - } - new_dims <- c(new_dims, split_dims) if (old_dim_pos < length(final_dims_fake)) { new_dims <- c(new_dims, final_dims_fake[(old_dim_pos + 1):length(final_dims_fake)]) @@ -2864,6 +2851,32 @@ print("-> PROCEEDING TO CROP VARIABLES") indent = 2) } +# NOTE: If split_multiselected_dims + merge_across_dims, the dim order may need to be changed. +# The inner_dim needs to be the first dim among split dims. +# Cannot control the rest dims are in the same order or not... +# Suppose users put the same order of across inner and file dims. + if (split_multiselected_dims & merge_across_dims) { + # TODO: More than one split? + inner_dim_pos_in_split_dims <- which(names(all_split_dims[[1]]) == inner_dims_across_files) + # if inner_dim is not the first, change! + if (inner_dim_pos_in_split_dims != 1) { + split_dims <- c(split_dims[inner_dim_pos_in_split_dims], + split_dims[1:length(split_dims)][-inner_dim_pos_in_split_dims]) + split_dims_pos <- which(!is.na(match(names(final_dims_fake), names(split_dims)))) + # Save the current final_dims_fake for later reorder back + final_dims_fake_output <- final_dims_fake + new_dims <- c() + if (split_dims_pos[1] != 1) { + new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)]) + } + new_dims <- c(new_dims, split_dims) + if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) { + new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)]) + } + final_dims_fake <- new_dims + } + } + # The following several lines will only be run if retrieve = TRUE if (retrieve) { @@ -3158,6 +3171,15 @@ print("-> WORK PIECES BUILT") data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake) } +# NOTE: If split_multiselected_dims + merge_across_dims, the dimension order may change above. +# To get the user-required dim order, we need to reorder the array again. + if (split_multiselected_dims & merge_across_dims) { + if (inner_dim_pos_in_split_dims != 1) { + correct_order <- match(names(final_dims_fake_output), names(final_dims_fake)) + data_array <- .aperm2(data_array, correct_order) + } + } + gc() # Load metadata and remove the metadata folder -- GitLab From a805910b32accd9bf75de1c0198fd1a117020802 Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 5 May 2020 18:30:37 +0200 Subject: [PATCH 12/23] Change final_dims_fake back after reoder array because if retrieve = F, it will be used in attributes later --- R/Start.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/Start.R b/R/Start.R index 0be57a0..e80e6df 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3221,6 +3221,9 @@ print("-> WORK PIECES BUILT") } # End if (retrieve) + # Change final_dims_fake back because retrieve = FALSE will use it for attributes later + final_dims_fake <- final_dims_fake_output + # Replace the vars and common vars by the transformed vars and common vars for (i in 1:length(dat)) { if (length(names(transformed_vars[[i]])) > 0) { -- GitLab From 027530e9b2f387de845d498fa3fb683f3710da7f Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 5 May 2020 18:34:10 +0200 Subject: [PATCH 13/23] Amend usecase ex1_3. Add complete explanation and details about 'merge_across_dims_narm'. --- inst/doc/usecase.md | 3 +- inst/doc/usecase/ex1_3_attr_loadin.R | 124 +++++++++++++++++++++------ 2 files changed, 101 insertions(+), 26 deletions(-) diff --git a/inst/doc/usecase.md b/inst/doc/usecase.md index 35130e0..9ec2572 100644 --- a/inst/doc/usecase.md +++ b/inst/doc/usecase.md @@ -17,7 +17,8 @@ In this document, you can link to the example scripts for various demands. For t **'merge_across_dims'**, and **'split_multiselected_dims'**. 3. [Use experimental data attribute to load in oberservational data](inst/doc/usecase/ex1_3_attr_loadin.R) - Load the experimental data first (with `retrieve = FALSE`), then retreive its dates and time attributes to use in the observational data load-in. It also shows how to use parameters `xxx_tolerance`, `xxx_across`, `merge_across_dims`, `split_multiselected_dims`. + Like ex1_2, it shows how to retrieve the experimental data and observational data +in a comparable structure. It also shows how to use parameters `xxx_tolerance`, `xxx_across`, `merge_across_dims`, `merge_across_dims_narm`, and `split_multiselected_dims`. 4. [Checking impact of start date order in the number of members](inst/doc/usecase/ex1_4_variable_nmember.R) Mixing start dates of different months can lead to load different number of members, check the code provided and the [FAQ 10](/inst/doc/faq.md). diff --git a/inst/doc/usecase/ex1_3_attr_loadin.R b/inst/doc/usecase/ex1_3_attr_loadin.R index f1970ec..3dca648 100644 --- a/inst/doc/usecase/ex1_3_attr_loadin.R +++ b/inst/doc/usecase/ex1_3_attr_loadin.R @@ -1,9 +1,22 @@ +#--------------------------------------------------------------------- +# This usecase shows you how to load experimental and observational data in a +# consistent way. + +# First, load the experimental data. Then, use the time attributes of experimental data to define the selectors for observational data. + +# You can see how to use parameter '*_across', 'merge_across_dims', 'merge_across_dims_narm', +# and 'split_multiselected_dims' to create the same dimension structure. +#--------------------------------------------------------------------- + +# experimental data +# May to December 1994, monthly file with 6-hourly frequency repos <- paste0('/esarchive/exp/ecmwf/system4_m1/6hourly/', '$var$/$var$_$sdate$.nc') - + sdate <- sapply(1994, function(x) paste0(x, sprintf('%02d', 5:12), '01')) + system4 <- Start(dat = repos, var = 'tas', - sdate = sapply(1994, function(x) paste0(x, sprintf('%02d', 5:12), '01')), + sdate = sdate, time = indices(seq(1, 124, 4)), #first time step per day ensemble = 'all', latitude = indices(1:10), @@ -20,38 +33,51 @@ # ------- retrieve the attributes for obs load-in ---------- dates <- attr(system4, 'Variables')$common$time -#> dim(dates) +# dim(dates) #sdate time # 8 31 + +# NOTE: Even though June, September, and November only have 30 days, it reads +# 31 days for each month. Therefore, the last day of these months is the +# first day of the following month. No NA in this dates array. + +# Manually retrieve the observation dates in the required format dates_file <- sort(unique(gsub('-', '', sapply(as.character(dates), substr, 1, 7)))) -#> dates_file +# dates_file #[1] "199405" "199406" "199407" "199408" "199409" "199410" "199411" "199412" # ----------------------------------------------------------- +# observational data +# May to December 1994, monthly file with 6-hourly frequency repos_obs <- paste0('/esarchive/recon/ecmwf/erainterim/6hourly/', '$var$/$var$_$file_date$.nc') - erai <- Start(dat = repos_obs, - var = 'tas', - file_date = dates_file, - time = values(dates), #use dates from exp - latitude = indices(1:10), - longitude = indices(1:10), - time_var = 'time', - #because time is assigned by 'values', set the tolerance to avoid too distinct match - time_tolerance = as.difftime(1, units = 'hours'), - #time is defined by dates, which dimension is [sdate = 8, time = 31] - time_across = 'file_date', - return_vars = list(latitude = NULL, - longitude = NULL, - time = 'file_date'), - #combine time and file_date - merge_across_dims = TRUE, - #exclude the additional NAs generated by merge_across_dims - merge_across_dims_narm = TRUE, - #split time, because it is two-dimensional - split_multiselected_dims = TRUE, - retrieve = TRUE) + erai <- Start(dat = repos_obs, + var = 'tas', + file_date = dates_file, + time = values(dates), + latitude = indices(1:10), + longitude = indices(1:10), + time_var = 'time', + #because time is assigned by 'values', set the tolerance to avoid too distinct match + time_tolerance = as.difftime(1, units = 'hours'), + #time values are across all the files + time_across = 'file_date', + #combine time and file_date dims + merge_across_dims = TRUE, + #exclude the additional NAs generated by merge_across_dims + merge_across_dims_narm = TRUE, + #split time dim, because it is two-dimensional + split_multiselected_dims = TRUE, + return_vars = list(latitude = NULL, + longitude = NULL, + time = 'file_date'), + retrieve = TRUE) + +# NOTE: 'merge_across_dims_narm = TRUE' is necessary because the observational +# data have unequal time length of 30-day and 31-day months. +# If the NAs are not removed, unwanted NAs will exist and make the +# values misplaced in the array. See 'bonus' below for more explanation. #------- Check erai ----------- dim(erai) @@ -67,4 +93,52 @@ erai[1, 1, 1, , 1, 1] erai[1, 1, , 1, 2, 2] #[1] 254.5793 269.6221 274.5021 274.0269 269.5855 253.7458 243.9750 244.2415 +# NOTE: You will see that the observation array is the same as experiment one +# that the last day in 30-day months are the first day of the following +# month. +erai[1, 1, 3, 1, 1, 1] # 1st March +#[1] 274.6019 +erai[1, 1, 2, 31, 1, 1] # 1st March also, since June only has 30 days +#[1] 274.6019 +#------------------------------ + +# The experimental and observational data are comparable with same structure. + + +# //////////////////"BONUS"////////////////////// +# Here is something more to show the usage of parameter 'merge_across_dims_narm'. +# If the last day of 30-day months is NA instead of the first day of the following month, +# NAs are needed to exist in the array. In this case, 'merge_across_dims_narm' +# should be FALSE. + + dates <- attr(system4, 'Variables')$common$time + dates[2, 31] +#[1] "1994-07-01 CEST" + dates[2, 31] <- NA # Jun + dates[5, 31] <- NA # Sep + dates[7, 31] <- NA # Nov + + erai <- Start(dat = repos_obs, + var = 'tas', + file_date = dates_file, + time = values(dates), + latitude = indices(1:10), + longitude = indices(1:10), + time_var = 'time', + time_across = 'file_date', + merge_across_dims = TRUE, + #keep NAs of the last day in 30-day months + merge_across_dims_narm = FALSE, + split_multiselected_dims = TRUE, + return_vars = list(latitude = NULL, + longitude = NULL, + time = 'file_date'), + retrieve = TRUE) + +#------- Check erai ----------- +erai[1, 1, 2, , 1, 1] # June +# [1] 269.9410 269.6855 268.7380 268.5008 270.3236 271.5151 270.5046 270.1686 +# [9] 270.5395 272.0379 272.5489 271.1494 270.7764 270.5678 272.0331 273.7856 +#[17] 273.9849 274.5904 273.4369 273.8404 274.4068 274.2292 274.7375 275.5104 +#[25] 275.4324 274.9408 274.8679 276.5602 275.0995 274.6409 NA #------------------------------ -- GitLab From 2d7bf79c4c39333e2f5bc26804d9215518668285 Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 5 May 2020 19:10:26 +0200 Subject: [PATCH 14/23] Add condition to changing final_dims_fake back. --- R/Start.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/Start.R b/R/Start.R index e80e6df..02d2b4c 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3222,8 +3222,9 @@ print("-> WORK PIECES BUILT") } # End if (retrieve) # Change final_dims_fake back because retrieve = FALSE will use it for attributes later + if (exists("final_dims_fake_output")) { final_dims_fake <- final_dims_fake_output - + } # Replace the vars and common vars by the transformed vars and common vars for (i in 1:length(dat)) { if (length(names(transformed_vars[[i]])) > 0) { -- GitLab From a6d985f0f8984ff4cf1e344ffcc0b226ede66f8c Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 5 May 2020 22:44:17 +0200 Subject: [PATCH 15/23] Amend FAQ how-to-2 for new parameter 'merge_across_dims_narm'. --- inst/doc/faq.md | 63 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 10 deletions(-) diff --git a/inst/doc/faq.md b/inst/doc/faq.md index cb15d9d..ccd4686 100644 --- a/inst/doc/faq.md +++ b/inst/doc/faq.md @@ -5,7 +5,7 @@ This document intends to be the first reference for any doubts that you may have ## Index 1. **How to** 1. [Choose the number of chunks/jobs/cores in Compute()](#1-choose-the-number-of-chunksjobscores-in-compute) - 2. [Merge/Reorder dimension in Start() (using parameter 'xxx_across' and 'merge_across_dims')](#2-mergereorder-dimension-in-start-using-parameter-xxx_across-and-merge_across_dims) + 2. [Indicate dependent dimension and use merge parameters in Start()](#2-indicate-dependent-dimension-and-use-merge-parameters-in-start) 3. [Use self-defined function in Compute()](#3-use-self-defined-function-in-compute) 4. [Use package function in Compute()](#4-use-package-function-in-compute) 5. [Do interpolation in Start() (using parameter 'transform')](#5-do-interpolation-in-start-using-parameter-transform) @@ -38,22 +38,44 @@ Divide data into chunks according to the size of machine memory module (Power9 i Find more details in practical_guide.md [How to choose the number of chunks, jobs and cores](inst/doc/practical_guide.md#how-to-choose-the-number-of-chunks-jobs-and-cores) -### 2. Merge/Reorder dimension in Start() (using parameter 'xxx_across' and 'merge_across_dims') -The parameter `'xxx_across = yyy'` indicates that the inner dimension 'xxx' is continuous along the file dimension 'yyy'. A common example is 'time_across = chunk', when the experiment runs through many years and the result is saved in several chunk files. Find more details in startR documentation. +### 2. Indicate dependent dimension and use merge parameters in Start() +The parameter `'xxx_across = yyy'` indicates that the inner dimension 'xxx' is continuous along the file dimension 'yyy'. +A common example is 'time_across = chunk', when the experiment runs through many years +and the result is saved in several chunk files. -If you define this parameter, you can specify 'xxx' with the indices throughout the whole 'yyy' files, not only within one file. See Example 1 below, 'time = indices(1:24)' is available when 'time_across = chunk' is specified. If not, 'time' can only be 12 for most. +If you indicate this dependent relation, you can specify 'xxx' with the indices +throughout the whole 'yyy' files, instead of only within one file. See Example 1 below, +'time = indices(1:24)' is available when 'time_across = chunk' is specified. If not, 'time' can only be 12 for most. -One example making advantage of 'xxx_across' is extracting an climate event across years, like El Niño. If the event starts from Nov 2014 to May 2016 (19 months in total), simply specify 'time = indices(11:29)' (Example 2) +One example taking advantage of 'xxx_across' is extracting an climate event across years, like El Niño. +If the event starts from Nov 2014 to May 2016 (19 months in total), simply specify 'time = indices(11:29)' (Example 2). -The thing you should bear in mind when using this parameter is the returned data structure. First, **the length of the return xxx dimension is the length of the longest xxx in all files**. Take the El Niño above as an example. The first chunk has 2 months, the second chunk has 12 months, and the third chunk has 5 months. Therefore, the length of time dimension will be 12, and the length of chunk dimension will be 3. +The thing you should bear in mind when using this parameter is the returned data structure. +First, **the length of the return xxx dimension is the length of the longest xxx in all files**. +Take the El Niño above as an example. The first chunk has 2 months, the second chunk has 12 months, +and the third chunk has 5 months. Therefore, the length of time dimension will be 12, and the length of chunk dimension will be 3. -Second, the way Start() store data is **put data at the left-most position**. Take the El Niño (Example 2) above as an example again. The first chunk has only 2 months, so position 1 and 2 have values (which are Nov and Dec 2014). The second chunk has 12 months, so all positions have values (Jan to Dec 2015), while position 3 to 12 will be NA. The third chunk has 5 months, so position 1 to 5 have values (which are Jan to May 2016), while position 6 to 12 will be NA. +Second, the way Start() store data is **put data at the left-most position**. + Take the El Niño (Example 2) above as an example again. The first chunk has only 2 months, +so position 1 and 2 have values (which are Nov and Dec 2014). The second chunk has 12 months, +so all positions have values (Jan to Dec 2015), while position 3 to 12 will be NA. +The third chunk has 5 months, so position 1 to 5 have values (which are Jan to May 2016), while position 6 to 12 will be NA. -It seems more reasonable to put NA at position 1 to 10 in first chunk (Jan to Oct 2014) and and position 6 to 12 in the third chunk (June to Dec 2016). But if the data is not continuous or picked irregularly , it is hard to judge the correct NA position (see Example 3). +It seems more reasonable to put NA at position 1 to 10 in first chunk (Jan to Oct 2014) +and and position 6 to 12 in the third chunk (June to Dec 2016). But if the data is not continuous or picked irregularly , +it is hard to judge the correct NA position (see Example 3). -Since Start() is very flexible with any possible way to read-in data, it is difficult to include all the possibilities and make the output data structure reasonable all the time. Therefore, it is recommended to understand the way Start() rolls first, then you know what you should expect from the output and will not get confused with what it returns to you. +Since Start() is very flexible with any possible way to read-in data, it is difficult to include +all the possibilities and make the output data structure reasonable all the time. +Therefore, it is recommended to understand the way Start() rolls first, +then you know what you should expect from the output and will not get confused with what it returns to you. -As for parameter 'merge_across_dims', it decides whether to connect all 'xxx' together along 'yyy' or not. See Example 1. If 'merge_across_dims = TRUE', the chunk dimension will disappear. 'merge_across_dims' simply attaches data one after another, so the NA values (if exist) will be the same places as the unmerged one (see Example 2). +If you want to connet xxx along yyy, the parameter 'merge_across_dims' and 'merge_across_dims_narm' can help you achieve it. +See Example 1. If 'merge_across_dims = TRUE', the chunk dimension will disappear. +'merge_across_dims' simply attaches data one after another, so the NA values (if exist) will be the same places as the unmerged one (see Example 2). + +If you want to remove those additional NAs, you can use 'merge_across_dims_narm = TRUE', +then the NAs will be removed when merging into one dimension. (see Example 2). Example 1 @@ -134,6 +156,27 @@ data <- Start(dat = repos, [11,] 297.9998 296.8002 NA [12,] 299.4571 299.0254 NA +# use merge parameters + + +data <- Start(dat = repos, + var = 'tos', + memb = 'r24i1p1f1', + time = indices(4:27), # Apr 1957 to Mar 1959 + chunk = c('195701-195712', '195801-195812', '195901-195912'), + lat = 'all', + lon = 'all', + time_across = 'chunk', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + return_vars = list(lat = NULL, lon = NULL), + retrieve = TRUE) + +data[1,1,1,,100,100] + [1] 300.7398 299.6569 298.3954 297.1931 295.9608 295.4735 295.8538 297.9998 + [9] 299.4571 300.7659 301.8241 301.6472 301.0621 299.1324 297.4028 296.1619 +[17] 295.2794 295.0474 295.4571 296.8002 299.0254 301.7128 301.4781 301.3807 + ``` Example 3: Read in three winters (DJF) -- GitLab From 07a5237f577f6829c56a5ed1ee2db433bd6e4824 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 6 May 2020 11:24:42 +0200 Subject: [PATCH 16/23] Remove one check in chunk() because it doesn't seem reasonable. --- R/Utils.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/R/Utils.R b/R/Utils.R index 0d2b2fd..bdf8fda 100644 --- a/R/Utils.R +++ b/R/Utils.R @@ -30,10 +30,12 @@ chunk <- function(chunk, n_chunks, selectors) { if (length(chunk) != length(n_chunks)) { stop("Wrong chunk specification.") } - if (!is.null(attr(selectors, 'values'))) { - stop("Multidimensional chunking only available when selector ", - "values provided.") - } +#NOTE: 1. It should be for above? not nultidimensional selector +# 2. it was !is.null before, but it should be is.null (?) +# if (!is.null(attr(selectors, 'values'))) { +# stop("Multidimensional chunking only available when selector ", +# "values provided.") +# } if (is.null(dim(selectors))) { stop("Multidimensional chunking only available when multidimensional ", "selector values provided.") -- GitLab From e7d1f54f8bf1b7706f50c06b9c1119292261567a Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 6 May 2020 11:25:11 +0200 Subject: [PATCH 17/23] Typo fix. --- inst/doc/usecase/ex1_2_exp_obs_attr.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/doc/usecase/ex1_2_exp_obs_attr.R b/inst/doc/usecase/ex1_2_exp_obs_attr.R index de1926e..5a906f3 100644 --- a/inst/doc/usecase/ex1_2_exp_obs_attr.R +++ b/inst/doc/usecase/ex1_2_exp_obs_attr.R @@ -38,8 +38,8 @@ lats <- attr(exp, 'Variables')$common$lat # The 'time' attribute is dependent on 'sdate'. You can see the dimension below. dates <- attr(exp, 'Variables')$common$time # dim(dates) -#sdate ftime -# 4 3 +#sdate time +# 4 3 #------------------------------------------- -- GitLab From daf4e9d7a08870428fdd7ba30156246bd3444eac Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 6 May 2020 13:59:30 +0200 Subject: [PATCH 18/23] Add usecase ex1_7 and FAQ How-to-#17 for split + merge case. --- inst/doc/faq.md | 71 ++++++++++++++++- inst/doc/usecase/ex1_7_split_merge.R | 114 +++++++++++++++++++++++++++ 2 files changed, 183 insertions(+), 2 deletions(-) create mode 100644 inst/doc/usecase/ex1_7_split_merge.R diff --git a/inst/doc/faq.md b/inst/doc/faq.md index ccd4686..a27d473 100644 --- a/inst/doc/faq.md +++ b/inst/doc/faq.md @@ -20,6 +20,7 @@ This document intends to be the first reference for any doubts that you may have 14. [Find the error log when jobs are launched on Power9](#14-find-the-error-log-when-jobs-are-launched-on-power9) 15. [Specify extra function arguments in the workflow](#15-specify-extra-function-arguments-in-the-workflow) 16. [Use parameter 'return_vars' in Start()](#16-use-parameter-return_vars-in-start) + 17. [Use parameter 'split_multiselected_dims' in Start()](#17-use-parameter-split-multiselected-dims-in-start) 2. **Something goes wrong...** @@ -77,6 +78,8 @@ See Example 1. If 'merge_across_dims = TRUE', the chunk dimension will disappear If you want to remove those additional NAs, you can use 'merge_across_dims_narm = TRUE', then the NAs will be removed when merging into one dimension. (see Example 2). +You can find more use cases at [ex1_2_exp_obs_attr.R](inst/doc/usecase/ex1_2_exp_obs_attr.R) and [ex1_3_attr_loadin.R](inst/doc/usecase/ex1_3_attr_loadin.R). + Example 1 ```r @@ -158,7 +161,6 @@ data <- Start(dat = repos, # use merge parameters - data <- Start(dat = repos, var = 'tos', memb = 'r24i1p1f1', @@ -584,8 +586,73 @@ List of 1 $ longitude: num [1:37(1d)] 10 10.3 10.6 10.8 11.1 ... ``` +### 17. Use parameter 'split_multiselected_dims' in Start() +The selectors can be not only vectors, but also multidimensional array. For instance, the 'time' dimension +can be assigned by a two-dimensional array `[sdate = 12, time = 31]`, which is the daily data of 12 months. +You may want to have both 'sdate' and 'time' in the output dimension, even though 'sdate' is not explicitly specified in Start(). +The parameter 'split_multiselected_dims' is for this goal. It is commenly used in the case that uses experimental data attributes to get the corresponding observational data. + +The following script is part of the use case [ex1_2_exp_obs_attr.R](inst/doc/usecase/ex1_2_exp_obs_attr.R). +The time selector for observational data comes from experimental data above (neglected here). The dimension number of the selector is two. +Notice that dimension name, which is 'time' here, must also be one of the dimension names in the selector. + +The result dimensions include 'sdate' because it is splited from 'time'. In the meanwhile, +'date' disappears because 'merge_across_dims = TRUE' (see more explanation at [How-to-#2](#2-indicate-dependent-dimension-and-use-merge-parameters-in-start)). + +```r +# use time attributes from experimental data +dates <- attr(exp, 'Variables')$common$time +dim(dates) +#sdate time +# 4 3 + +obs <- Start(dat = repos_obs, + var = 'tas', + date = unique(format(dates, '%Y%m')), + time = values(dates), #dim: [sdate = 4, time = 3] + time_across = 'date', + lat = 'all', + lon = 'all', + merge_across_dims = TRUE, + split_multiselected_dims = TRUE, + synonims = list(lat = c('lat', 'latitude'), + lon = c('lon', 'longitude')), + return_vars = list(lon = NULL, + lat = NULL, + time = 'date'), + retrieve = FALSE) + +print(attr(obs, 'Dimensions')) +# dat var sdate time lat lon +# 1 1 4 3 256 512 +``` + +Notice that when 'split_multiselected_dims' and 'merge_across_dims' are both used, +and dimension number of the splitted selector is more than two, a potential +problem of mixed dimension might occur. The following script is from part of +the usecase [ex1_7_split_merge.R](inst/doc/usecase/ex1_7_split_merge.R). + +It is important to check if **the order of file_date is in line with dates dimensions order!** +Regardless 'time', which is explicitly specified in Start(), the vector should list 'sdate' first, then 'syear'. +As you can see below, the first element '199607' is sdate = 1, and the second element '199612' is sdate = 2. If the order is wrong, you will still get a +return data but with mixed dimensions. Because 'sdate' and 'syear' are only +implied in the given selectors, Start() cannot check if the order of file_date and dates are consistent or not. + +```r +dates <- attr(hcst, 'Variables')$common$time +dim(dates) +#sdate syear time +# 2 3 12 + +file_date <- sort(unique(gsub('-', '', + sapply(as.character(dates), substr, 1, 7)))) +print(file_date) +#[1] "199607" "199612" "199701" "199707" "199712" "199801" "199807" "199812" +#[9] "199901" +``` + -## Something goes wrong... +# Something goes wrong... ### 1. No space left on device diff --git a/inst/doc/usecase/ex1_7_split_merge.R b/inst/doc/usecase/ex1_7_split_merge.R new file mode 100644 index 0000000..d6e88a7 --- /dev/null +++ b/inst/doc/usecase/ex1_7_split_merge.R @@ -0,0 +1,114 @@ +#--------------------------------------------------------------------- +# This usecase shows the things to be noticed when the parameters +# 'split_multiselected_dims' and 'merge_across_dims' are both used. +# The problem may occur when the dimension number of the splitted selector +# is more than two. + +# If you're not familiar with the usage of these parameters, please see usecases ex1_2 and ex1_3 first, which are less complicated. +# See FAQ How-to-#17 for more explanation. +#--------------------------------------------------------------------- + +library(startR) + +var_name <- 'sfcWind' + +# experimental data +path.exp <- '/esarchive/exp/ecmwf/s2s-monthly_ensforhc/daily/$var$_f24h/$sdate$/$var$_$syear$.nc' + +hcst <- Start(dat = path.exp, + var = var_name, + sdate = c('20160704', '20161222'), + syear = indices(1:3), #2016:2018 + syear_depends = 'sdate', + time = indices(1:12), #4th-15th Jul; 22nd Dec-2nd Jan + latitude = indices(1:10), + longitude = indices(1:10), + ensemble = 'all', + return_vars = list(latitude = 'dat', + longitude = 'dat', + time = c('sdate', 'syear') #time depends on both sdate and syear + ), + retrieve = F) + +#----------------------------------------------------------------------- +# The time attribute is going to be used in retrieving observational data. +# Because 'time' depends on both 'sdate' and 'syear', the dimension number +# is 3. + +dates <- attr(hcst, 'Variables')$common$time +dim(dates) +#sdate syear time +# 2 3 12 +#----------------------------------------------------------------------- + +#----------------------------------------------------------------------- +# This two lines should NOT be used!! It is an example showing the potential +# problem when using 'split_multiselected_dims' and 'merge_across_dims'. +# If 'dates' is reordered to 'syear' ahead of 'sdate', while 'file_date' below +# remains the same, the result will have mixed dimension. + +library(s2dv) +dates <- Reorder(dates, c('syear','time','sdate')) +#----------------------------------------------------------------------- + +#----------------------------------------------------------------------- +# Use 'dates' negerated above to get the required files. +# It is important to check if the order of file_date is in line with +# dates dimensions! Regardless 'time', the vector should list 'sdate' first, +# then 'syear'. As you can see below, the first element '199607' is sdate = 1, +# and the second element '199612' is sdate = 2. + +file_date <- sort(unique(gsub('-', '', + sapply(as.character(dates), substr, 1, 7)))) +print(file_date) +#[1] "199607" "199612" "199701" "199707" "199712" "199801" "199807" "199812" +#[9] "199901" +#----------------------------------------------------------------------- + +# observational data +path.obs <- '/esarchive/recon/ecmwf/era5/1hourly/$var$/$var$_$file_date$.nc' + +obs <- Start(dat = path.obs, + var = var_name, + file_date = file_date, # a vector + latitude = indices(1:10), + longitude = indices(1:10), + time = values(dates), # a 3-dim array (sdate, syear, time) + time_across = 'file_date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(latitude = c('lat','latitude'), + longitude = c('lon','longitude')), + return_vars = list(latitude = 'dat', + longitude = 'dat', + time = 'file_date'), + retrieve = T) + +# check obs data +dim(obs) +# dat var latitude longitude sdate syear time +# 1 1 10 10 2 3 12 +obs[1, 1, 1, 1, 2, 1:2, ] +# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] +#[1,] 4.402223 2.657466 7.296539 10.263944 6.367464 5.433421 3.021327 7.498292 +#[2,] 6.802123 7.110264 7.584915 4.255134 2.047740 3.619044 5.648496 8.322672 +# [,9] [,10] [,11] [,12] +#[1,] 15.321060 1.131008 6.326981 5.301850 +#[2,] 7.942419 7.594263 6.189313 7.627579 + +# check with ncdf4 +library(ncdf4) +file199612 <- nc_open('/esarchive/recon/ecmwf/era5/1hourly/sfcWind/sfcWind_199612.nc') +wind199612 <- ncvar_get(file199612, 'sfcWind') +file199701 <- nc_open('/esarchive/recon/ecmwf/era5/1hourly/sfcWind/sfcWind_199701.nc') +wind199701 <- ncvar_get(file199701, 'sfcWind') + +data_wanted_199612 <- seq(506, 722, 24) +wind199612[1, 1, data_wanted_199612] +# [1] 4.402223 2.657466 7.296539 10.263944 6.367464 5.433421 3.021327 +# [8] 7.498292 15.321060 1.131008 +data_wanted_199701 <- seq(2, 26, 24) +wind199701[1, 1, data_wanted_199701] +#[1] 6.326981 5.301850 + -- GitLab From 3d2e82a8a4fd1c5e6a2a5da95d877946959e206f Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 6 May 2020 14:05:33 +0200 Subject: [PATCH 19/23] Fix hyperlink --- inst/doc/faq.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/doc/faq.md b/inst/doc/faq.md index a27d473..cfff33d 100644 --- a/inst/doc/faq.md +++ b/inst/doc/faq.md @@ -20,7 +20,7 @@ This document intends to be the first reference for any doubts that you may have 14. [Find the error log when jobs are launched on Power9](#14-find-the-error-log-when-jobs-are-launched-on-power9) 15. [Specify extra function arguments in the workflow](#15-specify-extra-function-arguments-in-the-workflow) 16. [Use parameter 'return_vars' in Start()](#16-use-parameter-return_vars-in-start) - 17. [Use parameter 'split_multiselected_dims' in Start()](#17-use-parameter-split-multiselected-dims-in-start) + 17. [Use parameter 'split_multiselected_dims' in Start()](#17-use-parameter-split_multiselected_dims-in-start) 2. **Something goes wrong...** -- GitLab From 8f17423e834c461a697d51a344cf99f23092c17a Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 6 May 2020 17:25:35 +0200 Subject: [PATCH 20/23] Add unit test test-Start-split-merge.R --- inst/doc/usecase/ex1_7_split_merge.R | 4 +- tests/testthat/test-Start-split-merge.R | 144 ++++++++++++++++++++++++ 2 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 tests/testthat/test-Start-split-merge.R diff --git a/inst/doc/usecase/ex1_7_split_merge.R b/inst/doc/usecase/ex1_7_split_merge.R index d6e88a7..e72ccee 100644 --- a/inst/doc/usecase/ex1_7_split_merge.R +++ b/inst/doc/usecase/ex1_7_split_merge.R @@ -20,7 +20,7 @@ hcst <- Start(dat = path.exp, sdate = c('20160704', '20161222'), syear = indices(1:3), #2016:2018 syear_depends = 'sdate', - time = indices(1:12), #4th-15th Jul; 22nd Dec-2nd Jan + time = indices(1:12), #4th-15th Jul; 22nd Dec-2nd Jan latitude = indices(1:10), longitude = indices(1:10), ensemble = 'all', @@ -52,7 +52,7 @@ dates <- Reorder(dates, c('syear','time','sdate')) #----------------------------------------------------------------------- #----------------------------------------------------------------------- -# Use 'dates' negerated above to get the required files. +# Use 'dates' generated above to get the required files. # It is important to check if the order of file_date is in line with # dates dimensions! Regardless 'time', the vector should list 'sdate' first, # then 'syear'. As you can see below, the first element '199607' is sdate = 1, diff --git a/tests/testthat/test-Start-split-merge.R b/tests/testthat/test-Start-split-merge.R new file mode 100644 index 0000000..a88be8b --- /dev/null +++ b/tests/testthat/test-Start-split-merge.R @@ -0,0 +1,144 @@ +context("Start() split + merge dim and value check") + +var_name <- 'sfcWind' +path.exp <- '/esarchive/exp/ecmwf/s2s-monthly_ensforhc/daily/$var$_f24h/$sdate$/$var$_$syear$.nc' + +hcst <- Start(dat = path.exp, + var = var_name, + sdate = c('20160704', '20161222'), + syear = indices(1:3), #2016:2018 + syear_depends = 'sdate', + time = indices(1:12), #4th-15th Jul; 22nd Dec-2nd Jan + latitude = indices(1:10), + longitude = indices(1:10), + ensemble = 'all', + return_vars = list(latitude = 'dat', + longitude = 'dat', + time = c('sdate', 'syear')), + retrieve = F, + silent = TRUE) +dates <- attr(hcst, 'Variables')$common$time +file_date <- sort(unique(gsub('-', '', + sapply(as.character(dates), substr, 1, 7)))) + +path.obs <- '/esarchive/recon/ecmwf/era5/1hourly/$var$/$var$_$file_date$.nc' + + +test_that("1. split + merge + narm", { + +obs <- Start(dat = path.obs, + var = var_name, + file_date = file_date, # a vector + latitude = indices(1:10), + longitude = indices(1:10), + time = values(dates), # a 3-dim array (sdate, syear, time) + time_across = 'file_date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(latitude = c('lat','latitude'), + longitude = c('lon','longitude')), + return_vars = list(latitude = 'dat', + longitude = 'dat', + time = 'file_date'), + retrieve = T) + + expect_equal( + dim(obs), + c(dat = 1, var = 1, latitude = 10, longitude = 10, sdate = 2, syear = 3, time = 12) + ) + expect_equal( + obs[1, 1, 1, 1, 2, 2, 1:3], + c(6.802123, 7.110264, 7.584915), + tolerance = 0.0001 + ) + expect_equal( + mean(obs), + 5.751444, + tolerance = 0.0001 + ) + expect_equal( + length(obs[which(is.na(obs))]), + 0 + ) +}) + + +test_that("2. no split + merge + narm", { + +obs <- Start(dat = path.obs, + var = var_name, + file_date = file_date, # a vector + latitude = indices(1:10), + longitude = indices(1:10), + time = values(dates), # a 3-dim array (sdate, syear, time) + time_across = 'file_date', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = FALSE, + synonims = list(latitude = c('lat','latitude'), + longitude = c('lon','longitude')), + return_vars = list(latitude = 'dat', + longitude = 'dat', + time = 'file_date'), + retrieve = T) + + expect_equal( + dim(obs), + c(dat = 1, var = 1, latitude = 10, longitude = 10, time = 72) + ) + expect_equal( + obs[1, 1, 1, 1, 10:13], + c(4.076760, 7.644944, 4.589063, 4.402223), + tolerance = 0.0001 + ) + expect_equal( + mean(obs), + 5.751444, + tolerance = 0.0001 + ) + expect_equal( + length(obs[which(is.na(obs))]), + 0 + ) +}) + + +test_that("3. no split + merge + no narm", { + +obs <- Start(dat = path.obs, + var = var_name, + file_date = file_date, # a vector + latitude = indices(1:10), + longitude = indices(1:10), + time = values(dates), # a 3-dim array (sdate, syear, time) + time_across = 'file_date', + merge_across_dims = TRUE, + merge_across_dims_narm = FALSE, + split_multiselected_dims = FALSE, + synonims = list(latitude = c('lat','latitude'), + longitude = c('lon','longitude')), + return_vars = list(latitude = 'dat', + longitude = 'dat', + time = 'file_date'), + retrieve = T) + + expect_equal( + dim(obs), + c(dat = 1, var = 1, latitude = 10, longitude = 10, time = 108) + ) + expect_equal( + obs[1, 1, 1, 1, 10:13], + c(4.076760, 7.644944, 4.589063, 4.402223), + tolerance = 0.0001 + ) + expect_equal( + mean(obs, na.rm = T), + 5.751444, + tolerance = 0.0001 + ) + expect_equal( + length(obs[which(is.na(obs))]), + 3600 + ) +}) -- GitLab From a28e39b6449d744fcdf5203e51553ca7e5b6cc07 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 6 May 2020 17:45:58 +0200 Subject: [PATCH 21/23] Update NEWS.md for split + merge bugfix. --- NEWS.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 84dad0f..f1befcf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,7 +3,9 @@ - Add one new parameter 'merge_across_dims_narm' in Start(). If it is TRUE, the additional NAs in the across dimension will be removed. It is useful when a continuous time series is required, or parameter 'split_multiselected_dims' is -TRUE and expected dimensions are supposed to have no NAs. +TRUE and expected dimensions are supposed to have no NAs. +- Bugfix for the possible mixed dimension problem when 'split_multiselected_dims' and +'merge_across_dims' are both used. # startR v1.0.1 (Release date: 2020-04-21) - Bugfix for global longitude across the borders. -- GitLab From e7ddf9647a00d7d3c019d504598bcdbf52a3878c Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 6 May 2020 18:01:51 +0200 Subject: [PATCH 22/23] Add ex1_7 to menu --- inst/doc/usecase.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/inst/doc/usecase.md b/inst/doc/usecase.md index 9ec2572..68bdbf3 100644 --- a/inst/doc/usecase.md +++ b/inst/doc/usecase.md @@ -31,6 +31,9 @@ for more explanation. 6. [Loading gridpoint data](inst/doc/usecase/ex1_6_gridpoint_data.R) **Start** can be used to load single point data by providing a vector of longitudes and latitudes. This use case also ilustrates how to reformat it to get a 'gridpoint' dimension. + 7. [Use split and merge parameters together](inst/doc/usecase/ex1_7_split_merge.R) + This usecase shows the things to be noticed when the parameters 'split_multiselected_dims' and 'merge_across_dims' are both used. +The problem may occur when the dimension number of the splitted selector is more than two. If you are not familiar with the usage of these parameters, please see usecases ex1_2 and ex1_3 first, which are less complicated. You can also go to FAQ How-to-#17 for more explanation. 2. **Execute computation (use `Compute()`)** -- GitLab From 0ccb97a19ca184c4c6e00718786803bf4cd882f8 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 6 May 2020 18:07:06 +0200 Subject: [PATCH 23/23] Typo corrected --- inst/doc/faq.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/doc/faq.md b/inst/doc/faq.md index cfff33d..66df1e1 100644 --- a/inst/doc/faq.md +++ b/inst/doc/faq.md @@ -590,7 +590,7 @@ List of 1 The selectors can be not only vectors, but also multidimensional array. For instance, the 'time' dimension can be assigned by a two-dimensional array `[sdate = 12, time = 31]`, which is the daily data of 12 months. You may want to have both 'sdate' and 'time' in the output dimension, even though 'sdate' is not explicitly specified in Start(). -The parameter 'split_multiselected_dims' is for this goal. It is commenly used in the case that uses experimental data attributes to get the corresponding observational data. +The parameter 'split_multiselected_dims' is for this goal. It is common in the case that uses experimental data attributes to get the corresponding observational data. The following script is part of the use case [ex1_2_exp_obs_attr.R](inst/doc/usecase/ex1_2_exp_obs_attr.R). The time selector for observational data comes from experimental data above (neglected here). The dimension number of the selector is two. -- GitLab