From 46d6c3b40542ae381e42c22d43427efb9858f064 Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 21 Feb 2022 17:44:39 +0100 Subject: [PATCH 01/17] Reshape time attribute if time is across a file dim. --- R/Start.R | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++----- R/zzz.R | 24 +++++++++++++++++++--- 2 files changed, 76 insertions(+), 8 deletions(-) diff --git a/R/Start.R b/R/Start.R index d11c669..8c15236 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3462,8 +3462,47 @@ Start <- function(..., # dim = indices/selectors, common_vars_to_crop[[common_var_to_crop]] <- Subset(transformed_var_with_selectors, inner_dim, crop_indices) } - } else { #old code - common_vars_to_crop[[common_var_to_crop]] <- Subset(common_vars_to_crop[[common_var_to_crop]], inner_dim, crop_indices) + } else { + if (!is.null(file_dim)) { #merge_across_dims, file_dims is the depended file dim + tmp <- common_vars_to_crop[[common_var_to_crop]] + dim_extra_ind <- which(!names(dim(tmp)) %in% c(file_dim, inner_dim)) + if (!identical(dim_extra_ind, integer(0))) { + tmp_list <- asplit(tmp, dim_extra_ind) + dim_file_ind <- which(names(dim(tmp_list[[1]])) == file_dim) + tmp_list <- lapply(tmp_list, asplit, dim_file_ind) + } else { # only file_dim and inner_dim + dim_file_ind <- which(names(dim(tmp)) == file_dim) + tmp_list <- asplit(tmp, dim_file_ind) + } + max_fri_length <- max(sapply(fri, length)) + for (i_extra_dim in 1:length(tmp_list)) { + for (i_fri in 1:length(fri)) { + tmp_list[[i_extra_dim]][[i_fri]] <- + tmp_list[[i_extra_dim]][[i_fri]][fri[[i_fri]]] + + if (length(tmp_list[[i_extra_dim]][[i_fri]]) != max_fri_length) { + tmp_list[[i_extra_dim]][[i_fri]] <- + c(tmp_list[[i_extra_dim]][[i_fri]], rep(NA, max_fri_length - length(tmp_list[[i_extra_dim]][[i_fri]]))) + } + } + } + # Change list back to array + tmp_new_dim <- c(max_fri_length, dim(tmp)[file_dim], dim(tmp)[dim_extra_ind]) + names(tmp_new_dim) <- c(inner_dim, file_dim, names(dim(tmp))[dim_extra_ind]) + common_vars_to_crop[[common_var_to_crop]] <- + array(unlist(tmp_list), dim = tmp_new_dim) + # Reorder back + common_vars_to_crop[[common_var_to_crop]] <- + aperm(common_vars_to_crop[[common_var_to_crop]], match(names(dim(tmp)), names(tmp_new_dim))) + # Convert number back to time + common_vars_to_crop[[common_var_to_crop]] <- + as.POSIXct(common_vars_to_crop[[common_var_to_crop]], + origin = "1970-01-01", tz = 'UTC') + } else { # old code + + common_vars_to_crop[[common_var_to_crop]] <- Subset(common_vars_to_crop[[common_var_to_crop]], inner_dim, crop_indices) + } + } } @@ -3790,9 +3829,14 @@ Start <- function(..., # dim = indices/selectors, if (!merge_across_dims_narm) { data_array_tmp <- array(bigmemory::as.matrix(data_array), dim = final_dims) } else { - data_array_tmp <- remove_additional_na_from_merge( - inner_dims_across_files, final_dims, across_inner_dim, - length_inner_across_dim, data_array) + tmp <- remove_additional_na_from_merge( + inner_dims_across_files, final_dims, across_inner_dim, + length_inner_across_dim, data_array, + merge_dim_metadata = picked_common_vars[[across_inner_dim]]) + data_array_tmp <- tmp$data_array_tmp + # Save dim for later use + dims_of_merge_dim <- dim(picked_common_vars[[across_inner_dim]]) + picked_common_vars[[across_inner_dim]] <- tmp$merge_dim_metadata } if (length(data_array_tmp) != prod(final_dims_fake)) { @@ -3809,6 +3853,12 @@ Start <- function(..., # dim = indices/selectors, } data_array <- array(data_array_tmp, dim = final_dims_fake) + final_dims_fake_merge_dim <- final_dims_fake[names(final_dims_fake) %in% names(dims_of_merge_dim)] + picked_common_vars[[across_inner_dim]] <- array(picked_common_vars[[across_inner_dim]], dim = final_dims_fake_merge_dim) + # Convert numeric back to dates + picked_common_vars[[across_inner_dim]] <- + as.POSIXct(picked_common_vars[[across_inner_dim]], + origin = "1970-01-01", tz = 'UTC') } else { # ! (merge_across_dims + split_multiselected_dims) (old version) data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake) diff --git a/R/zzz.R b/R/zzz.R index 0130724..7cf781e 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1029,7 +1029,7 @@ retrieve_progress_message <- function(work_pieces, num_procs, silent) { # If merge_across_dims = TRUE and merge_across_dims_narm = TRUE, remove the additional NAs # due to unequal inner_dim ('time') length across file_dim ('sdate'). remove_additional_na_from_merge <- function(inner_dims_across_files, final_dims, across_inner_dim, - length_inner_across_dim, data_array) { + length_inner_across_dim, data_array, merge_dim_metadata) { across_file_dim <- names(inner_dims_across_files) #TODO: more than one? # Get the length of these two dimensions in final_dims length_inner_across_store_dims <- final_dims[across_inner_dim] @@ -1051,7 +1051,13 @@ remove_additional_na_from_merge <- function(inner_dims_across_files, final_dims, # dim(data_array) = [time, file_date] # dim(logi_array) = [time, file_date] # Change the blank spaces from NA to -9999 - data_array[which(!logi_array)] <- -9999 + if (any(class(data_array) %in% c("POSIXct", "POSIXt"))) { + # change to numeric first + data_array <- array(as.vector(data_array), dim = dim(data_array)) + data_array[which(!logi_array)] <- max(data_array, na.rm = T) + 1 + } else { + data_array[which(!logi_array)] <- -9999 + } return(data_array) } data_array_final_dims <- multiApply::Apply(data_array_final_dims, @@ -1059,12 +1065,24 @@ remove_additional_na_from_merge <- function(inner_dims_across_files, final_dims, output_dims = c(across_inner_dim, across_file_dim), fun = func_remove_blank, logi_array = logi_array)$output1 + + merge_dim_metadata <- multiApply::Apply(merge_dim_metadata, + target_dims = c(across_inner_dim, across_file_dim), + output_dims = c(across_inner_dim, across_file_dim), + fun = func_remove_blank, + logi_array = logi_array)$output1 + ## reorder back to the correct dim tmp <- match(names(final_dims), names(dim(data_array_final_dims))) data_array_final_dims <- .aperm2(data_array_final_dims, tmp) data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector - return(data_array_tmp) + # Reorder metadata dim as final dim + tmp <- match(names(final_dims), names(dim(merge_dim_metadata))) + merge_dim_metadata <- aperm(merge_dim_metadata, tmp[!is.na(tmp)]) + merge_dim_metadata <- merge_dim_metadata[merge_dim_metadata != max(merge_dim_metadata, na.rm = TRUE)] + + return(list(data_array_tmp = data_array_tmp, merge_dim_metadata = merge_dim_metadata)) } -- GitLab From a7b8d3d6f8c5a436e5dbaa9e71d41dda8712dda5 Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 21 Feb 2022 18:04:18 +0100 Subject: [PATCH 02/17] Add condition to time conversion --- R/Start.R | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/R/Start.R b/R/Start.R index 8c15236..0fba6d8 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3494,10 +3494,12 @@ Start <- function(..., # dim = indices/selectors, # Reorder back common_vars_to_crop[[common_var_to_crop]] <- aperm(common_vars_to_crop[[common_var_to_crop]], match(names(dim(tmp)), names(tmp_new_dim))) - # Convert number back to time - common_vars_to_crop[[common_var_to_crop]] <- - as.POSIXct(common_vars_to_crop[[common_var_to_crop]], - origin = "1970-01-01", tz = 'UTC') + if (common_var_to_crop == 'time') { + # Convert number back to time + common_vars_to_crop[[common_var_to_crop]] <- + as.POSIXct(common_vars_to_crop[[common_var_to_crop]], + origin = "1970-01-01", tz = 'UTC') + } } else { # old code common_vars_to_crop[[common_var_to_crop]] <- Subset(common_vars_to_crop[[common_var_to_crop]], inner_dim, crop_indices) @@ -3855,10 +3857,12 @@ Start <- function(..., # dim = indices/selectors, data_array <- array(data_array_tmp, dim = final_dims_fake) final_dims_fake_merge_dim <- final_dims_fake[names(final_dims_fake) %in% names(dims_of_merge_dim)] picked_common_vars[[across_inner_dim]] <- array(picked_common_vars[[across_inner_dim]], dim = final_dims_fake_merge_dim) - # Convert numeric back to dates - picked_common_vars[[across_inner_dim]] <- - as.POSIXct(picked_common_vars[[across_inner_dim]], - origin = "1970-01-01", tz = 'UTC') + if (across_inner_dim == 'time') { + # Convert numeric back to dates + picked_common_vars[[across_inner_dim]] <- + as.POSIXct(picked_common_vars[[across_inner_dim]], + origin = "1970-01-01", tz = 'UTC') + } } else { # ! (merge_across_dims + split_multiselected_dims) (old version) data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake) -- GitLab From 3db54a0906df47eb0d742644d51dbc9a77080a7c Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 21 Feb 2022 19:32:59 +0100 Subject: [PATCH 03/17] code improvement --- R/Start.R | 4 +++- R/zzz.R | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/R/Start.R b/R/Start.R index 0fba6d8..9589e25 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3473,6 +3473,8 @@ Start <- function(..., # dim = indices/selectors, } else { # only file_dim and inner_dim dim_file_ind <- which(names(dim(tmp)) == file_dim) tmp_list <- asplit(tmp, dim_file_ind) + # Add another layer to be consistent with the first case above + tmp_list <- list(tmp_list) } max_fri_length <- max(sapply(fri, length)) for (i_extra_dim in 1:length(tmp_list)) { @@ -3832,7 +3834,7 @@ Start <- function(..., # dim = indices/selectors, data_array_tmp <- array(bigmemory::as.matrix(data_array), dim = final_dims) } else { tmp <- remove_additional_na_from_merge( - inner_dims_across_files, final_dims, across_inner_dim, + inner_dims_across_files, final_dims, length_inner_across_dim, data_array, merge_dim_metadata = picked_common_vars[[across_inner_dim]]) data_array_tmp <- tmp$data_array_tmp diff --git a/R/zzz.R b/R/zzz.R index 7cf781e..b21e51c 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1028,9 +1028,10 @@ retrieve_progress_message <- function(work_pieces, num_procs, silent) { # If merge_across_dims = TRUE and merge_across_dims_narm = TRUE, remove the additional NAs # due to unequal inner_dim ('time') length across file_dim ('sdate'). -remove_additional_na_from_merge <- function(inner_dims_across_files, final_dims, across_inner_dim, +remove_additional_na_from_merge <- function(inner_dims_across_files, final_dims, length_inner_across_dim, data_array, merge_dim_metadata) { across_file_dim <- names(inner_dims_across_files) #TODO: more than one? + across_inner_dim <- inner_dims_across_files[[1]] #TODO: more than one? # Get the length of these two dimensions in final_dims length_inner_across_store_dims <- final_dims[across_inner_dim] length_file_across_store_dims <- final_dims[across_file_dim] -- GitLab From 7322a6893b0246ad6b7636145376cd263355a669 Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 21 Feb 2022 20:10:48 +0100 Subject: [PATCH 04/17] Add crossed file dim in return_vars instead of replacing the original value --- R/Start.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/R/Start.R b/R/Start.R index 9589e25..2eb9cda 100644 --- a/R/Start.R +++ b/R/Start.R @@ -1746,10 +1746,12 @@ Start <- function(..., # dim = indices/selectors, is.character(file_dim_as_selector_array_dim)) { #(2) or (1) # inner_dim is not in return_vars or is NULL if (((!inner_dim %in% names(common_return_vars)) & (!inner_dim %in% names(return_vars))) | - (inner_dim %in% names(common_return_vars) & is.null(common_return_vars[[inner_dim]]))) { - common_return_vars[[inner_dim]] <- correct_return_vars( - inner_dim, inner_dims_across_files, - found_pattern_dim, file_dim_as_selector_array_dim) + (inner_dim %in% names(common_return_vars) & is.null(common_return_vars[[inner_dim]])) | + (inner_dim %in% names(common_return_vars) & !names(inner_dims_across_files) %in% common_return_vars[[inner_dim]])) { + common_return_vars[[inner_dim]] <- + c(common_return_vars[[inner_dim]], + correct_return_vars(inner_dim, inner_dims_across_files, + found_pattern_dim, file_dim_as_selector_array_dim)) } } } -- GitLab From 32550ef154307e999e6fe6a878cc5d6eced038fd Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 22 Feb 2022 12:56:52 +0100 Subject: [PATCH 05/17] Put the lost metadata back after reshaping --- R/Start.R | 23 ++++++++++++++++--- .../test-Start-metadata_filedim_dependency.R | 4 ++-- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/R/Start.R b/R/Start.R index 2eb9cda..cce3834 100644 --- a/R/Start.R +++ b/R/Start.R @@ -1745,9 +1745,19 @@ Start <- function(..., # dim = indices/selectors, if (inner_dim %in% inner_dims_across_files | is.character(file_dim_as_selector_array_dim)) { #(2) or (1) # inner_dim is not in return_vars or is NULL - if (((!inner_dim %in% names(common_return_vars)) & (!inner_dim %in% names(return_vars))) | - (inner_dim %in% names(common_return_vars) & is.null(common_return_vars[[inner_dim]])) | - (inner_dim %in% names(common_return_vars) & !names(inner_dims_across_files) %in% common_return_vars[[inner_dim]])) { + need_correct <- FALSE + if (((!inner_dim %in% names(common_return_vars)) & + (!inner_dim %in% names(return_vars))) | + (inner_dim %in% names(common_return_vars) & + is.null(common_return_vars[[inner_dim]]))) { + need_correct <- TRUE + } else if (inner_dim %in% names(common_return_vars) & + !is.null(names(inner_dims_across_files))) { + if (!names(inner_dims_across_files) %in% common_return_vars[[inner_dim]]) { + need_correct <- TRUE + } + } + if (need_correct) { common_return_vars[[inner_dim]] <- c(common_return_vars[[inner_dim]], correct_return_vars(inner_dim, inner_dims_across_files, @@ -3467,6 +3477,7 @@ Start <- function(..., # dim = indices/selectors, } else { if (!is.null(file_dim)) { #merge_across_dims, file_dims is the depended file dim tmp <- common_vars_to_crop[[common_var_to_crop]] + tmp_attributes <- attributes(common_vars_to_crop[[common_var_to_crop]]) dim_extra_ind <- which(!names(dim(tmp)) %in% c(file_dim, inner_dim)) if (!identical(dim_extra_ind, integer(0))) { tmp_list <- asplit(tmp, dim_extra_ind) @@ -3498,6 +3509,12 @@ Start <- function(..., # dim = indices/selectors, # Reorder back common_vars_to_crop[[common_var_to_crop]] <- aperm(common_vars_to_crop[[common_var_to_crop]], match(names(dim(tmp)), names(tmp_new_dim))) + # Put attributes back + tmp <- which(!names(tmp_attributes) %in% names(attributes(common_vars_to_crop[[common_var_to_crop]]))) + attributes(common_vars_to_crop[[common_var_to_crop]]) <- + c(attributes(common_vars_to_crop[[common_var_to_crop]]), + tmp_attributes[tmp]) + if (common_var_to_crop == 'time') { # Convert number back to time common_vars_to_crop[[common_var_to_crop]] <- diff --git a/tests/testthat/test-Start-metadata_filedim_dependency.R b/tests/testthat/test-Start-metadata_filedim_dependency.R index a5fa558..cfd7dfb 100644 --- a/tests/testthat/test-Start-metadata_filedim_dependency.R +++ b/tests/testthat/test-Start-metadata_filedim_dependency.R @@ -65,7 +65,7 @@ expect_equal( return_vars = list(time = NULL), retrieve = TRUE) ) - time6 <- attr(test4, 'Variables')$common$time + time6 <- attr(test6, 'Variables')$common$time expect_equal( dim(time6), @@ -101,7 +101,7 @@ expect_equal( return_vars = list(time = NULL), retrieve = TRUE) ) - time6a <- attr(test4, 'Variables')$common$time + time6a <- attr(test6a, 'Variables')$common$time expect_equal( dim(time6a), -- GitLab From 6ee2db8783a2916bb56169f47847b20b0c71407c Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 22 Feb 2022 18:21:19 +0100 Subject: [PATCH 06/17] Keep metadata integrate during the reshaping process --- R/Start.R | 3 + R/zzz.R | 6 ++ .../testthat/test-Start-metadata_reshaping.R | 93 +++++++++++++++++++ 3 files changed, 102 insertions(+) create mode 100644 tests/testthat/test-Start-metadata_reshaping.R diff --git a/R/Start.R b/R/Start.R index cce3834..784d25e 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3877,7 +3877,10 @@ Start <- function(..., # dim = indices/selectors, data_array <- array(data_array_tmp, dim = final_dims_fake) final_dims_fake_merge_dim <- final_dims_fake[names(final_dims_fake) %in% names(dims_of_merge_dim)] + tmp_attr <- attr(picked_common_vars[[across_inner_dim]], 'variables') picked_common_vars[[across_inner_dim]] <- array(picked_common_vars[[across_inner_dim]], dim = final_dims_fake_merge_dim) + attr(picked_common_vars[[across_inner_dim]], 'variables') <- tmp_attr + if (across_inner_dim == 'time') { # Convert numeric back to dates picked_common_vars[[across_inner_dim]] <- diff --git a/R/zzz.R b/R/zzz.R index b21e51c..44f3cdc 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1067,6 +1067,7 @@ remove_additional_na_from_merge <- function(inner_dims_across_files, final_dims, fun = func_remove_blank, logi_array = logi_array)$output1 + tmp_attr <- attributes(merge_dim_metadata)$variables merge_dim_metadata <- multiApply::Apply(merge_dim_metadata, target_dims = c(across_inner_dim, across_file_dim), output_dims = c(across_inner_dim, across_file_dim), @@ -1082,7 +1083,9 @@ remove_additional_na_from_merge <- function(inner_dims_across_files, final_dims, tmp <- match(names(final_dims), names(dim(merge_dim_metadata))) merge_dim_metadata <- aperm(merge_dim_metadata, tmp[!is.na(tmp)]) merge_dim_metadata <- merge_dim_metadata[merge_dim_metadata != max(merge_dim_metadata, na.rm = TRUE)] + attr(merge_dim_metadata, 'variables') <- tmp_attr + #NOTE: both outputs are vectors. If 'merge_dim_metadata' is actually time, it is just numeric here. return(list(data_array_tmp = data_array_tmp, merge_dim_metadata = merge_dim_metadata)) } @@ -1335,8 +1338,11 @@ generate_picked_var_of_read <- function(var_to_read, var_to_check, array_of_file } else { padding <- array(dim = padding_dims) } + tmp_attr <- attributes(either_picked_vars)$variables either_picked_vars <- .abind2(either_picked_vars, padding, names(full_array_var_dims)[longer_dims_in_full_array]) + attr(either_picked_vars, 'variables') <- tmp_attr + } else { stop("Error while reading the variable '", var_to_read, "' from ", "the file. Found size (", paste(var_dims, collapse = ' x '), diff --git a/tests/testthat/test-Start-metadata_reshaping.R b/tests/testthat/test-Start-metadata_reshaping.R new file mode 100644 index 0000000..d4d4c28 --- /dev/null +++ b/tests/testthat/test-Start-metadata_reshaping.R @@ -0,0 +1,93 @@ +context("Start() metadata reshaping") +# When data is reshaping (e.g., time_across = 'sdate'), the corresponding attribute should be reshaped too. + +test_that("1. time across fyear, fyear depends on sdate", { + +suppressWarnings( +data <- Start(dat = '/esarchive/exp/ecearth/a3w5/original_files/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/$member$/day/$var$/gr/v20210910/$var$_day_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc', + var = 'tasmin', + lat = indices(1), + lon = indices(1), + sdate = paste0(1960:1961), + time = 62:426, ## Jan to Dec (initialised in Nov) + time_across = 'fyear', + merge_across_dims = TRUE, + fyear = 'all', + fyear_depends = 'sdate', + member = 'r1i4p1f1', + synonims = list(lat = c('lat','latitude'), + lon = c('lon','longitude')), + return_vars = list(lat = NULL, lon = NULL, + time = c('sdate', 'fyear')), + retrieve = TRUE) +) +dates <- attr(data,'Variables')$common[['time']] + +expect_equal( +dim(dates), +c(sdate = 2, time = 365) +) +expect_equal( +dim(drop(data)), +dim(dates) +) +expect_equal( +names(attributes(dates)), +c('variables', 'dim', 'class', 'tzone') +) +expect_equal( +class(dates), +c("POSIXct", "POSIXt") +) +expect_equal( +as.vector(dates[1, c(1:2, 365)]), +as.vector(as.POSIXct(c("1961-01-01 12:00:00", "1961-01-02 12:00:00", "1961-12-31 12:00:00"), + tz = 'UTC')) +) +expect_equal( +as.vector(dates[2, c(1:2, 365)]), +as.vector(as.POSIXct(c("1962-01-01 12:00:00", "1962-01-02 12:00:00", "1962-12-31 12:00:00"), + tz = 'UTC')) +) + +}) + + +test_that("2. time across fyear, only one sdate", { + +suppressWarnings( +data <- Start(dat = '/esarchive/exp/ecearth/a3w5/original_files/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/$member$/day/$var$/gr/v20210910/$var$_day_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc', + var = 'tasmin', + lat = indices(1), + lon = indices(1), + sdate = paste0(1960), + time = 62:426, ## Jan to Dec (initialised in Nov) + time_across = 'fyear', + merge_across_dims = TRUE, + fyear = 'all', +# fyear_depends = 'sdate', + member = 'r1i4p1f1', + synonims = list(lat = c('lat','latitude'), + lon = c('lon','longitude')), + return_vars = list(lat = NULL, lon = NULL, + time = c('fyear')), + retrieve = TRUE) +) +dates <- attr(data,'Variables')$common[['time']] + + +expect_equal( +dim(dates), +c(time = 365) +) +expect_equal( +length(data), +length(dates) +) +expect_equal( +as.vector(dates[c(1:2, 365)]), +as.vector(as.POSIXct(c("1961-01-01 12:00:00", "1961-01-02 12:00:00", "1961-12-31 12:00:00"), + tz = 'UTC')) +) + +}) -- GitLab From a18a0fee7387390268bb6a090db19d307d0c90bf Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 23 Feb 2022 12:06:05 +0100 Subject: [PATCH 07/17] Revise the NA substituted value --- R/zzz.R | 4 +- .../testthat/test-Start-metadata_reshaping.R | 64 ++++++++++++++++--- 2 files changed, 57 insertions(+), 11 deletions(-) diff --git a/R/zzz.R b/R/zzz.R index 44f3cdc..5b2b786 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1055,7 +1055,7 @@ remove_additional_na_from_merge <- function(inner_dims_across_files, final_dims, if (any(class(data_array) %in% c("POSIXct", "POSIXt"))) { # change to numeric first data_array <- array(as.vector(data_array), dim = dim(data_array)) - data_array[which(!logi_array)] <- max(data_array, na.rm = T) + 1 + data_array[which(!logi_array)] <- -12^10 } else { data_array[which(!logi_array)] <- -9999 } @@ -1082,7 +1082,7 @@ remove_additional_na_from_merge <- function(inner_dims_across_files, final_dims, # Reorder metadata dim as final dim tmp <- match(names(final_dims), names(dim(merge_dim_metadata))) merge_dim_metadata <- aperm(merge_dim_metadata, tmp[!is.na(tmp)]) - merge_dim_metadata <- merge_dim_metadata[merge_dim_metadata != max(merge_dim_metadata, na.rm = TRUE)] + merge_dim_metadata <- merge_dim_metadata[merge_dim_metadata != -12^10] attr(merge_dim_metadata, 'variables') <- tmp_attr #NOTE: both outputs are vectors. If 'merge_dim_metadata' is actually time, it is just numeric here. diff --git a/tests/testthat/test-Start-metadata_reshaping.R b/tests/testthat/test-Start-metadata_reshaping.R index d4d4c28..680f8d6 100644 --- a/tests/testthat/test-Start-metadata_reshaping.R +++ b/tests/testthat/test-Start-metadata_reshaping.R @@ -40,14 +40,12 @@ class(dates), c("POSIXct", "POSIXt") ) expect_equal( -as.vector(dates[1, c(1:2, 365)]), -as.vector(as.POSIXct(c("1961-01-01 12:00:00", "1961-01-02 12:00:00", "1961-12-31 12:00:00"), - tz = 'UTC')) +as.vector(dates[1, ]), +as.vector(seq(as.POSIXct('1961-01-01 12:00:00', tz = 'UTC'), as.POSIXct('1961-12-31 12:00:00', tz = 'UTC'), by = 'day')) ) expect_equal( -as.vector(dates[2, c(1:2, 365)]), -as.vector(as.POSIXct(c("1962-01-01 12:00:00", "1962-01-02 12:00:00", "1962-12-31 12:00:00"), - tz = 'UTC')) +as.vector(dates[2, ]), +as.vector(seq(as.POSIXct('1962-01-01 12:00:00', tz = 'UTC'), as.POSIXct('1962-12-31 12:00:00', tz = 'UTC'), by = 'day')) ) }) @@ -85,9 +83,57 @@ length(data), length(dates) ) expect_equal( -as.vector(dates[c(1:2, 365)]), -as.vector(as.POSIXct(c("1961-01-01 12:00:00", "1961-01-02 12:00:00", "1961-12-31 12:00:00"), - tz = 'UTC')) +as.vector(dates), +as.vector(seq(as.POSIXct('1961-01-01 12:00:00', tz = 'UTC'), as.POSIXct('1961-12-31 12:00:00', tz = 'UTC'), by = 'day')) +) + + + +}) + + +test_that("3. time across fyear, fyear depends on sdate, 1st fyear is empty, 3rd fyear has more indices than 2nd one, 1964 is leap year", { + +suppressWarnings( +data <- Start(dat = '/esarchive/exp/ecearth/a3w5/original_files/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/$member$/day/$var$/gr/v20210910/$var$_day_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc', + var = 'tasmin', + lat = indices(1), + lon = indices(1), + sdate = paste0(1960:1961), + time = 700:860, ## initialised in Nov + time_across = 'fyear', + merge_across_dims = TRUE, + fyear = 'all', + fyear_depends = 'sdate', + member = 'r1i4p1f1', + synonims = list(lat = c('lat','latitude'), + lon = c('lon','longitude')), + return_vars = list(lat = NULL, lon = NULL, + time = c('sdate', 'fyear')), + retrieve = TRUE) +) +dates <- attr(data,'Variables')$common[['time']] + +expect_equal( +dim(dates), +c(sdate = 2, time = 161) +) + +expect_equal( +names(attributes(dates)), +c('variables', 'dim', 'class', 'tzone') +) +expect_equal( +class(dates), +c("POSIXct", "POSIXt") +) +expect_equal( +as.vector(dates[1, ]), +as.vector(seq(as.POSIXct('1962-10-01 12:00:00', tz = 'UTC'), as.POSIXct('1963-03-10 12:00:00', tz = 'UTC'), by = 'day')) +) +expect_equal( +as.vector(dates[2, ]), +as.vector(seq(as.POSIXct('1963-10-01 12:00:00', tz = 'UTC'), as.POSIXct('1964-03-09 12:00:00', tz = 'UTC'), by = 'day')) ) }) -- GitLab From 2ec599cb7d4986610f78766ffce88c8a0f313335 Mon Sep 17 00:00:00 2001 From: aho Date: Thu, 24 Feb 2022 18:21:05 +0100 Subject: [PATCH 08/17] Reshape metadata for split dim case --- R/Start.R | 37 ++++++++++++++++++++++++------------- R/zzz.R | 19 ++++++++++++++++--- 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/R/Start.R b/R/Start.R index 784d25e..f473dfd 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3870,17 +3870,37 @@ Start <- function(..., # dim = indices/selectors, #NOTE: When one file contains values for dicrete dimensions, rearrange the # chunks (i.e., work_piece) is necessary. if (split_multiselected_dims) { - data_array_tmp <- rebuild_array_merge_split( - data_array_tmp, indices_chunk, all_split_dims, final_dims_fake, - across_inner_dim, length_inner_across_dim) + tmp <- rebuild_array_merge_split( + data_array_tmp, indices_chunk, all_split_dims, final_dims_fake, + across_inner_dim, length_inner_across_dim, metadata_tmp = picked_common_vars[[across_inner_dim]]) + data_array_tmp <- tmp$data_array_tmp + picked_common_vars[[across_inner_dim]] <- tmp$metadata_tmp } data_array <- array(data_array_tmp, dim = final_dims_fake) - final_dims_fake_merge_dim <- final_dims_fake[names(final_dims_fake) %in% names(dims_of_merge_dim)] + + # Find final_dim_fake for metadata and put it in an array + if (!split_multiselected_dims) { + final_dims_fake_merge_dim <- final_dims_fake[names(final_dims_fake) %in% names(dims_of_merge_dim)] + } else { + final_dims_fake_merge_dim <- final_dims_fake[names(final_dims_fake) %in% names(all_split_dims[[across_inner_dim]])] + } + tmp_attr <- attr(picked_common_vars[[across_inner_dim]], 'variables') picked_common_vars[[across_inner_dim]] <- array(picked_common_vars[[across_inner_dim]], dim = final_dims_fake_merge_dim) attr(picked_common_vars[[across_inner_dim]], 'variables') <- tmp_attr + # If split_multiselected_dims + merge_across_dims, the dimension order may change above. + # To get the user-required dim order, we need to reorder the array again. + if (split_multiselected_dims) { + if (inner_dim_pos_in_split_dims != 1) { + correct_order <- match(names(final_dims_fake_output), names(final_dims_fake)) + data_array <- .aperm2(data_array, correct_order) + correct_order_metadata <- match(names(final_dims_fake_output), names(all_split_dims[[across_inner_dim]])) + picked_common_vars[[across_inner_dim]] <- .aperm2(picked_common_vars[[across_inner_dim]], correct_order_metadata[!is.na(correct_order_metadata)]) + } + } + if (across_inner_dim == 'time') { # Convert numeric back to dates picked_common_vars[[across_inner_dim]] <- @@ -3892,15 +3912,6 @@ Start <- function(..., # dim = indices/selectors, data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake) } - # NOTE: If split_multiselected_dims + merge_across_dims, the dimension order may change above. - # To get the user-required dim order, we need to reorder the array again. - if (split_multiselected_dims & merge_across_dims) { - if (inner_dim_pos_in_split_dims != 1) { - correct_order <- match(names(final_dims_fake_output), names(final_dims_fake)) - data_array <- .aperm2(data_array, correct_order) - } - } - gc() # Load metadata and remove the metadata folder diff --git a/R/zzz.R b/R/zzz.R index 5b2b786..af601f7 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1094,7 +1094,7 @@ remove_additional_na_from_merge <- function(inner_dims_across_files, final_dims, # When merge_across_dims = TRUE and split_multiselected_dims = TRUE, rearrange the chunks # (i.e., work_piece) is necessary if one file contains values for discrete dimensions rebuild_array_merge_split <- function(data_array_tmp, indices_chunk, all_split_dims, - final_dims_fake, across_inner_dim, length_inner_across_dim) { + final_dims_fake, across_inner_dim, length_inner_across_dim, metadata_tmp) { # generate the correct order list from indices_chunk final_order_list <- list() i <- 1 @@ -1126,14 +1126,20 @@ rebuild_array_merge_split <- function(data_array_tmp, indices_chunk, all_split_d new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)]) } data_array_no_split <- array(data_array_tmp, dim = new_dims) + metadata_no_split <- array(metadata_tmp, dim = new_dims) + # seperate 'time' dim into each work_piece length data_array_seperate <- list() + metadata_seperate <- list() tmp <- cumsum(unlist(length_inner_across_dim)) tmp <- c(0, tmp) for (i in 1:length(length_inner_across_dim)) { data_array_seperate[[i]] <- ClimProjDiags::Subset(data_array_no_split, across_inner_dim, (tmp[i] + 1):tmp[i + 1]) + metadata_seperate[[i]] <- ClimProjDiags::Subset(metadata_no_split, + across_inner_dim, + (tmp[i] + 1):tmp[i + 1]) } # re-build the array: chunk which_chunk <- as.numeric(names(final_order_list)) @@ -1142,26 +1148,33 @@ rebuild_array_merge_split <- function(data_array_tmp, indices_chunk, all_split_d how_many_indices <- unlist(final_order_list) array_piece <- list() + metadata_piece <- list() ind_in_array_seperate <- as.list(rep(1, length(data_array_seperate))) for (i in 1:length(final_order_list)) { array_piece[[i]] <- ClimProjDiags::Subset( data_array_seperate[[which_chunk[i]]], across_inner_dim, ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1)) + metadata_piece[[i]] <- ClimProjDiags::Subset( + metadata_seperate[[which_chunk[i]]], across_inner_dim, + ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1)) ind_in_array_seperate[[which_chunk[i]]] <- ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] } # re-build the array: paste data_array_tmp <- array_piece[[1]] + metadata_tmp <- metadata_piece[[1]] along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim) if (length(array_piece) > 1) { for (i in 2:length(array_piece)) { data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]], along = along_pos) + metadata_tmp <- abind::abind(metadata_tmp, metadata_piece[[i]], + along = along_pos) } } } - - return(data_array_tmp) + + return(list(data_array_tmp = data_array_tmp, metadata_tmp = metadata_tmp)) } -- GitLab From 979a14efcdb051cc593c55b6ddac00111cae1e8f Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 25 Feb 2022 19:13:02 +0100 Subject: [PATCH 09/17] Finish metadata reshaping for retrieve = TRUE, including different combinations of merge_across_dims, merge_across_dims_narm, and split_multiselected_dims --- R/NcDataReader.R | 191 ++++++------ R/Start.R | 92 ++++-- .../testthat/test-Start-metadata_reshaping.R | 274 +++++++++++++++++- 3 files changed, 425 insertions(+), 132 deletions(-) diff --git a/R/NcDataReader.R b/R/NcDataReader.R index fb61ef8..7613172 100644 --- a/R/NcDataReader.R +++ b/R/NcDataReader.R @@ -183,120 +183,121 @@ NcDataReader <- function(file_path = NULL, file_object = NULL, } }) - if (length(names(attr(result, 'variables'))) == 1 & all(names(attr(result, 'variables')) == 'time')) { - var_name <- names(attr(result, 'variables')) - units <- attr(result, 'variables')[[var_name]][['units']] + if (length(names(attr(result, 'variables'))) == 1) { + if ('time' %in% synonims[[names(attr(result, 'variables'))]]) { + var_name <- names(attr(result, 'variables')) + units <- attr(result, 'variables')[[var_name]][['units']] - if (units %in% c('seconds', 'minutes', 'hours', 'days', 'weeks', 'months', 'years')) { - if (units == 'seconds') { -# units <- 'secs' - } else if (units == 'minutes') { -# units <- 'mins' - result <- result * 60 # min to sec - } - result[] <- paste(result[], units) - - } else if (grepl(' since ', units)) { - # Find the calendar - calendar <- attr(result, 'variables')[[var_name]]$calendar - if (calendar == 'standard') calendar <- 'gregorian' + if (units %in% c('seconds', 'minutes', 'hours', 'days', 'weeks', 'months', 'years')) { + if (units == 'seconds') { +# units <- 'secs' + } else if (units == 'minutes') { +# units <- 'mins' + result <- result * 60 # min to sec + } + result[] <- paste(result[], units) - parts <- strsplit(units, ' since ')[[1]] - units <- parts[1] + } else if (grepl(' since ', units)) { + # Find the calendar + calendar <- attr(result, 'variables')[[var_name]]$calendar + if (calendar == 'standard') calendar <- 'gregorian' - if (units %in% c('second', 'seconds')) { -# units <- 'secs' - } else if (units %in% c('minute', 'minutes')) { -# units <- 'mins' - result <- result * 60 # min to sec - } else if (units %in% c('hour', 'hours')) { - result <- result * 60 * 60 # hour to sec - } else if (units %in% c('day', 'days')) { -# units <- 'days' - result <- result * 24 * 60 * 60 # day to sec - } else if (units %in% c('month', 'months')) { - # define day in each month - leap_month_day <- c(31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) - no_leap_month_day <- c(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) - # Origin year and month - ori_year <- as.numeric(substr(parts[2], 1, 4)) - ori_month <- as.numeric(substr(parts[2], 6, 7)) - if (is.na(ori_month)) { - ori_month <- as.numeric(substr(parts[2], 6, 6)) - } - if (!is.numeric(ori_year) | !is.numeric(ori_month)) { - stop(paste0("The time unit attribute format is not 'YYYY-MM-DD' or 'YYYY-M-D'. ", - "Check the file or contact the maintainer.")) - } + parts <- strsplit(units, ' since ')[[1]] + units <- parts[1] - if (calendar == 'gregorian') { - # Find how many years + months - yr_num <- floor(result / 12) - month_left <- result - yr_num * 12 - # Find the leap years we care - if (ori_month <= 2) { - leap_num <- length(which(sapply(ori_year:(ori_year + yr_num - 1), s2dv::LeapYear))) - } else { - leap_num <- length(which(sapply((ori_year + 1):(ori_year + yr_num), s2dv::LeapYear))) + if (units %in% c('second', 'seconds')) { +# units <- 'secs' + } else if (units %in% c('minute', 'minutes')) { +# units <- 'mins' + result <- result * 60 # min to sec + } else if (units %in% c('hour', 'hours')) { + result <- result * 60 * 60 # hour to sec + } else if (units %in% c('day', 'days')) { +# units <- 'days' + result <- result * 24 * 60 * 60 # day to sec + } else if (units %in% c('month', 'months')) { + # define day in each month + leap_month_day <- c(31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) + no_leap_month_day <- c(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) + # Origin year and month + ori_year <- as.numeric(substr(parts[2], 1, 4)) + ori_month <- as.numeric(substr(parts[2], 6, 7)) + if (is.na(ori_month)) { + ori_month <- as.numeric(substr(parts[2], 6, 6)) + } + if (!is.numeric(ori_year) | !is.numeric(ori_month)) { + stop(paste0("The time unit attribute format is not 'YYYY-MM-DD' or 'YYYY-M-D'. ", + "Check the file or contact the maintainer.")) } - total_days <- leap_num * 366 + (yr_num - leap_num) * 365 # not include month_left yet + if (calendar == 'gregorian') { + # Find how many years + months + yr_num <- floor(result / 12) + month_left <- result - yr_num * 12 + # Find the leap years we care + if (ori_month <= 2) { + leap_num <- length(which(sapply(ori_year:(ori_year + yr_num - 1), s2dv::LeapYear))) + } else { + leap_num <- length(which(sapply((ori_year + 1):(ori_year + yr_num), s2dv::LeapYear))) + } + total_days <- leap_num * 366 + (yr_num - leap_num) * 365 # not include month_left yet - if (month_left != 0) { - if ((ori_month + month_left) <= 12) { # the last month is still in the same last yr - # Is the last year a leap year? - last_leap <- s2dv::LeapYear(ori_year + yr_num) - if (last_leap) { - total_days <- total_days + sum(leap_month_day[ori_month:(ori_month + month_left - 1)]) - } else { - total_days <- total_days + sum(no_leap_month_day[ori_month:(ori_month + month_left - 1)]) - } - } else { # the last month ends in the next yr - if (ori_month == 2) { # e.g., 2005-02-16 + 11mth = 2006-01-16 - last_leap <- s2dv::LeapYear(ori_year + yr_num) # still consider 2005 + if (month_left != 0) { + if ((ori_month + month_left) <= 12) { # the last month is still in the same last yr + # Is the last year a leap year? + last_leap <- s2dv::LeapYear(ori_year + yr_num) if (last_leap) { - total_days <- total_days + sum(leap_month_day[2:12]) + total_days <- total_days + sum(leap_month_day[ori_month:(ori_month + month_left - 1)]) } else { - total_days <- total_days + sum(no_leap_month_day[2:12]) + total_days <- total_days + sum(no_leap_month_day[ori_month:(ori_month + month_left - 1)]) } - } else { # e.g., 2005-04-16 + 11mth = 2006-03-16 - last_leap <- s2dv::LeapYear(ori_year + yr_num + 1) - needed_month <- c(ori_month:12, 1:(ori_month + month_left - 12 - 1)) - if (last_leap) { - total_days <- total_days + sum(leap_month_day[needed_month]) - } else { - total_days <- total_days + sum(no_leap_month_day[needed_month]) + } else { # the last month ends in the next yr + if (ori_month == 2) { # e.g., 2005-02-16 + 11mth = 2006-01-16 + last_leap <- s2dv::LeapYear(ori_year + yr_num) # still consider 2005 + if (last_leap) { + total_days <- total_days + sum(leap_month_day[2:12]) + } else { + total_days <- total_days + sum(no_leap_month_day[2:12]) + } + } else { # e.g., 2005-04-16 + 11mth = 2006-03-16 + last_leap <- s2dv::LeapYear(ori_year + yr_num + 1) + needed_month <- c(ori_month:12, 1:(ori_month + month_left - 12 - 1)) + if (last_leap) { + total_days <- total_days + sum(leap_month_day[needed_month]) + } else { + total_days <- total_days + sum(no_leap_month_day[needed_month]) + } } } } - } - result <- total_days * 24 * 60 * 60 # day to sec - } else if (calendar %in% c('365_day',' 365', 'noleap')) { - yr_num <- floor(result / 12) - month_left <- result - yr_num * 12 - total_days <- 365 * yr_num + sum(no_leap_month_day[ori_month:(month_left - 1)]) - result <- total_days * 24 * 60 * 60 # day to sec + result <- total_days * 24 * 60 * 60 # day to sec + } else if (calendar %in% c('365_day',' 365', 'noleap')) { + yr_num <- floor(result / 12) + month_left <- result - yr_num * 12 + total_days <- 365 * yr_num + sum(no_leap_month_day[ori_month:(month_left - 1)]) + result <- total_days * 24 * 60 * 60 # day to sec - } else if (calendar %in% c('360_day', '360')) { - result <- result * 30 * 24 * 60 * 60 # day to sec + } else if (calendar %in% c('360_day', '360')) { + result <- result * 30 * 24 * 60 * 60 # day to sec - } else { #old code. The calendar is not in any of the above. - result <- result * 30.5 - result <- result * 24 * 60 * 60 # day to sec + } else { #old code. The calendar is not in any of the above. + result <- result * 30.5 + result <- result * 24 * 60 * 60 # day to sec + } } - } - new_array <- PCICt::as.PCICt(result, cal = calendar, origin = parts[2])[] - new_array <- suppressWarnings(PCICt::as.POSIXct.PCICt(new_array, tz = "UTC")) + new_array <- PCICt::as.PCICt(result, cal = calendar, origin = parts[2])[] + new_array <- suppressWarnings(PCICt::as.POSIXct.PCICt(new_array, tz = "UTC")) - #new_array <- seq(as.POSIXct(parts[2]), - # length = max(result, na.rm = TRUE) + 1, - # by = units)[result[] + 1] - dim(new_array) <- dim(result) - attr(new_array, 'variables') <- attr(result, 'variables') - result <- new_array + #new_array <- seq(as.POSIXct(parts[2]), + # length = max(result, na.rm = TRUE) + 1, + # by = units)[result[] + 1] + dim(new_array) <- dim(result) + attr(new_array, 'variables') <- attr(result, 'variables') + result <- new_array + } } - } + } } if (close) { diff --git a/R/Start.R b/R/Start.R index f473dfd..e8ba7d3 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3515,7 +3515,7 @@ Start <- function(..., # dim = indices/selectors, c(attributes(common_vars_to_crop[[common_var_to_crop]]), tmp_attributes[tmp]) - if (common_var_to_crop == 'time') { + if ('time' %in% synonims[[common_var_to_crop]]) { # Convert number back to time common_vars_to_crop[[common_var_to_crop]] <- as.POSIXct(common_vars_to_crop[[common_var_to_crop]], @@ -3848,47 +3848,56 @@ Start <- function(..., # dim = indices/selectors, # unequal inner_dim ('time') length across file_dim ('file_date'). # If merge_across_dims_narm = TRUE, add additional lines to remove these NAs. # TODO: Now it assumes that only one '_across'. Add a for loop for more-than-one case. + if (merge_across_dims) { + # Save dim and attributes for later use + dims_of_merge_dim <- dim(picked_common_vars[[across_inner_dim]]) + tmp_attr <- attr(picked_common_vars[[across_inner_dim]], 'variables') + # Find final_dim_fake for metadata and put it in an array + if (!split_multiselected_dims) { + final_dims_fake_merge_dim <- final_dims_fake[names(final_dims_fake) %in% names(dims_of_merge_dim)] + } else { + final_dims_fake_merge_dim <- final_dims_fake[names(final_dims_fake) %in% names(all_split_dims[[across_inner_dim]])] + } + } else if (split_multiselected_dims) { + tmp_attr <- attr(picked_common_vars[[names(all_split_dims)]], 'variables') + final_dims_fake_merge_dim <- all_split_dims[[1]] + } + if (merge_across_dims & (split_multiselected_dims | merge_across_dims_narm)) { if (!merge_across_dims_narm) { data_array_tmp <- array(bigmemory::as.matrix(data_array), dim = final_dims) + tmp <- match(names(final_dims), names(dims_of_merge_dim)) + if (any(diff(tmp[!is.na(tmp)]) < 0)) { #need to reorder + picked_common_vars[[across_inner_dim]] <- .aperm2(picked_common_vars[[across_inner_dim]], tmp[!is.na(tmp)]) + } + metadata_tmp <- picked_common_vars[[across_inner_dim]] } else { + tmp <- remove_additional_na_from_merge( inner_dims_across_files, final_dims, length_inner_across_dim, data_array, merge_dim_metadata = picked_common_vars[[across_inner_dim]]) data_array_tmp <- tmp$data_array_tmp - # Save dim for later use - dims_of_merge_dim <- dim(picked_common_vars[[across_inner_dim]]) - picked_common_vars[[across_inner_dim]] <- tmp$merge_dim_metadata + metadata_tmp <- tmp$merge_dim_metadata } if (length(data_array_tmp) != prod(final_dims_fake)) { stop(paste0("After reshaping, the data do not fit into the expected output dimension. ", "Check if the reshaping parameters are used correctly.")) } - + #NOTE: When one file contains values for dicrete dimensions, rearrange the # chunks (i.e., work_piece) is necessary. if (split_multiselected_dims) { tmp <- rebuild_array_merge_split( data_array_tmp, indices_chunk, all_split_dims, final_dims_fake, - across_inner_dim, length_inner_across_dim, metadata_tmp = picked_common_vars[[across_inner_dim]]) + across_inner_dim, length_inner_across_dim, metadata_tmp = metadata_tmp) data_array_tmp <- tmp$data_array_tmp - picked_common_vars[[across_inner_dim]] <- tmp$metadata_tmp - } - - data_array <- array(data_array_tmp, dim = final_dims_fake) - - # Find final_dim_fake for metadata and put it in an array - if (!split_multiselected_dims) { - final_dims_fake_merge_dim <- final_dims_fake[names(final_dims_fake) %in% names(dims_of_merge_dim)] - } else { - final_dims_fake_merge_dim <- final_dims_fake[names(final_dims_fake) %in% names(all_split_dims[[across_inner_dim]])] + metadata_tmp <- tmp$metadata_tmp } - tmp_attr <- attr(picked_common_vars[[across_inner_dim]], 'variables') - picked_common_vars[[across_inner_dim]] <- array(picked_common_vars[[across_inner_dim]], dim = final_dims_fake_merge_dim) - attr(picked_common_vars[[across_inner_dim]], 'variables') <- tmp_attr + data_array <- array(data_array_tmp, dim = final_dims_fake) + metadata_tmp <- array(metadata_tmp, dim = final_dims_fake_merge_dim) # If split_multiselected_dims + merge_across_dims, the dimension order may change above. # To get the user-required dim order, we need to reorder the array again. @@ -3897,19 +3906,48 @@ Start <- function(..., # dim = indices/selectors, correct_order <- match(names(final_dims_fake_output), names(final_dims_fake)) data_array <- .aperm2(data_array, correct_order) correct_order_metadata <- match(names(final_dims_fake_output), names(all_split_dims[[across_inner_dim]])) - picked_common_vars[[across_inner_dim]] <- .aperm2(picked_common_vars[[across_inner_dim]], correct_order_metadata[!is.na(correct_order_metadata)]) + metadata_tmp <- .aperm2(metadata_tmp, correct_order_metadata[!is.na(correct_order_metadata)]) } } - - if (across_inner_dim == 'time') { - # Convert numeric back to dates - picked_common_vars[[across_inner_dim]] <- - as.POSIXct(picked_common_vars[[across_inner_dim]], - origin = "1970-01-01", tz = 'UTC') + # Convert numeric back to dates + if ('time' %in% synonims[[across_inner_dim]]) { + metadata_tmp <- as.POSIXct(metadata_tmp, origin = "1970-01-01", tz = 'UTC') } - + + picked_common_vars[[across_inner_dim]] <- metadata_tmp + attr(picked_common_vars[[across_inner_dim]], 'variables') <- tmp_attr + } else { # ! (merge_across_dims + split_multiselected_dims) (old version) data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake) + if (merge_across_dims) { + # merge_across_dims = TRUE but (merge_across_dims_narm = F & split_multiselected_dims = F) + + inner_dim_pos <- which(names(dims_of_merge_dim) == inner_dims_across_files) + file_dim_pos <- which(names(dims_of_merge_dim) == names(inner_dims_across_files)) + if (file_dim_pos < inner_dim_pos) { #need to reorder + tmp <- seq(1, length(dims_of_merge_dim)) + tmp[inner_dim_pos] <- file_dim_pos + tmp[file_dim_pos] <- inner_dim_pos + picked_common_vars[[across_inner_dim]] <- .aperm2(picked_common_vars[[across_inner_dim]], tmp) + } + metadata_tmp <- array(picked_common_vars[[across_inner_dim]], dim = final_dims_fake_merge_dim) + # Convert numeric back to dates + if ('time' %in% synonims[[across_inner_dim]]) { + metadata_tmp <- as.POSIXct(metadata_tmp, origin = "1970-01-01", tz = 'UTC') + } + picked_common_vars[[across_inner_dim]] <- metadata_tmp + attr(picked_common_vars[[across_inner_dim]], 'variables') <- tmp_attr + } + + if (split_multiselected_dims) { + metadata_tmp <- array(picked_common_vars[[names(all_split_dims)]], dim = final_dims_fake_merge_dim) + # Convert numeric back to dates + if ('time' %in% synonims[[names(all_split_dims)]]) { + metadata_tmp <- as.POSIXct(metadata_tmp, origin = "1970-01-01", tz = 'UTC') + } + picked_common_vars[[names(all_split_dims)]] <- metadata_tmp + attr(picked_common_vars[[names(all_split_dims)]], 'variables') <- tmp_attr + } } gc() diff --git a/tests/testthat/test-Start-metadata_reshaping.R b/tests/testthat/test-Start-metadata_reshaping.R index 680f8d6..03674f1 100644 --- a/tests/testthat/test-Start-metadata_reshaping.R +++ b/tests/testthat/test-Start-metadata_reshaping.R @@ -32,8 +32,12 @@ dim(drop(data)), dim(dates) ) expect_equal( -names(attributes(dates)), -c('variables', 'dim', 'class', 'tzone') +length(attributes(dates)), +4 +) +expect_equal( +all(names(attributes(dates)) %in% c('variables', 'dim', 'class', 'tzone')), +TRUE ) expect_equal( class(dates), @@ -91,7 +95,6 @@ as.vector(seq(as.POSIXct('1961-01-01 12:00:00', tz = 'UTC'), as.POSIXct('1961-12 }) - test_that("3. time across fyear, fyear depends on sdate, 1st fyear is empty, 3rd fyear has more indices than 2nd one, 1964 is leap year", { suppressWarnings( @@ -100,16 +103,17 @@ data <- Start(dat = '/esarchive/exp/ecearth/a3w5/original_files/cmorfiles/DCPP/E lat = indices(1), lon = indices(1), sdate = paste0(1960:1961), - time = 700:860, ## initialised in Nov - time_across = 'fyear', + day = 700:860, ## initialised in Nov + day_across = 'fyear', merge_across_dims = TRUE, fyear = 'all', fyear_depends = 'sdate', member = 'r1i4p1f1', synonims = list(lat = c('lat','latitude'), - lon = c('lon','longitude')), + lon = c('lon','longitude'), + day = c('day', 'time')), return_vars = list(lat = NULL, lon = NULL, - time = c('sdate', 'fyear')), + day = c('sdate', 'fyear')), retrieve = TRUE) ) dates <- attr(data,'Variables')$common[['time']] @@ -118,10 +122,13 @@ expect_equal( dim(dates), c(sdate = 2, time = 161) ) - expect_equal( -names(attributes(dates)), -c('variables', 'dim', 'class', 'tzone') +length(attributes(dates)), +4 +) +expect_equal( +all(names(attributes(dates)) %in% c('variables', 'dim', 'class', 'tzone')), +TRUE ) expect_equal( class(dates), @@ -137,3 +144,250 @@ as.vector(seq(as.POSIXct('1963-10-01 12:00:00', tz = 'UTC'), as.POSIXct('1964-03 ) }) + + +test_that("4. merge and split time dim", { +datess <- seq(as.POSIXct('1994-05-01', tz = 'UTC'), as.POSIXct('1994-12-31', tz = 'UTC'), by = 'days') +datess <- c(datess[c(1:31, 32:62, 62:92, 93:123, 124:154, 154:184, 185:215, 215:245)]) +datess <- as.POSIXct(array(datess, dim = c(time = 31, sdate = 8)), + origin = '1970-01-01', tz = 'UTC') +dates_file <- sort(unique(gsub('-', '', sapply(as.character(datess), + substr, 1, 7)))) + +suppressWarnings( + data <- Start(dat = paste0('/esarchive/recon/ecmwf/erainterim/6hourly/', + '$var$/$var$_$file_date$.nc'), + var = 'tas', + file_date = dates_file, + time = values(datess), #[time = 31, sdate = 8] + latitude = indices(1), + longitude = indices(1), + time_var = 'time', + #because time is assigned by 'values', set the tolerance to avoid too distinct match + time_tolerance = as.difftime(1, units = 'hours'), + #time values are across all the files + time_across = 'file_date', + #combine time and file_date dims + merge_across_dims = TRUE, + #exclude the additional NAs generated by merge_across_dims + merge_across_dims_narm = TRUE, + #split time dim, because it is two-dimensional + split_multiselected_dims = TRUE, + return_vars = list(latitude = NULL, + longitude = NULL, + time = 'file_date'), + retrieve = TRUE) +) + +dates <- attr(data,'Variables')$common[['time']] + +expect_equal( +dim(dates), +c(time = 31, sdate = 8) +) +expect_equal( +length(attributes(dates)), +4 +) +expect_equal( +all(names(attributes(dates)) %in% c('variables', 'dim', 'class', 'tzone')), +TRUE +) +expect_equal( +class(dates), +c("POSIXct", "POSIXt") +) +expect_equal( +as.vector(dates), +as.vector(datess) +) + +}) + +test_that("5. test 1 but merge_across_dims_narm = F", { + +suppressWarnings( +data <- Start(dat = '/esarchive/exp/ecearth/a3w5/original_files/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/$member$/day/$var$/gr/v20210910/$var$_day_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc', + var = 'tasmin', + lat = indices(1), + lon = indices(1), + sdate = paste0(1960:1961), + time = 62:426, ## Jan to Dec (initialised in Nov) + time_across = 'fyear', + merge_across_dims = TRUE, + merge_across_dims_narm = F, + fyear = 'all', + fyear_depends = 'sdate', + member = 'r1i4p1f1', + synonims = list(lat = c('lat','latitude'), + lon = c('lon','longitude')), + return_vars = list(lat = NULL, lon = NULL, + time = c('sdate', 'fyear')), + retrieve = TRUE) +) +dates <- attr(data,'Variables')$common[['time']] + +expect_equal( +dim(dates), +c(sdate = 2, time = 608) +) +expect_equal( +dim(drop(data)), +dim(dates) +) +expect_equal( +length(attributes(dates)), +4 +) +expect_equal( +all(names(attributes(dates)) %in% c('variables', 'dim', 'class', 'tzone')), +TRUE +) +expect_equal( +class(dates), +c("POSIXct", "POSIXt") +) +expect_equal( +as.vector(dates[1, ]), +c(as.vector(seq(as.POSIXct('1961-01-01 12:00:00', tz = 'UTC'), as.POSIXct('1961-12-31 12:00:00', tz = 'UTC'), by = 'day')), rep(NA, 243)) +) +expect_equal( +as.vector(dates[2, ]), +c(as.vector(seq(as.POSIXct('1962-01-01 12:00:00', tz = 'UTC'), as.POSIXct('1962-12-31 12:00:00', tz = 'UTC'), by = 'day')), rep(NA, 243)) +) + +}) + +test_that("6. split dim only", { + +datess <- seq(as.POSIXct('1994-07-01', tz = 'UTC'), as.POSIXct('1994-07-14', tz = 'UTC'), by = 'days') +datess <- as.POSIXct(array(datess, dim = c(time = 7, week = 2)), + origin = '1970-01-01', tz = 'UTC') + +data <- Start(dat = paste0('/esarchive/recon/ecmwf/erainterim/6hourly/', + '$var$/$var$_199407.nc'), + var = 'tas', +# file_date = '199407', + time = values(datess), #[time = 7, week = 2] + latitude = indices(1), + longitude = indices(1), + time_var = 'time', + time_tolerance = as.difftime(1, units = 'hours'), +# time_across = 'file_date', +# merge_across_dims = TRUE, + split_multiselected_dims = TRUE, + return_vars = list(latitude = NULL, + longitude = NULL, + time = NULL), #'file_date'), + retrieve = TRUE) + +dates <- attr(data,'Variables')$common[['time']] + + +expect_equal( +dim(dates), +c(time = 7, week = 2) +) +expect_equal( +dim(drop(data)), +dim(dates) +) +expect_equal( +length(attributes(dates)), +4 +) +expect_equal( +all(names(attributes(dates)) %in% c('variables', 'dim', 'class', 'tzone')), +TRUE +) +expect_equal( +class(dates), +c("POSIXct", "POSIXt") +) +expect_equal( +as.vector(dates[, ]), +as.vector(seq(as.POSIXct('1994-07-01', tz = 'UTC'), as.POSIXct('1994-07-14', tz = 'UTC'), by = 'day')) +) + + +}) + +test_that("7. split dim + merge + merge_narm = F", { + +datess <- seq(as.POSIXct('1994-07-01', tz = 'UTC'), as.POSIXct('1994-08-31', tz = 'UTC'), by = 'days') +datess <- as.POSIXct(array(datess, dim = c(time = 31, month = 2)), + origin = '1970-01-01', tz = 'UTC') + +data <- Start(dat = paste0('/esarchive/recon/ecmwf/erainterim/6hourly/', + '$var$/$var$_$file_date$.nc'), + var = 'tas', + file_date = c('199407', '199408'), + time = values(datess), #[time = 31, month = 2] + latitude = indices(1), + longitude = indices(1), + time_var = 'time', + time_tolerance = as.difftime(1, units = 'hours'), + time_across = 'file_date', + merge_across_dims = TRUE, + merge_across_dims_narm = F, + split_multiselected_dims = TRUE, + return_vars = list(latitude = NULL, + longitude = NULL, + time = 'file_date'), + retrieve = TRUE) + +dates <- attr(data,'Variables')$common[['time']] + + +expect_equal( +dim(dates), +c(time = 31, month = 2) +) +expect_equal( +dim(drop(data)), +dim(dates) +) +expect_equal( +length(attributes(dates)), +4 +) +expect_equal( +all(names(attributes(dates)) %in% c('variables', 'dim', 'class', 'tzone')), +TRUE +) +expect_equal( +class(dates), +c("POSIXct", "POSIXt") +) +expect_equal( +as.vector(dates[, ]), +as.vector(seq(as.POSIXct('1994-07-01', tz = 'UTC'), as.POSIXct('1994-08-31', tz = 'UTC'), by = 'day')) +) + +}) + +test_that("8. test 1 but retrieve = F", { + +suppressWarnings( +data <- Start(dat = '/esarchive/exp/ecearth/a3w5/original_files/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/$member$/day/$var$/gr/v20210910/$var$_day_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc', + var = 'tasmin', + lat = indices(1), + lon = indices(1), + sdate = paste0(1960:1961), + time = 62:426, ## Jan to Dec (initialised in Nov) + time_across = 'fyear', + merge_across_dims = TRUE, + fyear = 'all', + fyear_depends = 'sdate', + member = 'r1i4p1f1', + synonims = list(lat = c('lat','latitude'), + lon = c('lon','longitude')), + return_vars = list(lat = NULL, lon = NULL, + time = c('sdate', 'fyear')), + retrieve = FALSE) +) +dates <- attr(data,'Variables')$common[['time']] + +}) + + -- GitLab From 04573b9a03d1bbf9683101e094ff48c6f58070b2 Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 25 Feb 2022 20:27:41 +0100 Subject: [PATCH 10/17] Revise split dim case's condition; fix typo --- R/Start.R | 16 +++++++++------- tests/testthat/test-Start-metadata_reshaping.R | 6 +++--- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/R/Start.R b/R/Start.R index e8ba7d3..145851c 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3938,16 +3938,18 @@ Start <- function(..., # dim = indices/selectors, picked_common_vars[[across_inner_dim]] <- metadata_tmp attr(picked_common_vars[[across_inner_dim]], 'variables') <- tmp_attr } - if (split_multiselected_dims) { - metadata_tmp <- array(picked_common_vars[[names(all_split_dims)]], dim = final_dims_fake_merge_dim) - # Convert numeric back to dates - if ('time' %in% synonims[[names(all_split_dims)]]) { - metadata_tmp <- as.POSIXct(metadata_tmp, origin = "1970-01-01", tz = 'UTC') + if (names(all_split_dims) %in% names(picked_common_vars)) { + metadata_tmp <- array(picked_common_vars[[names(all_split_dims)]], dim = final_dims_fake_merge_dim) + # Convert numeric back to dates + if ('time' %in% synonims[[names(all_split_dims)]]) { + metadata_tmp <- as.POSIXct(metadata_tmp, origin = "1970-01-01", tz = 'UTC') + } + picked_common_vars[[names(all_split_dims)]] <- metadata_tmp + attr(picked_common_vars[[names(all_split_dims)]], 'variables') <- tmp_attr } - picked_common_vars[[names(all_split_dims)]] <- metadata_tmp - attr(picked_common_vars[[names(all_split_dims)]], 'variables') <- tmp_attr } + } gc() diff --git a/tests/testthat/test-Start-metadata_reshaping.R b/tests/testthat/test-Start-metadata_reshaping.R index 03674f1..d6868d6 100644 --- a/tests/testthat/test-Start-metadata_reshaping.R +++ b/tests/testthat/test-Start-metadata_reshaping.R @@ -116,7 +116,7 @@ data <- Start(dat = '/esarchive/exp/ecearth/a3w5/original_files/cmorfiles/DCPP/E day = c('sdate', 'fyear')), retrieve = TRUE) ) -dates <- attr(data,'Variables')$common[['time']] +dates <- attr(data,'Variables')$common[['day']] expect_equal( dim(dates), @@ -263,7 +263,7 @@ test_that("6. split dim only", { datess <- seq(as.POSIXct('1994-07-01', tz = 'UTC'), as.POSIXct('1994-07-14', tz = 'UTC'), by = 'days') datess <- as.POSIXct(array(datess, dim = c(time = 7, week = 2)), origin = '1970-01-01', tz = 'UTC') - +suppressWarnings( data <- Start(dat = paste0('/esarchive/recon/ecmwf/erainterim/6hourly/', '$var$/$var$_199407.nc'), var = 'tas', @@ -280,7 +280,7 @@ data <- Start(dat = paste0('/esarchive/recon/ecmwf/erainterim/6hourly/', longitude = NULL, time = NULL), #'file_date'), retrieve = TRUE) - +) dates <- attr(data,'Variables')$common[['time']] -- GitLab From 4ca9d58675c40e6dae47c9cbe40f926710c7972d Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 28 Feb 2022 09:24:13 +0100 Subject: [PATCH 11/17] Typo fixed --- tests/testthat/test-Start-metadata_reshaping.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-Start-metadata_reshaping.R b/tests/testthat/test-Start-metadata_reshaping.R index d6868d6..a4f7d69 100644 --- a/tests/testthat/test-Start-metadata_reshaping.R +++ b/tests/testthat/test-Start-metadata_reshaping.R @@ -120,7 +120,7 @@ dates <- attr(data,'Variables')$common[['day']] expect_equal( dim(dates), -c(sdate = 2, time = 161) +c(sdate = 2, day = 161) ) expect_equal( length(attributes(dates)), @@ -318,6 +318,7 @@ datess <- seq(as.POSIXct('1994-07-01', tz = 'UTC'), as.POSIXct('1994-08-31', tz datess <- as.POSIXct(array(datess, dim = c(time = 31, month = 2)), origin = '1970-01-01', tz = 'UTC') +suppressWarnings( data <- Start(dat = paste0('/esarchive/recon/ecmwf/erainterim/6hourly/', '$var$/$var$_$file_date$.nc'), var = 'tas', @@ -335,7 +336,7 @@ data <- Start(dat = paste0('/esarchive/recon/ecmwf/erainterim/6hourly/', longitude = NULL, time = 'file_date'), retrieve = TRUE) - +) dates <- attr(data,'Variables')$common[['time']] -- GitLab From c65b9245c1ee3959f5b91f6b2f05d0a9e67bf0b5 Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 28 Feb 2022 09:28:25 +0100 Subject: [PATCH 12/17] Revise condition statement to include implicit inner dim case --- R/NcDataReader.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/NcDataReader.R b/R/NcDataReader.R index 7613172..95fd1f1 100644 --- a/R/NcDataReader.R +++ b/R/NcDataReader.R @@ -184,7 +184,8 @@ NcDataReader <- function(file_path = NULL, file_object = NULL, }) if (length(names(attr(result, 'variables'))) == 1) { - if ('time' %in% synonims[[names(attr(result, 'variables'))]]) { + # The 1st condition is for implicit time dim (if time length = 1, it is allowed to not be defined in Start call. Therefore, it is not in the list of synonims) + if (names(attr(result, 'variables')) == 'time' | 'time' %in% synonims[[names(attr(result, 'variables'))]]) { var_name <- names(attr(result, 'variables')) units <- attr(result, 'variables')[[var_name]][['units']] -- GitLab From eba39b1a4a8b07828795332eaf98db91b28ac4af Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 1 Mar 2022 15:30:43 +0100 Subject: [PATCH 13/17] Add metadata reshaping to retrieve = FALSE --- R/Start.R | 141 ++++++++++--- R/zzz.R | 179 ++++++++++------ .../testthat/test-Start-metadata_reshaping.R | 198 ++++++++++++++++-- 3 files changed, 409 insertions(+), 109 deletions(-) diff --git a/R/Start.R b/R/Start.R index 145851c..59230f6 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3730,6 +3730,21 @@ Start <- function(..., # dim = indices/selectors, all_split_dims[[1]] <- tmp[[2]] } } + + if (merge_across_dims) { + # Save dim and attributes for later use + dims_of_merge_dim <- dim(picked_common_vars[[across_inner_dim]]) + tmp_attr <- attr(picked_common_vars[[across_inner_dim]], 'variables') + # Find final_dim_fake for metadata and put it in an array + if (!split_multiselected_dims) { + final_dims_fake_merge_dim <- final_dims_fake[names(final_dims_fake) %in% names(dims_of_merge_dim)] + } else { + final_dims_fake_merge_dim <- final_dims_fake[names(final_dims_fake) %in% names(all_split_dims[[across_inner_dim]])] + } + } else if (split_multiselected_dims) { + tmp_attr <- attr(picked_common_vars[[names(all_split_dims)]], 'variables') + final_dims_fake_merge_dim <- all_split_dims[[1]] + } # The following several lines will only run if retrieve = TRUE if (retrieve) { @@ -3844,25 +3859,10 @@ Start <- function(..., # dim = indices/selectors, } #print("P") - # NOTE: If merge_across_dims = TRUE, there might be additional NAs due to - # unequal inner_dim ('time') length across file_dim ('file_date'). - # If merge_across_dims_narm = TRUE, add additional lines to remove these NAs. + # If merge_across_dims = TRUE, there might be additional NAs due to unequal + # inner_dim ('time') length across file_dim ('file_date'). + # If merge_across_dims_narm = TRUE, add additional lines to remove these NAs. # TODO: Now it assumes that only one '_across'. Add a for loop for more-than-one case. - if (merge_across_dims) { - # Save dim and attributes for later use - dims_of_merge_dim <- dim(picked_common_vars[[across_inner_dim]]) - tmp_attr <- attr(picked_common_vars[[across_inner_dim]], 'variables') - # Find final_dim_fake for metadata and put it in an array - if (!split_multiselected_dims) { - final_dims_fake_merge_dim <- final_dims_fake[names(final_dims_fake) %in% names(dims_of_merge_dim)] - } else { - final_dims_fake_merge_dim <- final_dims_fake[names(final_dims_fake) %in% names(all_split_dims[[across_inner_dim]])] - } - } else if (split_multiselected_dims) { - tmp_attr <- attr(picked_common_vars[[names(all_split_dims)]], 'variables') - final_dims_fake_merge_dim <- all_split_dims[[1]] - } - if (merge_across_dims & (split_multiselected_dims | merge_across_dims_narm)) { if (!merge_across_dims_narm) { data_array_tmp <- array(bigmemory::as.matrix(data_array), dim = final_dims) @@ -3875,9 +3875,9 @@ Start <- function(..., # dim = indices/selectors, tmp <- remove_additional_na_from_merge( inner_dims_across_files, final_dims, - length_inner_across_dim, data_array, + length_inner_across_dim, data_array = bigmemory::as.matrix(data_array), merge_dim_metadata = picked_common_vars[[across_inner_dim]]) - data_array_tmp <- tmp$data_array_tmp + data_array_tmp <- tmp$data_array metadata_tmp <- tmp$merge_dim_metadata } @@ -3885,15 +3885,18 @@ Start <- function(..., # dim = indices/selectors, stop(paste0("After reshaping, the data do not fit into the expected output dimension. ", "Check if the reshaping parameters are used correctly.")) } + if (length(metadata_tmp) != prod(final_dims_fake_merge_dim)) { + stop(paste0("After reshaping, the metadata do not fit into the expected output dimension. ", "Check if the reshaping parameters are used correctly or contact support.")) + } #NOTE: When one file contains values for dicrete dimensions, rearrange the # chunks (i.e., work_piece) is necessary. if (split_multiselected_dims) { tmp <- rebuild_array_merge_split( - data_array_tmp, indices_chunk, all_split_dims, final_dims_fake, - across_inner_dim, length_inner_across_dim, metadata_tmp = metadata_tmp) - data_array_tmp <- tmp$data_array_tmp - metadata_tmp <- tmp$metadata_tmp + data_array = data_array_tmp, indices_chunk, all_split_dims, final_dims_fake, + across_inner_dim, length_inner_across_dim, metadata = metadata_tmp) + data_array_tmp <- tmp$data_array + metadata_tmp <- tmp$metadata } data_array <- array(data_array_tmp, dim = final_dims_fake) @@ -3999,7 +4002,95 @@ Start <- function(..., # dim = indices/selectors, } } # End if (retrieve) - + else { # if retrieve = FALSE, metadata still needs to reshape + + if (merge_across_dims & (split_multiselected_dims | merge_across_dims_narm)) { + if (!merge_across_dims_narm) { +# data_array_tmp <- array(bigmemory::as.matrix(data_array), dim = final_dims) + tmp <- match(names(final_dims), names(dims_of_merge_dim)) + if (any(diff(tmp[!is.na(tmp)]) < 0)) { #need to reorder + picked_common_vars[[across_inner_dim]] <- .aperm2(picked_common_vars[[across_inner_dim]], tmp[!is.na(tmp)]) + } + metadata_tmp <- picked_common_vars[[across_inner_dim]] + } else { + + tmp <- remove_additional_na_from_merge( + inner_dims_across_files, final_dims, + length_inner_across_dim, data_array = NULL, + merge_dim_metadata = picked_common_vars[[across_inner_dim]]) +# data_array_tmp <- tmp$data_array_tmp + metadata_tmp <- tmp$merge_dim_metadata + } + + if (length(metadata_tmp) != prod(final_dims_fake_merge_dim)) { + stop(paste0("After reshaping, the metadata do not fit into the expected output dimension. ", + "Check if the reshaping parameters are used correctly or contact support.")) + } + + #NOTE: When one file contains values for dicrete dimensions, rearrange the + # chunks (i.e., work_piece) is necessary. + if (split_multiselected_dims) { + tmp <- rebuild_array_merge_split( + data_array = NULL, indices_chunk, all_split_dims, final_dims_fake, + across_inner_dim, length_inner_across_dim, metadata = metadata_tmp) +# data_array_tmp <- tmp$data_array + metadata_tmp <- tmp$metadata + } + +# data_array <- array(data_array_tmp, dim = final_dims_fake) + metadata_tmp <- array(metadata_tmp, dim = final_dims_fake_merge_dim) + + # If split_multiselected_dims + merge_across_dims, the dimension order may change above. + # To get the user-required dim order, we need to reorder the array again. + if (split_multiselected_dims) { + if (inner_dim_pos_in_split_dims != 1) { + correct_order <- match(names(final_dims_fake_output), names(final_dims_fake)) +# data_array <- .aperm2(data_array, correct_order) + correct_order_metadata <- match(names(final_dims_fake_output), names(all_split_dims[[across_inner_dim]])) + metadata_tmp <- .aperm2(metadata_tmp, correct_order_metadata[!is.na(correct_order_metadata)]) + } + } + # Convert numeric back to dates + if ('time' %in% synonims[[across_inner_dim]]) { + metadata_tmp <- as.POSIXct(metadata_tmp, origin = "1970-01-01", tz = 'UTC') + } + picked_common_vars[[across_inner_dim]] <- metadata_tmp + attr(picked_common_vars[[across_inner_dim]], 'variables') <- tmp_attr + } else { # ! (merge_across_dims + split_multiselected_dims) (old version) + if (merge_across_dims) { + # merge_across_dims = TRUE but (merge_across_dims_narm = F & split_multiselected_dims = F) + + inner_dim_pos <- which(names(dims_of_merge_dim) == inner_dims_across_files) + file_dim_pos <- which(names(dims_of_merge_dim) == names(inner_dims_across_files)) + if (file_dim_pos < inner_dim_pos) { #need to reorder + tmp <- seq(1, length(dims_of_merge_dim)) + tmp[inner_dim_pos] <- file_dim_pos + tmp[file_dim_pos] <- inner_dim_pos + picked_common_vars[[across_inner_dim]] <- .aperm2(picked_common_vars[[across_inner_dim]], tmp) + } + metadata_tmp <- array(picked_common_vars[[across_inner_dim]], dim = final_dims_fake_merge_dim) + # Convert numeric back to dates + if ('time' %in% synonims[[across_inner_dim]]) { + metadata_tmp <- as.POSIXct(metadata_tmp, origin = "1970-01-01", tz = 'UTC') + } + picked_common_vars[[across_inner_dim]] <- metadata_tmp + attr(picked_common_vars[[across_inner_dim]], 'variables') <- tmp_attr + } + if (split_multiselected_dims) { + if (names(all_split_dims) %in% names(picked_common_vars)) { + metadata_tmp <- array(picked_common_vars[[names(all_split_dims)]], dim = final_dims_fake_merge_dim) + # Convert numeric back to dates + if ('time' %in% synonims[[names(all_split_dims)]]) { + metadata_tmp <- as.POSIXct(metadata_tmp, origin = "1970-01-01", tz = 'UTC') + } + picked_common_vars[[names(all_split_dims)]] <- metadata_tmp + attr(picked_common_vars[[names(all_split_dims)]], 'variables') <- tmp_attr + } + } + } + + } + # Change final_dims_fake back because retrieve = FALSE will use it for attributes later if (exists("final_dims_fake_output")) { final_dims_fake <- final_dims_fake_output diff --git a/R/zzz.R b/R/zzz.R index af601f7..b1694e1 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1029,7 +1029,10 @@ retrieve_progress_message <- function(work_pieces, num_procs, silent) { # If merge_across_dims = TRUE and merge_across_dims_narm = TRUE, remove the additional NAs # due to unequal inner_dim ('time') length across file_dim ('sdate'). remove_additional_na_from_merge <- function(inner_dims_across_files, final_dims, - length_inner_across_dim, data_array, merge_dim_metadata) { + length_inner_across_dim, data_array = NULL, merge_dim_metadata = NULL) { + # data_array is a vector from bigmemory::as.matrix + # merge_dim_metadata is an array + across_file_dim <- names(inner_dims_across_files) #TODO: more than one? across_inner_dim <- inner_dims_across_files[[1]] #TODO: more than one? # Get the length of these two dimensions in final_dims @@ -1043,15 +1046,19 @@ remove_additional_na_from_merge <- function(inner_dims_across_files, final_dims, for (i in 1:length_file_across_store_dims) { #1:4 logi_array[1:length_inner_across_dim[[i]], i] <- TRUE } - - # First, get the data array with final_dims dimension - data_array_final_dims <- array(bigmemory::as.matrix(data_array), dim = final_dims) - + + if (!is.null(data_array)) { + # First, turn the data vector into array with final_dims + data_array_final_dims <- array(data_array, dim = final_dims) + } + # Change the NA derived from additional spaces to -9999, then remove these -9999 func_remove_blank <- function(data_array, logi_array) { # dim(data_array) = [time, file_date] # dim(logi_array) = [time, file_date] - # Change the blank spaces from NA to -9999 + # data_array can be data or metadata; if data, change the blank spaces from + # NA to -9999; if metadata (supposed to be 'time'), change the corresponding + # spaces to -12^10. if (any(class(data_array) %in% c("POSIXct", "POSIXt"))) { # change to numeric first data_array <- array(as.vector(data_array), dim = dim(data_array)) @@ -1061,40 +1068,48 @@ remove_additional_na_from_merge <- function(inner_dims_across_files, final_dims, } return(data_array) } - data_array_final_dims <- multiApply::Apply(data_array_final_dims, - target_dims = c(across_inner_dim, across_file_dim), #c('time', 'file_date') - output_dims = c(across_inner_dim, across_file_dim), - fun = func_remove_blank, - logi_array = logi_array)$output1 - - tmp_attr <- attributes(merge_dim_metadata)$variables - merge_dim_metadata <- multiApply::Apply(merge_dim_metadata, - target_dims = c(across_inner_dim, across_file_dim), - output_dims = c(across_inner_dim, across_file_dim), - fun = func_remove_blank, - logi_array = logi_array)$output1 - - ## reorder back to the correct dim - tmp <- match(names(final_dims), names(dim(data_array_final_dims))) - data_array_final_dims <- .aperm2(data_array_final_dims, tmp) - data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector - - # Reorder metadata dim as final dim - tmp <- match(names(final_dims), names(dim(merge_dim_metadata))) - merge_dim_metadata <- aperm(merge_dim_metadata, tmp[!is.na(tmp)]) - merge_dim_metadata <- merge_dim_metadata[merge_dim_metadata != -12^10] - attr(merge_dim_metadata, 'variables') <- tmp_attr + + if (!is.null(data_array)) { + data_array_final_dims <- multiApply::Apply(data_array_final_dims, + target_dims = c(across_inner_dim, across_file_dim), #c('time', 'file_date') + output_dims = c(across_inner_dim, across_file_dim), + fun = func_remove_blank, + logi_array = logi_array)$output1 + } + if (!is.null(merge_dim_metadata)) { + tmp_attr <- attributes(merge_dim_metadata)$variables + merge_dim_metadata <- multiApply::Apply(merge_dim_metadata, + target_dims = c(across_inner_dim, across_file_dim), + output_dims = c(across_inner_dim, across_file_dim), + fun = func_remove_blank, + logi_array = logi_array)$output1 + } + + if (!is.null(data_array)) { + ## reorder back to the correct dim + tmp <- match(names(final_dims), names(dim(data_array_final_dims))) + data_array_final_dims <- .aperm2(data_array_final_dims, tmp) + data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector + } else { + data_array_tmp <- NULL + } + if (!is.null(merge_dim_metadata)) { + # Reorder metadata dim as final dim + tmp <- match(names(final_dims), names(dim(merge_dim_metadata))) + merge_dim_metadata <- aperm(merge_dim_metadata, tmp[!is.na(tmp)]) + merge_dim_metadata <- merge_dim_metadata[merge_dim_metadata != -12^10] + attr(merge_dim_metadata, 'variables') <- tmp_attr + } #NOTE: both outputs are vectors. If 'merge_dim_metadata' is actually time, it is just numeric here. - return(list(data_array_tmp = data_array_tmp, merge_dim_metadata = merge_dim_metadata)) + return(list(data_array = data_array_tmp, merge_dim_metadata = merge_dim_metadata)) } - # When merge_across_dims = TRUE and split_multiselected_dims = TRUE, rearrange the chunks # (i.e., work_piece) is necessary if one file contains values for discrete dimensions -rebuild_array_merge_split <- function(data_array_tmp, indices_chunk, all_split_dims, - final_dims_fake, across_inner_dim, length_inner_across_dim, metadata_tmp) { +rebuild_array_merge_split <- function(data_array = NULL, indices_chunk, all_split_dims, + final_dims_fake, across_inner_dim, length_inner_across_dim, metadata = NULL) { # generate the correct order list from indices_chunk final_order_list <- list() i <- 1 @@ -1125,21 +1140,32 @@ rebuild_array_merge_split <- function(data_array_tmp, indices_chunk, all_split_d if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) { new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)]) } - data_array_no_split <- array(data_array_tmp, dim = new_dims) - metadata_no_split <- array(metadata_tmp, dim = new_dims) - # seperate 'time' dim into each work_piece length - data_array_seperate <- list() - metadata_seperate <- list() + if (!is.null(data_array)) { + data_array_no_split <- array(data_array, dim = new_dims) + # seperate 'time' dim into each work_piece length + data_array_seperate <- list() + array_piece <- list() + } + if (!is.null(metadata)) { + metadata_no_split <- array(metadata, dim = new_dims) + metadata_seperate <- list() + metadata_piece <- list() + } + tmp <- cumsum(unlist(length_inner_across_dim)) tmp <- c(0, tmp) for (i in 1:length(length_inner_across_dim)) { - data_array_seperate[[i]] <- ClimProjDiags::Subset(data_array_no_split, + if (!is.null(data_array)) { + data_array_seperate[[i]] <- ClimProjDiags::Subset(data_array_no_split, + across_inner_dim, + (tmp[i] + 1):tmp[i + 1]) + } + if (!is.null(metadata)) { + metadata_seperate[[i]] <- ClimProjDiags::Subset(metadata_no_split, across_inner_dim, (tmp[i] + 1):tmp[i + 1]) - metadata_seperate[[i]] <- ClimProjDiags::Subset(metadata_no_split, - across_inner_dim, - (tmp[i] + 1):tmp[i + 1]) + } } # re-build the array: chunk which_chunk <- as.numeric(names(final_order_list)) @@ -1147,34 +1173,65 @@ rebuild_array_merge_split <- function(data_array_tmp, indices_chunk, all_split_d which_chunk <- sapply(lapply(which_chunk, '==', sort_which_chunk), which) how_many_indices <- unlist(final_order_list) - array_piece <- list() - metadata_piece <- list() - ind_in_array_seperate <- as.list(rep(1, length(data_array_seperate))) + + if (!is.null(data_array)) { + ind_in_array_seperate <- as.list(rep(1, length(data_array_seperate))) + } else if (!is.null(metadata)) { + ind_in_array_seperate <- as.list(rep(1, length(metadata_seperate))) + } + for (i in 1:length(final_order_list)) { - array_piece[[i]] <- ClimProjDiags::Subset( - data_array_seperate[[which_chunk[i]]], across_inner_dim, - ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1)) - metadata_piece[[i]] <- ClimProjDiags::Subset( - metadata_seperate[[which_chunk[i]]], across_inner_dim, - ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1)) + if (!is.null(data_array)) { + array_piece[[i]] <- ClimProjDiags::Subset( + data_array_seperate[[which_chunk[i]]], across_inner_dim, + ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1)) + } + if (!is.null(metadata)) { + metadata_piece[[i]] <- ClimProjDiags::Subset( + metadata_seperate[[which_chunk[i]]], across_inner_dim, + ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1)) + } ind_in_array_seperate[[which_chunk[i]]] <- ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] } # re-build the array: paste - data_array_tmp <- array_piece[[1]] - metadata_tmp <- metadata_piece[[1]] - along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim) - if (length(array_piece) > 1) { - for (i in 2:length(array_piece)) { - data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]], - along = along_pos) - metadata_tmp <- abind::abind(metadata_tmp, metadata_piece[[i]], + if (!is.null(data_array)) { + data_array_tmp <- array_piece[[1]] + } else { + data_array_tmp <- NULL + } + if (!is.null(metadata)) { + metadata_tmp <- metadata_piece[[1]] + } else { + metadata_tmp <- NULL + } + + if (!is.null(data_array)) { + along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim) + length_piece <- length(array_piece) + } else if (!is.null(metadata)) { + along_pos <- which(names(dim(metadata_tmp)) == across_inner_dim) + length_piece <- length(metadata_piece) + } + + if (length_piece > 1) { + for (i in 2:length_piece) { + if (!is.null(data_array)) { + data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]], + along = along_pos) + } + if (!is.null(metadata)) { + metadata_tmp <- abind::abind(metadata_tmp, metadata_piece[[i]], along = along_pos) + } } } - } + } else { + data_array_tmp <- data_array + metadata_tmp <- metadata + } - return(list(data_array_tmp = data_array_tmp, metadata_tmp = metadata_tmp)) + return(list(data_array = data_array_tmp, metadata = metadata_tmp)) } diff --git a/tests/testthat/test-Start-metadata_reshaping.R b/tests/testthat/test-Start-metadata_reshaping.R index a4f7d69..91a793d 100644 --- a/tests/testthat/test-Start-metadata_reshaping.R +++ b/tests/testthat/test-Start-metadata_reshaping.R @@ -52,6 +52,31 @@ as.vector(dates[2, ]), as.vector(seq(as.POSIXct('1962-01-01 12:00:00', tz = 'UTC'), as.POSIXct('1962-12-31 12:00:00', tz = 'UTC'), by = 'day')) ) +# retrieve = FALSE +suppressWarnings( +dataF <- Start(dat = '/esarchive/exp/ecearth/a3w5/original_files/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/$member$/day/$var$/gr/v20210910/$var$_day_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc', + var = 'tasmin', + lat = indices(1), + lon = indices(1), + sdate = paste0(1960:1961), + time = 62:426, ## Jan to Dec (initialised in Nov) + time_across = 'fyear', + merge_across_dims = TRUE, + fyear = 'all', + fyear_depends = 'sdate', + member = 'r1i4p1f1', + synonims = list(lat = c('lat','latitude'), + lon = c('lon','longitude')), + return_vars = list(lat = NULL, lon = NULL, + time = c('sdate', 'fyear')), + retrieve = FALSE) +) +datesF <- attr(dataF,'Variables')$common[['time']] +expect_equal( +datesF, +dates +) + }) @@ -91,7 +116,30 @@ as.vector(dates), as.vector(seq(as.POSIXct('1961-01-01 12:00:00', tz = 'UTC'), as.POSIXct('1961-12-31 12:00:00', tz = 'UTC'), by = 'day')) ) - +#retrieve = FALSE +suppressWarnings( +dataF <- Start(dat = '/esarchive/exp/ecearth/a3w5/original_files/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/$member$/day/$var$/gr/v20210910/$var$_day_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc', + var = 'tasmin', + lat = indices(1), + lon = indices(1), + sdate = paste0(1960), + time = 62:426, ## Jan to Dec (initialised in Nov) + time_across = 'fyear', + merge_across_dims = TRUE, + fyear = 'all', +# fyear_depends = 'sdate', + member = 'r1i4p1f1', + synonims = list(lat = c('lat','latitude'), + lon = c('lon','longitude')), + return_vars = list(lat = NULL, lon = NULL, + time = c('fyear')), + retrieve = FALSE) +) +datesF <- attr(dataF,'Variables')$common[['time']] +expect_equal( +datesF, +dates +) }) @@ -143,6 +191,32 @@ as.vector(dates[2, ]), as.vector(seq(as.POSIXct('1963-10-01 12:00:00', tz = 'UTC'), as.POSIXct('1964-03-09 12:00:00', tz = 'UTC'), by = 'day')) ) + +suppressWarnings( +dataF <- Start(dat = '/esarchive/exp/ecearth/a3w5/original_files/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/$member$/day/$var$/gr/v20210910/$var$_day_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc', + var = 'tasmin', + lat = indices(1), + lon = indices(1), + sdate = paste0(1960:1961), + day = 700:860, ## initialised in Nov + day_across = 'fyear', + merge_across_dims = TRUE, + fyear = 'all', + fyear_depends = 'sdate', + member = 'r1i4p1f1', + synonims = list(lat = c('lat','latitude'), + lon = c('lon','longitude'), + day = c('day', 'time')), + return_vars = list(lat = NULL, lon = NULL, + day = c('sdate', 'fyear')), + retrieve = FALSE) +) +datesF <- attr(dataF,'Variables')$common[['day']] +expect_equal( +datesF, +dates +) + }) @@ -202,6 +276,36 @@ as.vector(dates), as.vector(datess) ) +suppressWarnings( + dataF <- Start(dat = paste0('/esarchive/recon/ecmwf/erainterim/6hourly/', + '$var$/$var$_$file_date$.nc'), + var = 'tas', + file_date = dates_file, + time = values(datess), #[time = 31, sdate = 8] + latitude = indices(1), + longitude = indices(1), + time_var = 'time', + #because time is assigned by 'values', set the tolerance to avoid too distinct match + time_tolerance = as.difftime(1, units = 'hours'), + #time values are across all the files + time_across = 'file_date', + #combine time and file_date dims + merge_across_dims = TRUE, + #exclude the additional NAs generated by merge_across_dims + merge_across_dims_narm = TRUE, + #split time dim, because it is two-dimensional + split_multiselected_dims = TRUE, + return_vars = list(latitude = NULL, + longitude = NULL, + time = 'file_date'), + retrieve = FALSE) +) +datesF <- attr(dataF,'Variables')$common[['time']] +expect_equal( +datesF, +dates +) + }) test_that("5. test 1 but merge_across_dims_narm = F", { @@ -256,6 +360,31 @@ as.vector(dates[2, ]), c(as.vector(seq(as.POSIXct('1962-01-01 12:00:00', tz = 'UTC'), as.POSIXct('1962-12-31 12:00:00', tz = 'UTC'), by = 'day')), rep(NA, 243)) ) +suppressWarnings( +dataF <- Start(dat = '/esarchive/exp/ecearth/a3w5/original_files/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/$member$/day/$var$/gr/v20210910/$var$_day_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc', + var = 'tasmin', + lat = indices(1), + lon = indices(1), + sdate = paste0(1960:1961), + time = 62:426, ## Jan to Dec (initialised in Nov) + time_across = 'fyear', + merge_across_dims = TRUE, + merge_across_dims_narm = F, + fyear = 'all', + fyear_depends = 'sdate', + member = 'r1i4p1f1', + synonims = list(lat = c('lat','latitude'), + lon = c('lon','longitude')), + return_vars = list(lat = NULL, lon = NULL, + time = c('sdate', 'fyear')), + retrieve = FALSE) +) +datesF <- attr(dataF,'Variables')$common[['time']] +expect_equal( +datesF, +dates +) + }) test_that("6. split dim only", { @@ -309,6 +438,29 @@ as.vector(dates[, ]), as.vector(seq(as.POSIXct('1994-07-01', tz = 'UTC'), as.POSIXct('1994-07-14', tz = 'UTC'), by = 'day')) ) +suppressWarnings( +dataF <- Start(dat = paste0('/esarchive/recon/ecmwf/erainterim/6hourly/', + '$var$/$var$_199407.nc'), + var = 'tas', +# file_date = '199407', + time = values(datess), #[time = 7, week = 2] + latitude = indices(1), + longitude = indices(1), + time_var = 'time', + time_tolerance = as.difftime(1, units = 'hours'), +# time_across = 'file_date', +# merge_across_dims = TRUE, + split_multiselected_dims = TRUE, + return_vars = list(latitude = NULL, + longitude = NULL, + time = NULL), #'file_date'), + retrieve = FALSE) +) +datesF <- attr(dataF,'Variables')$common[['time']] +expect_equal( +datesF, +dates +) }) @@ -365,30 +517,30 @@ as.vector(dates[, ]), as.vector(seq(as.POSIXct('1994-07-01', tz = 'UTC'), as.POSIXct('1994-08-31', tz = 'UTC'), by = 'day')) ) -}) - -test_that("8. test 1 but retrieve = F", { - suppressWarnings( -data <- Start(dat = '/esarchive/exp/ecearth/a3w5/original_files/cmorfiles/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/$member$/day/$var$/gr/v20210910/$var$_day_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc', - var = 'tasmin', - lat = indices(1), - lon = indices(1), - sdate = paste0(1960:1961), - time = 62:426, ## Jan to Dec (initialised in Nov) - time_across = 'fyear', - merge_across_dims = TRUE, - fyear = 'all', - fyear_depends = 'sdate', - member = 'r1i4p1f1', - synonims = list(lat = c('lat','latitude'), - lon = c('lon','longitude')), - return_vars = list(lat = NULL, lon = NULL, - time = c('sdate', 'fyear')), - retrieve = FALSE) +dataF <- Start(dat = paste0('/esarchive/recon/ecmwf/erainterim/6hourly/', + '$var$/$var$_$file_date$.nc'), + var = 'tas', + file_date = c('199407', '199408'), + time = values(datess), #[time = 31, month = 2] + latitude = indices(1), + longitude = indices(1), + time_var = 'time', + time_tolerance = as.difftime(1, units = 'hours'), + time_across = 'file_date', + merge_across_dims = TRUE, + merge_across_dims_narm = F, + split_multiselected_dims = TRUE, + return_vars = list(latitude = NULL, + longitude = NULL, + time = 'file_date'), + retrieve = FALSE) +) +datesF <- attr(dataF,'Variables')$common[['time']] +expect_equal( +datesF, +dates ) -dates <- attr(data,'Variables')$common[['time']] }) - -- GitLab From f70070b25e82f8826eb36a0b1739d21808e4d076 Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 1 Mar 2022 16:01:55 +0100 Subject: [PATCH 14/17] Improve warning message --- R/zzz.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/zzz.R b/R/zzz.R index b1694e1..c2362b6 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -446,10 +446,10 @@ correct_return_vars <- function(inner_dim, inner_dims_across_files, found_patter corrected_value <- file_dim_name } } - .warning(paste0("Found ", inner_dim, " dependency on file diemnsion '", corrected_value, - "', but '", inner_dim, "' is not in return_vars list or is NULL. ", - "To provide the correct metadata, the value of ", inner_dim, - " in 'return_vars' is specified as '", corrected_value, "'.")) + .warning(paste0("Found '", inner_dim, "' dependency on file dimension '", corrected_value, + "', but '", inner_dim, "' is not in return_vars list or does not include '", corrected_value, + "'. To provide the correct metadata, '", corrected_value, "' is included under '", inner_dim, + "' in 'return_vars.")) return(corrected_value) } -- GitLab From 5fb304b0a4f02e5280e0588d12256c31de34c0cd Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 1 Mar 2022 18:46:46 +0100 Subject: [PATCH 15/17] refine the code --- R/Start.R | 29 +++++++++++++---------------- R/zzz.R | 7 +++---- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/R/Start.R b/R/Start.R index 59230f6..6fc05ea 100644 --- a/R/Start.R +++ b/R/Start.R @@ -3476,6 +3476,7 @@ Start <- function(..., # dim = indices/selectors, } } else { if (!is.null(file_dim)) { #merge_across_dims, file_dims is the depended file dim + if (file_dim %in% names(dim(common_vars_to_crop[[common_var_to_crop]]))) { tmp <- common_vars_to_crop[[common_var_to_crop]] tmp_attributes <- attributes(common_vars_to_crop[[common_var_to_crop]]) dim_extra_ind <- which(!names(dim(tmp)) %in% c(file_dim, inner_dim)) @@ -3521,6 +3522,7 @@ Start <- function(..., # dim = indices/selectors, as.POSIXct(common_vars_to_crop[[common_var_to_crop]], origin = "1970-01-01", tz = 'UTC') } + } } else { # old code common_vars_to_crop[[common_var_to_crop]] <- Subset(common_vars_to_crop[[common_var_to_crop]], inner_dim, crop_indices) @@ -3872,11 +3874,11 @@ Start <- function(..., # dim = indices/selectors, } metadata_tmp <- picked_common_vars[[across_inner_dim]] } else { - tmp <- remove_additional_na_from_merge( + data_array = bigmemory::as.matrix(data_array), + merge_dim_metadata = picked_common_vars[[across_inner_dim]], inner_dims_across_files, final_dims, - length_inner_across_dim, data_array = bigmemory::as.matrix(data_array), - merge_dim_metadata = picked_common_vars[[across_inner_dim]]) + length_inner_across_dim) data_array_tmp <- tmp$data_array metadata_tmp <- tmp$merge_dim_metadata } @@ -3886,15 +3888,16 @@ Start <- function(..., # dim = indices/selectors, "Check if the reshaping parameters are used correctly.")) } if (length(metadata_tmp) != prod(final_dims_fake_merge_dim)) { - stop(paste0("After reshaping, the metadata do not fit into the expected output dimension. ", "Check if the reshaping parameters are used correctly or contact support.")) + stop(paste0("After reshaping, the metadata do not fit into the expected output dimension. ", + "Check if the reshaping parameters are used correctly or contact support.")) } #NOTE: When one file contains values for dicrete dimensions, rearrange the # chunks (i.e., work_piece) is necessary. if (split_multiselected_dims) { tmp <- rebuild_array_merge_split( - data_array = data_array_tmp, indices_chunk, all_split_dims, final_dims_fake, - across_inner_dim, length_inner_across_dim, metadata = metadata_tmp) + data_array = data_array_tmp, metadata = metadata_tmp, indices_chunk, + all_split_dims, final_dims_fake, across_inner_dim, length_inner_across_dim) data_array_tmp <- tmp$data_array metadata_tmp <- tmp$metadata } @@ -4006,19 +4009,17 @@ Start <- function(..., # dim = indices/selectors, if (merge_across_dims & (split_multiselected_dims | merge_across_dims_narm)) { if (!merge_across_dims_narm) { -# data_array_tmp <- array(bigmemory::as.matrix(data_array), dim = final_dims) tmp <- match(names(final_dims), names(dims_of_merge_dim)) if (any(diff(tmp[!is.na(tmp)]) < 0)) { #need to reorder picked_common_vars[[across_inner_dim]] <- .aperm2(picked_common_vars[[across_inner_dim]], tmp[!is.na(tmp)]) } metadata_tmp <- picked_common_vars[[across_inner_dim]] } else { - tmp <- remove_additional_na_from_merge( + data_array = NULL, + merge_dim_metadata = picked_common_vars[[across_inner_dim]], inner_dims_across_files, final_dims, - length_inner_across_dim, data_array = NULL, - merge_dim_metadata = picked_common_vars[[across_inner_dim]]) -# data_array_tmp <- tmp$data_array_tmp + length_inner_across_dim) metadata_tmp <- tmp$merge_dim_metadata } @@ -4031,13 +4032,9 @@ Start <- function(..., # dim = indices/selectors, # chunks (i.e., work_piece) is necessary. if (split_multiselected_dims) { tmp <- rebuild_array_merge_split( - data_array = NULL, indices_chunk, all_split_dims, final_dims_fake, - across_inner_dim, length_inner_across_dim, metadata = metadata_tmp) -# data_array_tmp <- tmp$data_array + data_array = NULL, metadata = metadata_tmp, indices_chunk, all_split_dims, final_dims_fake, across_inner_dim, length_inner_across_dim) metadata_tmp <- tmp$metadata } - -# data_array <- array(data_array_tmp, dim = final_dims_fake) metadata_tmp <- array(metadata_tmp, dim = final_dims_fake_merge_dim) # If split_multiselected_dims + merge_across_dims, the dimension order may change above. diff --git a/R/zzz.R b/R/zzz.R index c2362b6..0067d6d 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1028,8 +1028,8 @@ retrieve_progress_message <- function(work_pieces, num_procs, silent) { # If merge_across_dims = TRUE and merge_across_dims_narm = TRUE, remove the additional NAs # due to unequal inner_dim ('time') length across file_dim ('sdate'). -remove_additional_na_from_merge <- function(inner_dims_across_files, final_dims, - length_inner_across_dim, data_array = NULL, merge_dim_metadata = NULL) { +remove_additional_na_from_merge <- function(data_array = NULL, merge_dim_metadata = NULL, + inner_dims_across_files, final_dims, length_inner_across_dim) { # data_array is a vector from bigmemory::as.matrix # merge_dim_metadata is an array @@ -1108,8 +1108,7 @@ remove_additional_na_from_merge <- function(inner_dims_across_files, final_dims, # When merge_across_dims = TRUE and split_multiselected_dims = TRUE, rearrange the chunks # (i.e., work_piece) is necessary if one file contains values for discrete dimensions -rebuild_array_merge_split <- function(data_array = NULL, indices_chunk, all_split_dims, - final_dims_fake, across_inner_dim, length_inner_across_dim, metadata = NULL) { +rebuild_array_merge_split <- function(data_array = NULL, metadata = NULL, indices_chunk, all_split_dims, final_dims_fake, across_inner_dim, length_inner_across_dim) { # generate the correct order list from indices_chunk final_order_list <- list() i <- 1 -- GitLab From d38902fefc32cecd22a7cf5695551dc7cfbd6220 Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 1 Mar 2022 18:48:41 +0100 Subject: [PATCH 16/17] Change indices to value for file dim --- tests/testthat/test-Start-two_dats.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-Start-two_dats.R b/tests/testthat/test-Start-two_dats.R index 4fa8642..ff83441 100644 --- a/tests/testthat/test-Start-two_dats.R +++ b/tests/testthat/test-Start-two_dats.R @@ -19,7 +19,7 @@ data <- Start(dataset = list(list(path = path_tas), list(path = path_tos)), lon = values(list(150, 170)), lon_reorder = CircularSort(0, 360), fyear = 'all', - member = indices(1), + member = 'r10i1p1f1', #indices(1), fyear_depends = 'sdate', time_across = 'fyear', merge_across_dims = TRUE, -- GitLab From 3d1d557069c41971882c4764e5886f9c346bee17 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 2 Mar 2022 11:52:01 +0100 Subject: [PATCH 17/17] Correct test for new dev --- tests/testthat/test-Start-calendar.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test-Start-calendar.R b/tests/testthat/test-Start-calendar.R index 4f58a87..10cf7bf 100644 --- a/tests/testthat/test-Start-calendar.R +++ b/tests/testthat/test-Start-calendar.R @@ -212,9 +212,13 @@ suppressWarnings( ) expect_equal( - attr(obs, 'Variables')$common$time[1, 1], + attr(obs, 'Variables')$common$time[1], as.POSIXct('2005-05-16 12:00:00', tz = 'UTC') ) +expect_equal( + dim(attr(obs, 'Variables')$common$time), + c(time = 1) +) }) -- GitLab