From 74b4d95bd836e2f3f5cf2e9a06ccf098c0600224 Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 25 Jan 2022 14:15:02 +0100 Subject: [PATCH 1/4] Start() can correctly merge files along the dimension that has different length each file, without additional NAs. --- R/Start.R | 102 ++++++++++++--- R/zzz.R | 3 +- .../testthat/test-Start-largest_dims_length.R | 122 ++++++++++++++++++ 3 files changed, 206 insertions(+), 21 deletions(-) diff --git a/R/Start.R b/R/Start.R index d977e71..fefd6ae 100644 --- a/R/Start.R +++ b/R/Start.R @@ -1929,8 +1929,19 @@ Start <- function(..., # dim = indices/selectors, if (!is.null(indices)) { #////////////////////////////////////////////////// # Find data_dims - ## old code. use the 1st valid file to determine the dims - if (!largest_dims_length | is.numeric(largest_dims_length)) { + ## If largest_dims_length is a number & !merge_across_dims, + ## directly assign this dim as the number; + ## If largest_dims_length is a number & this dim is across files, find the dim length of each file + find_largest_dims_length_by_files <- FALSE + if (is.numeric(largest_dims_length)) { + if (names(largest_dims_length) %in% inner_dims_across_files) { + find_largest_dims_length_by_files <- TRUE + } + } else if (largest_dims_length) { + find_largest_dims_length_by_files <- TRUE + } + + if (!find_largest_dims_length_by_files) { # old code file_path <- do.call("[", c(list(array_of_files_to_load), as.list(indices_of_first_files_with_data[[i]]))) # The following 5 lines should go several lines below, but were moved # here for better performance. @@ -1958,12 +1969,16 @@ Start <- function(..., # dim = indices/selectors, } } - ## largest_dims_length = TRUE } else { - data_dims <- find_largest_dims_length( - selectors_total_list[[i]], array_of_files_to_load, - selector_indices_save[[i]], dat[[i]], expected_inner_dims[[i]], - synonims, file_dim_reader) + ## largest_dims_length = TRUE, or is a number & merge_across_dims is across this dim + tmp <- find_largest_dims_length( + selectors_total_list[[i]], array_of_files_to_load, + selector_indices_save[[i]], dat[[i]], expected_inner_dims[[i]], + synonims, file_dim_reader) + data_dims <- tmp$largest_data_dims + # 'data_dims_each_file' is used when merge_across_dims = TRUE & + # the files have different length of inner dim. + data_dims_each_file <- tmp$data_dims_all_files # file_dim_reader returns dimension names as found in the file. # Need to translate accoridng to synonims: @@ -2380,6 +2395,19 @@ Start <- function(..., # dim = indices/selectors, # populated instead. #//////////////////////////////////////////////////////////////////// + # If the inner dim lengths differ among files, + # need to know each length to create the indices for each file later. + # Record 'iinner_dim_lengths' here for later usage. + inner_dim_lengths <- NULL + if (largest_dims_length & !is.null(file_dim)) { + inner_dim_lengths <- tryCatch({ + sapply(data_dims_each_file, '[[', inner_dim) + }, error = function(x) { + sapply(data_dims_each_file, '[[', + synonims[[inner_dim]][which(synonims[[inner_dim]] != inner_dim)]) + }) + } + fri <- first_round_indices <- NULL sri <- second_round_indices <- NULL # This variable will keep the indices needed to crop the transformed @@ -2397,7 +2425,13 @@ Start <- function(..., # dim = indices/selectors, sri <- vector('list', length = chunk_amount) dim(sri) <- c(chunk_amount) if (selector_array == 'all') { - fri[] <- replicate(chunk_amount, list(1:(data_dims[inner_dim]))) + if (is.null(inner_dim_lengths) | length(unique(inner_dim_lengths)) <= 1) { #old code + fri[] <- replicate(chunk_amount, list(1:(data_dims[inner_dim]))) + } else { # files have different inner dim length + for (i_chunk in 1:length(fri)) { + fri[[i_chunk]] <- 1:inner_dim_lengths[i_chunk] + } + } taken_chunks <- rep(TRUE, chunk_amount) #sri <- NULL } else if (selector_array == 'first') { @@ -3139,11 +3173,17 @@ Start <- function(..., # dim = indices/selectors, if (inner_dim %in% names(dim(sub_array_of_selectors))) { if (is.null(var_with_selectors_name)) { + if (!largest_dims_length | (largest_dims_length & length(unique(inner_dim_lengths)) <= 1)) { #old code + maximal_indice <- data_dims[inner_dim] * chunk_amount + } else { # files have different length of inner dim + maximal_indice <- sum(inner_dim_lengths) + } + if (any(na.omit(unlist(sub_array_of_selectors)) < 1) || - any(na.omit(unlist(sub_array_of_selectors)) > data_dims[inner_dim] * chunk_amount)) { - stop("Provided indices out of range for dimension '", inner_dim, "' ", - "for dataset '", dat[[i]][['name']], "' (accepted range: 1 to ", - data_dims[inner_dim] * chunk_amount, ").") + any(na.omit(unlist(sub_array_of_selectors)) > maximal_indice)) { + stop("Provided indices out of range for dimension '", inner_dim, "' ", + "for dataset '", dat[[i]][['name']], "' (accepted range: 1 to ", + maximal_indice, ").") } } else { if (inner_dim %in% names(dim(sub_array_of_values))) { @@ -3182,13 +3222,33 @@ Start <- function(..., # dim = indices/selectors, sub_array_is_list <- TRUE sub_array_of_indices <- unlist(sub_array_of_indices) } - if (is.null(var_with_selectors_name)) { - indices_chunk <- floor((sub_array_of_indices - 1) / data_dims[inner_dim]) + 1 - transformed_indices <- ((sub_array_of_indices - 1) %% data_dims[inner_dim]) + 1 - } else { - indices_chunk <- floor((sub_array_of_indices - 1) / var_full_dims[inner_dim]) + 1 - transformed_indices <- ((sub_array_of_indices - 1) %% var_full_dims[inner_dim]) + 1 + + # "indices_chunk" refers to in which file the + # sub_array_of_indices is; "transformed_indices" + # refers to the indices of sub_array_of_indices in each file. + if (!largest_dims_length | + (largest_dims_length & length(unique(inner_dim_lengths)) <= 1)) { + # old code; all the files have the same length of inner_dim + if (is.null(var_with_selectors_name)) { + indices_chunk <- floor((sub_array_of_indices - 1) / data_dims[inner_dim]) + 1 + transformed_indices <- ((sub_array_of_indices - 1) %% data_dims[inner_dim]) + 1 + } else { + indices_chunk <- floor((sub_array_of_indices - 1) / var_full_dims[inner_dim]) + 1 + transformed_indices <- ((sub_array_of_indices - 1) %% var_full_dims[inner_dim]) + 1 + } + } else { # files have different inner dim length + indices_chunk <- c() + for (item in 1:length(inner_dim_lengths)) { + tmp <- which(sub_array_of_indices <= cumsum(inner_dim_lengths)[item]) + indices_chunk <- c(indices_chunk, rep(item, length(tmp) - length(indices_chunk))) + } + sub_array_of_indices_by_file <- split(sub_array_of_indices, indices_chunk) + for (item in 2:length(sub_array_of_indices_by_file)) { + sub_array_of_indices_by_file[[item]] <- sub_array_of_indices_by_file[[item]] - cumsum(inner_dim_lengths)[item - 1] + } + transformed_indices <- unlist(sub_array_of_indices_by_file, use.names = FALSE) } + if (sub_array_is_list) { sub_array_of_indices <- as.list(sub_array_of_indices) } @@ -3197,14 +3257,15 @@ Start <- function(..., # dim = indices/selectors, print("-> GOING TO ITERATE ALONG CHUNKS.") } } + for (chunk in 1:chunk_amount) { if (!is.null(names(selector_store_position))) { selector_store_position[file_dim] <- chunk } else { selector_store_position <- chunk } - chunk_selectors <- transformed_indices[which(indices_chunk == chunk)] - sub_array_of_indices <- chunk_selectors + sub_array_of_indices <- transformed_indices[which(indices_chunk == chunk)] + sub_array_of_indices <- transformed_indices[which(indices_chunk == chunk)] if (with_transform) { # If the provided selectors are expressed in the world # before transformation @@ -3246,6 +3307,7 @@ Start <- function(..., # dim = indices/selectors, taken_chunks[chunk] <- TRUE } } + if (!is.null(var_unorder_indices)) { ordered_fri <- sub_array_of_fri sub_array_of_fri <- var_unorder_indices[sub_array_of_fri] diff --git a/R/zzz.R b/R/zzz.R index 941ecbe..6b41a69 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -538,7 +538,8 @@ find_largest_dims_length <- function(selectors_total_list, array_of_files_to_loa largest_data_dims[kk] <- max(sapply(data_dims_all_files, '[', kk)) } names(largest_data_dims) <- names(data_dims_all_files[[1]]) - return(largest_data_dims) + return(list(largest_data_dims = largest_data_dims, + data_dims_all_files = data_dims_all_files)) } # Gererate vars_to_transform from picked_vars[[i]] and picked_common_vars diff --git a/tests/testthat/test-Start-largest_dims_length.R b/tests/testthat/test-Start-largest_dims_length.R index 59b73e8..8657985 100644 --- a/tests/testthat/test-Start-largest_dims_length.R +++ b/tests/testthat/test-Start-largest_dims_length.R @@ -134,3 +134,125 @@ dat3 <- Start(dataset = repos, ) }) + +test_that("2. inconsistent time length, merge_across_dims = T", { + +path <- paste0('/esarchive/exp/CMIP6/dcppA-hindcast/EC-Earth3/DCPP/EC-Earth-Consortium/EC-Earth3/dcppA-hindcast/', + '$member$/Amon/$var$/gr/v20210309/', + '$var$_Amon_EC-Earth3_dcppA-hindcast_s$sdate$-$member$_gr_$fyear$.nc') + +suppressWarnings( +data <- Start(dataset = path, + var = 'psl', + sdate = '1970', + lat = indices(1), + lon = indices(1), + fyear = c('197011-197012', '197101-197112'), #2+12 + fmonth = 'all', + member = 'r6i2p1f1', + fmonth_across = 'fyear', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + largest_dims_length = TRUE, #c(fmonth = 12), + synonims = list(fmonth = c('time', 'fmonth'), + lon = c('lon', 'longitude'), + lat = c('lat', 'latitude')), + return_vars = list(lat = 'dataset', lon = 'dataset', + fmonth = 'fyear'), + retrieve = TRUE) +) +expect_equal( +dim(data), +c(dataset = 1, var = 1, sdate = 1, lat = 1, lon = 1, fmonth = 14, member = 1) +) +expect_equal( +as.vector(data)[1:5], +c(101341.03, 100831.62, 99877.38, 101355.11, 101067.74), +tolerance = 0.001 +) + +suppressWarnings( +data2 <- Start(dataset = path, + var = 'psl', + sdate = '1970', + lat = indices(1), + lon = indices(1), + fyear = c('197011-197012', '197101-197112'), #2+12 + fmonth = indices(1:14), + member = 'r6i2p1f1', + fmonth_across = 'fyear', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + largest_dims_length = TRUE, #c(fmonth = 12), + synonims = list(fmonth = c('time', 'fmonth'), + lon = c('lon', 'longitude'), + lat = c('lat', 'latitude')), + return_vars = list(lat = 'dataset', lon = 'dataset', + fmonth = 'fyear'), + retrieve = TRUE) +) + +expect_equal( +as.vector(data), +as.vector(data2) +) + +suppressWarnings( +data3 <- Start(dataset = path, + var = 'psl', + sdate = '1970', + lat = indices(1), + lon = indices(1), + fyear = c('197011-197012', '197101-197112'), #2+12 + fmonth = 'all', + member = 'r6i2p1f1', + fmonth_across = 'fyear', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + largest_dims_length = c(fmonth = 12), + synonims = list(fmonth = c('time', 'fmonth'), + lon = c('lon', 'longitude'), + lat = c('lat', 'latitude')), + return_vars = list(lat = 'dataset', lon = 'dataset', + fmonth = 'fyear'), + retrieve = TRUE) +) + +expect_equal( +as.vector(data), +as.vector(data3) +) + +#-------------------------------- +suppressWarnings( +data4 <- Start(dataset = path, + var = 'psl', + sdate = '1970', + lat = indices(1), + lon = indices(1), + fyear = c('197011-197012', '197101-197112'), #2+12 + fmonth = indices(c(1:2, 10:14)), + member = 'r6i2p1f1', + fmonth_across = 'fyear', + merge_across_dims = TRUE, + merge_across_dims_narm = TRUE, + largest_dims_length = TRUE, + synonims = list(fmonth = c('time', 'fmonth'), + lon = c('lon', 'longitude'), + lat = c('lat', 'latitude')), + return_vars = list(lat = 'dataset', lon = 'dataset', + fmonth = 'fyear'), + retrieve = TRUE) +) + +expect_equal( +dim(data4), +c(dataset = 1, var = 1, sdate = 1, lat = 1, lon = 1, fmonth = 7, member = 1) +) +expect_equal( +as.vector(data4), +c(101341.03, 100831.62, 101751.00, 101943.87, 100713.17, 101724.04, 99833.31), +tolerance = 0.001 +) + +}) -- GitLab From 290198c637768480b9bab3dffbef633cb2d1931d Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 28 Jan 2022 18:30:34 +0100 Subject: [PATCH 2/4] Fix the bug of recording inner dim length of each merged file dim --- R/Start.R | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/R/Start.R b/R/Start.R index fefd6ae..d3a2b69 100644 --- a/R/Start.R +++ b/R/Start.R @@ -2400,12 +2400,33 @@ Start <- function(..., # dim = indices/selectors, # Record 'iinner_dim_lengths' here for later usage. inner_dim_lengths <- NULL if (largest_dims_length & !is.null(file_dim)) { + # inner_dim_lengths here includes all the files, but we only want + # the files of fyear for certain "sdate". We will categorize it later. inner_dim_lengths <- tryCatch({ sapply(data_dims_each_file, '[[', inner_dim) }, error = function(x) { sapply(data_dims_each_file, '[[', synonims[[inner_dim]][which(synonims[[inner_dim]] != inner_dim)]) }) + + # Use other file dims as the factors to categorize. + other_file_dims <- dim(array_of_files_to_load)[which(file_dims != file_dim)] + other_file_dims <- lapply(lapply(other_file_dims, seq, 1), rev) + other_file_dims_factor <- expand.grid(other_file_dims) + selector_indices_save_subset <- + lapply(selector_indices_save[[i]], '[', which(file_dims != file_dim)) + + # Put the fyear with the same other file dims (sdate, etc.) together, and find the largest length (in theory all of them should be the same) + inner_dim_lengths_cat <- vector('list', dim(other_file_dims_factor)[1]) + for (i_factor in 1:length(inner_dim_lengths_cat)) { + + inner_dim_lengths_cat[[i_factor]] <- + inner_dim_lengths[which(sapply(lapply( + selector_indices_save_subset, '==', + other_file_dims_factor[i_factor, ]), all))] + } + # Find the largest length of each time step + inner_dim_lengths <- do.call(pmax, inner_dim_lengths_cat) } fri <- first_round_indices <- NULL -- GitLab From 8566f522e5b05f33cde2f41bab4b37d03625533e Mon Sep 17 00:00:00 2001 From: aho Date: Fri, 28 Jan 2022 18:38:41 +0100 Subject: [PATCH 3/4] Revise unit test --- .../testthat/test-Start-largest_dims_length.R | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tests/testthat/test-Start-largest_dims_length.R b/tests/testthat/test-Start-largest_dims_length.R index 8657985..a30fc24 100644 --- a/tests/testthat/test-Start-largest_dims_length.R +++ b/tests/testthat/test-Start-largest_dims_length.R @@ -227,11 +227,12 @@ as.vector(data3) suppressWarnings( data4 <- Start(dataset = path, var = 'psl', - sdate = '1970', + sdate = c('1970', '1971'), lat = indices(1), lon = indices(1), - fyear = c('197011-197012', '197101-197112'), #2+12 - fmonth = indices(c(1:2, 10:14)), + fyear = 'all', + fmonth = 'all', + fyear_depends = 'sdate', member = 'r6i2p1f1', fmonth_across = 'fyear', merge_across_dims = TRUE, @@ -247,12 +248,18 @@ data4 <- Start(dataset = path, expect_equal( dim(data4), -c(dataset = 1, var = 1, sdate = 1, lat = 1, lon = 1, fmonth = 7, member = 1) +c(dataset = 1, var = 1, sdate = 2, lat = 1, lon = 1, fmonth = 122, member = 1) +) +expect_equal( +as.vector(drop(data4)[1,1:5]), +c(101341.03, 100831.62, 99877.38, 101355.11, 101067.74), +tolerance = 0.001 ) expect_equal( -as.vector(data4), -c(101341.03, 100831.62, 101751.00, 101943.87, 100713.17, 101724.04, 99833.31), +as.vector(drop(data4)[2,10:14]), +c(100515.2, 101354.9, 100657.0, 100349.4, 100199.6), tolerance = 0.001 ) + }) -- GitLab From c9d06abe39ab3e39039a1d7b0be55d8071923ad1 Mon Sep 17 00:00:00 2001 From: aho Date: Mon, 31 Jan 2022 11:57:25 +0100 Subject: [PATCH 4/4] Add the unit test for merge_across_dims_narm = F --- .../testthat/test-Start-largest_dims_length.R | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tests/testthat/test-Start-largest_dims_length.R b/tests/testthat/test-Start-largest_dims_length.R index a30fc24..b448f89 100644 --- a/tests/testthat/test-Start-largest_dims_length.R +++ b/tests/testthat/test-Start-largest_dims_length.R @@ -261,5 +261,40 @@ c(100515.2, 101354.9, 100657.0, 100349.4, 100199.6), tolerance = 0.001 ) +#-------------------------------- +suppressWarnings( +data5 <- Start(dataset = path, + var = 'psl', + sdate = c('1970', '1971'), + lat = indices(1), + lon = indices(1), + fyear = 'all', + fmonth = 'all', + fyear_depends = 'sdate', + member = 'r6i2p1f1', + fmonth_across = 'fyear', + merge_across_dims = TRUE, + merge_across_dims_narm = FALSE, #TRUE, + largest_dims_length = TRUE, + synonims = list(fmonth = c('time', 'fmonth'), + lon = c('lon', 'longitude'), + lat = c('lat', 'latitude')), + return_vars = list(lat = 'dataset', lon = 'dataset', + fmonth = 'fyear'), + retrieve = TRUE) +) + +expect_equal( +dim(data5), +c(dataset = 1, var = 1, sdate = 2, lat = 1, lon = 1, fmonth = 132, member = 1) +) +expect_equal( +which(is.na(drop(data5))), +c(5:24) +) +expect_equal( +as.vector(data4), +as.vector(data5)[-c(5:24)] +) }) -- GitLab