diff --git a/DESCRIPTION b/DESCRIPTION
index ed168c01d5f47e993a04da090a739f5be90056d0..0cda1baa89cd345b7d9b55fcbd2cc4021e2f5a2e 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: startR
 Title: Automatically Retrieve Multidimensional Distributed Data Sets
-Version: 1.0.2
+Version: 1.0.3
 Authors@R: c(
     person("BSC-CNS", role = c("aut", "cph")),
     person("Nicolau", "Manubens", , "nicolau.manubens@bsc.es", role = c("aut")),
diff --git a/NEWS.md b/NEWS.md
index 1206070fcc2b84384c8bb35de0eb2dcf1652ac72..baf436ea9ba95919a44911ee1f51256193bbec03 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,10 @@
+# startR v1.0.3 (Release date: 2020-06-19)
+- Bugfix for retrieving repeated values from a single file when
+'merge_across_dims' and 'split_multiselected_dims' are both used. The values
+were placed at incorrect positions before.
+- Specify 'UTC' as the time zone when retrieving the time attributes. The time
+zone was not specified before, which caused problems when the time crossed a
+daylight saving transition.
+
 # startR v1.0.2 (Release date: 2020-05-11)
 - Bugfix for longitude transformation when the required grid point across the borders. The bug apprears at v1.0.0 and v1.0.1.
 - Add one new parameter 'merge_across_dims_narm' in Start(). If it is TRUE,
diff --git a/R/NcDataReader.R b/R/NcDataReader.R
index 82131288a7e8b53d1edb9fc6467fd8a594e3b631..cba05fac8a5f721eb41c209437d5900722acb297 100644
--- a/R/NcDataReader.R
+++ b/R/NcDataReader.R
@@ -162,7 +162,7 @@ NcDataReader <- function(file_path = NULL, file_object = NULL,
         units <- 'days'
       }
 
-      new_array <- rep(as.POSIXct(parts[2]), length(result)) + 
+      new_array <- rep(as.POSIXct(parts[2], tz = 'UTC'), length(result)) + 
                    as.difftime(result[], units = units)
      #new_array <- seq(as.POSIXct(parts[2]), 
      #                 length = max(result, na.rm = TRUE) + 1, 
diff --git a/R/Start.R b/R/Start.R
index 3c544a4cc58fb14bc65c777616476cf143b558ae..b23e0ebb82987abbaa6d968c33878198f68ff88f 100644
--- a/R/Start.R
+++ b/R/Start.R
@@ -1330,11 +1330,19 @@ Start <- function(..., # dim = indices/selectors,
                               c(list(x = picked_common_vars[[var_to_read]]),
                                 var_store_indices,
                                 list(value = var_values)))
+          # Turn the time zone back to UTC if this var_to_read is 'time'
+          if (all(class(picked_common_vars[[var_to_read]]) == c('POSIXct', 'POSIXt'))) {
+            attr(picked_common_vars[[var_to_read]], "tzone") <- 'UTC'
+          }
         } else {
           picked_vars[[i]][[var_to_read]] <- do.call('[<-',
                               c(list(x = picked_vars[[i]][[var_to_read]]),
                                 var_store_indices,
                                 list(value = var_values)))
+          # Turn the time zone back to UTC if this var_to_read is 'time'
+          if (all(class(picked_vars[[i]][[var_to_read]]) == c('POSIXct', 'POSIXt'))) {
+            attr(picked_vars[[i]][[var_to_read]], "tzone") <- 'UTC'
+          }
         }
         if (var_to_read %in% names(first_found_file)) {
           first_found_file[var_to_read] <- TRUE
@@ -3165,6 +3173,77 @@ Start <- function(..., # dim = indices/selectors,
             data_array_final_dims <- .aperm2(data_array_final_dims, tmp)
             data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999]  # become a vector
 
+            #NOTE: When one file contains values for discrete dimensions, rearranging
+            #      the chunks (i.e., work_piece) is necessary.
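+            #      The code below first collapses 'indices_chunk' into runs of
+            #      consecutive identical chunk indices ('final_order_list': each
+            #      run is named by its chunk index and stores the run length).
+            #      If the runs are not already in ascending chunk order, the data
+            #      vector is cut apart along the inner 'across' dimension and the
+            #      pieces are pasted back following the originally requested order.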
+            if (split_multiselected_dims) {
+
+              # Generate the correct order list from indices_chunk
+              final_order_list <- list()
+              i <- 1
+              j <- 1
+              a <- indices_chunk[i]
+              while (i < length(indices_chunk)) {
+                while (indices_chunk[i + 1] == indices_chunk[i] & i < length(indices_chunk)) {
+                  a <- c(a, indices_chunk[i + 1])
+                  i <- i + 1
+                }
+                final_order_list[[j]] <- a
+                a <- indices_chunk[i + 1]
+                i <- i + 1
+                j <- j + 1
+              }
+              names(final_order_list) <- sapply(final_order_list, '[[', 1)
+              final_order_list <- lapply(final_order_list, length)
+
+              if (!all(diff(as.numeric(names(final_order_list))) > 0)) {
+
+                # Shape the vector into the array without split_dims
+                split_dims_pos <- match(split_dims, final_dims_fake)
+                new_dims <- c()
+                if (split_dims_pos[1] > 1) {
+                  new_dims <- c(new_dims, final_dims_fake[1:(split_dims_pos[1] - 1)])
+                }
+                new_dims <- c(new_dims, prod(split_dims))
+                names(new_dims)[split_dims_pos[1]] <- across_inner_dim
+                if (split_dims_pos[length(split_dims_pos)] < length(final_dims_fake)) {
+                  new_dims <- c(new_dims, final_dims_fake[(split_dims_pos[length(split_dims_pos)] + 1):length(final_dims_fake)])
+                }
+                final_dims_fake_no_split <- new_dims
+                data_array_no_split <- array(data_array_tmp, dim = final_dims_fake_no_split)
+                # Separate the 'time' dim into each work_piece length
+                data_array_seperate <- list()
+                tmp <- cumsum(unlist(length_inner_across_dim))
+                tmp <- c(0, tmp)
+                for (i in 1:length(length_inner_across_dim)) {
+                  data_array_seperate[[i]] <- Subset(data_array_no_split, across_inner_dim,
+                                                     (tmp[i] + 1):tmp[i + 1])
+                }
+
+                # Re-build the array: chunk
+                which_chunk <- as.numeric(names(final_order_list))
+                how_many_indices <- unlist(final_order_list)
+                array_piece <- list()
+                ind_in_array_seperate <- as.list(rep(1, length(data_array_seperate)))
+                for (i in 1:length(final_order_list)) {
+                  array_piece[[i]] <- Subset(data_array_seperate[[which_chunk[i]]],
+                                             across_inner_dim,
+                                             ind_in_array_seperate[[which_chunk[i]]]:(ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i] - 1))
+                  ind_in_array_seperate[[which_chunk[i]]] <- ind_in_array_seperate[[which_chunk[i]]] + how_many_indices[i]
+                }
+
+                # Re-build the array: paste
+                data_array_tmp <- array_piece[[1]]
+                along_pos <- which(names(dim(data_array_tmp)) == across_inner_dim)
+                if (length(array_piece) > 1) {
+                  for (i in 2:length(array_piece)) {
+                    data_array_tmp <- abind::abind(data_array_tmp, array_piece[[i]],
+                                                   along = along_pos)
+                  }
+                }
+              }
+            }
+
             data_array <- array(data_array_tmp, dim = final_dims_fake)
 
           } else {  # merge_across_dims_narm = F (old version)
diff --git a/inst/doc/faq.md b/inst/doc/faq.md
index 66df1e143ed50359f54bfb81da39edae336c2248..6f1d1ac775e4d849af2b0e7ce0a82c8c162be421 100644
--- a/inst/doc/faq.md
+++ b/inst/doc/faq.md
@@ -27,8 +27,9 @@ This document intends to be the first reference for any doubts that you may have
   1. [No space left on device](#1-no-space-left-on-device)
   2. [ecFlow UI remains blue and does not update status](#2-ecflow-ui-remains-blue-and-does-not-update-status)
   3. [Compute() successfully but then killed on R session](#3-compute-successfully-but-then-killed-on-r-session)
-  4. [My jobs work well in workstation and fatnodes but not on Power9 (or vice versa)](#4-my-jobs-work-well-in-workstation-and-fatnodes-but-not-on-power9-or-vice-versa)
-
+  4. [My jobs work well in workstation and fatnodes but not on Power9 (or vice versa)](#4-my-jobs-work-well-in-workstation-and-fatnodes-but-not-on-power9-or-vice-versa)
+  5. [Errors related to wrong file formatting](#5-errors-related-to-wrong-file-formatting)
+  6. [Errors using a new cluster (setting Nord3)](#6-errors-using-a-new-cluster-setting-nord3)
 
 ## 1. How to
 
@@ -588,7 +589,7 @@ List of 1
 ### 17. Use parameter 'split_multiselected_dims' in Start()
 
 The selectors can be not only vectors, but also multidimensional array. For instance, the 'time' dimension
-can be assigned by a two-dimensional array `[sdate = 12, time = 31]`, which is the daily data of 12 months.
+can be assigned by a two-dimensional array `[sdate = 12, time = 31]`, which is 31 timesteps for 12 start dates.
 You may want to have both 'sdate' and 'time' in the output dimension, even though 'sdate' is not explicitly specified in Start().
 The parameter 'split_multiselected_dims' is for this goal. It is common in the case that uses experimental data attributes to get the corresponding observational data.
 
@@ -627,16 +628,9 @@
 print(attr(obs, 'Dimensions'))
 #  1   1   4   3 256 512 
 ```
 
-Notice that when 'split_multiselected_dims' and 'merge_across_dims' are both used,
-and dimension number of the splitted selector is more than two, a potential
-problem of mixed dimension might occur. The following script is from part of
-the usecase [ex1_7_split_merge.R](inst/doc/usecase/ex1_7_split_merge.R).
-
-It is important to check if **the order of file_date is in line with dates dimensions order!**
-Regardless 'time', which is explicitly specified in Start(), the vector should list 'sdate' first, then 'syear'.
-As you can see below, the first element '199607' is sdate = 1, and the second element '199612' is sdate = 2. If the order is wrong, you will still get a
-return data but with mixed dimensions. Because 'sdate' and 'syear' are only
-implied in the given selectors, Start() cannot check if the order of file_date and dates are consistent or not.
+The split dimension can have more than two dimensions.
+The following example comes from the usecase [ex1_7_split_merge.R](inst/doc/usecase/ex1_7_split_merge.R).
+The 'time' selector has three dimensions: 'sdate', 'syear', and 'time'.
 
 ```r
 dates <- attr(hcst, 'Variables')$common$time
@@ -649,6 +643,23 @@ file_date <- sort(unique(gsub('-', '',
 print(file_date)
 #[1] "199607" "199612" "199701" "199707" "199712" "199801" "199807" "199812"
 #[9] "199901"
+
+obs <- Start(dat = path.obs,
+             var = var_name,
+             file_date = file_date,  # a vector with the information of sdate and syear
+             latitude = indices(1:10),
+             longitude = indices(1:10),
+             time = values(dates),  # a 3-dim array (sdate, syear, time)
+             time_across = 'file_date',
+             merge_across_dims = TRUE,
+             merge_across_dims_narm = TRUE,
+             split_multiselected_dims = TRUE,
+             synonims = list(latitude = c('lat', 'latitude'),
+                             longitude = c('lon', 'longitude')),
+             return_vars = list(latitude = 'dat',
+                                longitude = 'dat',
+                                time = 'file_date'),
+             retrieve = T)
 ```
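+
+As a quick sanity check after retrieval (a sketch; the exact numbers follow from
+the selectors used in ex1_7_split_merge.R), verify that 'file_date' has been
+merged away and that 'time' has been split into 'sdate', 'syear', and 'time':
+
+```r
+dim(obs)
+# Expected: 'file_date' no longer appears, while 'sdate' and 'syear' do, e.g.
+#      dat       var  latitude longitude     sdate     syear      time
+#        1         1        10        10         2         3        12
+```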
@@ -712,3 +723,74 @@
 have one, go to [usecase.md](https://earth.bsc.es/gitlab/es/startR/tree/develop-
 If it fails, it means that your connection to machine or the ecFlow setting has some problem.
 
+### 5. Errors related to wrong file formatting
+
+Several errors can be returned when the files are not correctly formatted.
+If you see one of these errors, review the coordinates in your files:
+
+```
+Error in Rsx_nc4_put_vara_double: NetCDF: Numeric conversion not representable
+Error in ncvar_put(ncdf_object, defined_vars[[var_counter]]$name, arrays[[i]],  :
+  C function Rsx_nc4_put_vara_double returned error
+```
+
+```
+Error in dim(x$x) <- dim_bk :
+  dims [product 1280] do not match the length of object [1233] <- this '1233' changes every time
+```
+
+```
+Error in s2dverification::CDORemap(data_array, lons, lats, ...) :
+  Found invalid values in 'lons'.
+```
+
+```
+ERROR: invalid cell
+
+Aborting in file clipping.c, line 1295 ...
+Error in s2dverification::CDORemap(data_array, lons, lats, ...) :
+  CDO remap failed.
+```
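+
+To review the coordinates, a minimal sketch with the ncdf4 package may help
+(the file path and the variable names 'longitude'/'latitude' are placeholders;
+adapt them to your file):
+
+```r
+library(ncdf4)
+nc <- nc_open('/path/to/your/file.nc')
+names(nc$dim)                      # dimensions defined in the file
+range(ncvar_get(nc, 'longitude'))  # look for invalid or out-of-range values
+range(ncvar_get(nc, 'latitude'))
+nc_close(nc)
+```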
+
+### 6. Errors using a new cluster (setting Nord3)
+
+Some errors can happen when using a new cluster. Here are some behaviours detected in issue #64.
+
+- When running Compute(), a password is requested:
+
+```
+Password:
+```
+
+Check that the host name of the cluster has been included in `.ssh/config`.
+Check also that the passwordless access has been properly set up. You can verify the access by connecting with the host name, `ssh nord3`, without being asked for a password (see more info in the [**Practical guide**](inst/doc/practical_guide.md)).
+
+- An alias may not be available, such as 'esnas' for 'esarchive'
+
+In this case, the error `No data files found for any of the specified datasets.` will be returned.
+
+- Repetitive prints of modules loading:
+
+```
+load UDUNITS/2.1.24 (PATH)
+load NETCDF/4.1.3 (PATH, LD_LIBRARY_PATH, NETCDF)
+load R/2.15.2 (PATH, LD_LIBRARY_PATH)
+```
+
+The .bashrc in your Nord 3 home must be edited with the information from the [BSC ES wiki](https://earth.bsc.es/wiki/doku.php?id=computing:nord3) so that the correct modules are loaded. If other modules are loaded before those lines, the repetitive prints above will appear.
+Check your .bashrc to avoid loading modules before defining the department ones.
+
+- R versions: workstation version versus remote cluster version
+
+Some functions depend on the R version used, and the versions should be compatible between the workstation and the remote cluster. If you see the error:
+
+```
+cannot read workspace version 3 written by R 3.6.2; need R 3.5.0 or newer
+```
+
+change the R version used in your workstation to a newer one.
+
diff --git a/inst/doc/usecase.md b/inst/doc/usecase.md
index 9ffbee4cf2bbb3f461e8bb6e69323604568e9693..65dbd2da6a9546c23c83ea5d67f0764110976812 100644
--- a/inst/doc/usecase.md
+++ b/inst/doc/usecase.md
@@ -51,4 +51,11 @@ The problem may occur when the dimension number of the splitted selector is more
   7. [Calculate the ensemble-adjusted Continuous Ranked Probability Score (CRPS)](inst/doc/usecase/ex2_7_seasonal_forecast_crps.R)
      Use `SpecsVerification::EnsCrps` to calculate the ensemble-adjusted Continuous Ranked Probability Score (CRPS) for ECWMF experimental data, and do ensemble mean. Use `s2dverification::PlotEquiMap` to plot the CRPS map.
   8. [Use CSTools Calibration function](inst/doc/usecase/ex2_8_calibration.R)
-     Use `CSTools:::.cal`, the interior function of `CSTools::CST_Calibration`, to do the bias adjustment for ECMWF experimental monthly mean data.
+     Use `CSTools:::.cal`, the interior function of `CSTools::CST_Calibration`, to do the bias adjustment for ECMWF experimental monthly mean data.
+  9. [Use a mask to apply different methods to different gridpoints](inst/doc/usecase/ex2_9_mask.R)
+     If you only need to apply your analysis to a few gridpoints, you may want to consider use case 1.6; but if you need to load many gridpoints, this may be a better solution.
+
diff --git a/inst/doc/usecase/ex1_2_exp_obs_attr.R b/inst/doc/usecase/ex1_2_exp_obs_attr.R
index 5a906f3ac2f4eecfb71eab374537290ee184c228..acff7c9167d19b7bee40e96913859c0982c310b3 100644
--- a/inst/doc/usecase/ex1_2_exp_obs_attr.R
+++ b/inst/doc/usecase/ex1_2_exp_obs_attr.R
@@ -97,12 +97,12 @@ print(attr(exp, 'Variables')$common$time)
 #[11] "2007-03-16 13:14:44 CET" "2008-03-16 13:14:44 CET"
 
 print(attr(obs, 'Variables')$common$time)
-# [1] "2005-01-31 18:00:00 CET"  "2006-01-31 18:00:00 CET" 
-# [3] "2007-01-31 18:00:00 CET"  "2008-01-31 18:00:00 CET" 
-# [5] "2005-02-28 18:00:00 CET"  "2006-02-28 18:00:00 CET" 
-# [7] "2007-02-28 18:00:00 CET"  "2008-02-29 18:00:00 CET" 
-# [9] "2005-03-31 19:00:00 CEST" "2006-03-31 19:00:00 CEST"
-#[11] "2007-03-31 19:00:00 CEST" "2008-03-31 19:00:00 CEST"
+# [1] "2005-01-31 18:00:00 UTC" "2006-01-31 18:00:00 UTC"
+# [3] "2007-01-31 18:00:00 UTC" "2008-01-31 18:00:00 UTC"
+# [5] "2005-02-28 18:00:00 UTC" "2006-02-28 18:00:00 UTC"
+# [7] "2007-02-28 18:00:00 UTC" "2008-02-29 18:00:00 UTC"
+# [9] "2005-03-31 18:00:00 UTC" "2006-03-31 18:00:00 UTC"
+#[11] "2007-03-31 18:00:00 UTC" "2008-03-31 18:00:00 UTC"
 
 ##-----lat-----
 print(attr(exp, 'Variables')$common$lat[1:3])
diff --git a/inst/doc/usecase/ex1_3_attr_loadin.R b/inst/doc/usecase/ex1_3_attr_loadin.R
index 3dca648fa83851059679bb8356b28d7355e70c84..a918e167c6f23092d8d27690613061e98e7affd5 100644
--- a/inst/doc/usecase/ex1_3_attr_loadin.R
+++ b/inst/doc/usecase/ex1_3_attr_loadin.R
@@ -113,7 +113,7 @@ erai[1, 1, 2, 31, 1, 1]  # 1st March also, since June only has 30 days
 
 dates <- attr(system4, 'Variables')$common$time
 dates[2, 31]
-#[1] "1994-07-01 CEST"
+#[1] "1994-07-01 UTC"
 dates[2, 31] <- NA  # Jun
 dates[5, 31] <- NA  # Sep
 dates[7, 31] <- NA  # Nov
@@ -141,4 +141,9 @@ erai[1, 1, 2, , 1, 1]  # June
 # [9] 270.5395 272.0379 272.5489 271.1494 270.7764 270.5678 272.0331 273.7856
 #[17] 273.9849 274.5904 273.4369 273.8404 274.4068 274.2292 274.7375 275.5104
 #[25] 275.4324 274.9408 274.8679 276.5602 275.0995 274.6409       NA
+erai[1, 1, 5, , 1, 1]  # Sep
+# [1] 270.0656 270.7113 268.4678 271.6489 271.2354 269.7831 269.8045 268.7994
+# [9] 266.3092 262.2734 265.0124 261.8378 265.3950 257.1690 255.8402 264.8826
+#[17] 267.8663 266.6875 262.5502 258.5476 258.9617 263.6396 257.1111 264.8644
+#[25] 261.0085 256.7690 256.5811 256.4331 256.1260 256.4716       NA
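+# (September also has only 30 days, so the 31st position is NA, as for June above.)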
 #------------------------------
diff --git a/inst/doc/usecase/ex1_7_split_merge.R b/inst/doc/usecase/ex1_7_split_merge.R
index e72cceefb202eddd0dd11a90293c89d70b2c5bc2..539a24d8051f796dff317d45b7630c926f8e3ec3 100644
--- a/inst/doc/usecase/ex1_7_split_merge.R
+++ b/inst/doc/usecase/ex1_7_split_merge.R
@@ -1,11 +1,17 @@
 #---------------------------------------------------------------------
 # This usecase shows the things to be noticed when the parameters
 # 'split_multiselected_dims' and 'merge_across_dims' are both used.
-# The problem may occur when the dimension number of the splitted selector
-# is more than two.
-
-# If you're not familiar with the usage of these parameters, please see usecases ex1_2 and ex1_3 first, which are less complicated.
+# The problem may occur when the dimension number of the split selector is
+# more than two. If you're not familiar with the usage of these parameters,
+# please see usecases ex1_2 and ex1_3 first, which are less complicated.
 # See FAQ How-to-#17 for more explanation.
+
+# In this example, hindcast data corresponding to 2 start dates ('20160704',
+# '20161222') is retrieved. The hindcasts consist of daily values for the 47
+# days following each start date (here only the first 12 days are retrieved)
+# and for the 20 years of hindcast (here only the first 3 years are retrieved).
+# Once the hindcast data is retrieved/read, the corresponding ERA5 values for
+# each hindcast time and year are retrieved using the 'dates' metadata from
+# the hindcast object.
 #---------------------------------------------------------------------
 
 library(startR)
@@ -18,7 +24,7 @@ path.exp <- '/esarchive/exp/ecmwf/s2s-monthly_ensforhc/daily/$var$_f24h/$sdate$/
 hcst <- Start(dat = path.exp,
               var = var_name,
               sdate = c('20160704', '20161222'),
-              syear = indices(1:3),  #2016:2018
+              syear = indices(1:3),  #1996:1998
               syear_depends = 'sdate',
               time = indices(1:12),  #4th-15th Jul; 22nd Dec-2nd Jan
               latitude = indices(1:10),
@@ -42,22 +48,15 @@ dim(dates)
 
 #-----------------------------------------------------------------------
 #-----------------------------------------------------------------------
-# This two lines should NOT be used!! It is an example showing the potential
-# problem when using 'split_multiselected_dims' and 'merge_across_dims'.
-# If 'dates' is reordered to 'syear' ahead of 'sdate', while 'file_date' below
-# remains the same, the result will have mixed dimension.
-
+# If you need to reorder the dimensions of the 'time' selector, you can use
+# the s2dv::Reorder function. These two lines are not used in the following example.
 library(s2dv)
-dates <- Reorder(dates, c('syear','time','sdate'))
+dates <- Reorder(dates, c('syear', 'sdate', 'time'))
 
 #-----------------------------------------------------------------------
 #-----------------------------------------------------------------------
 # Use 'dates' generated above to get the required files.
-# It is important to check if the order of file_date is in line with
-# dates dimensions! Regardless 'time', the vector should list 'sdate' first,
-# then 'syear'. As you can see below, the first element '199607' is sdate = 1,
-# and the second element '199612' is sdate = 2.
-
+# The order of file_date is not important, so using sort() is not necessary.
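+# (The 'time' selector values are matched against the time coordinates found
+# in each file, so a different 'file_date' order gives the same result; both
+# orders are checked in tests/testthat/test-Start-multiple-sdates.R.)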
 file_date <- sort(unique(gsub('-', '',
                   sapply(as.character(dates), substr, 1, 7))))
 print(file_date)
@@ -70,7 +69,7 @@ path.obs <- '/esarchive/recon/ecmwf/era5/1hourly/$var$/$var$_$file_date$.nc'
 
 obs <- Start(dat = path.obs,
              var = var_name,
-             file_date = file_date,  # a vector
+             file_date = file_date,  # a vector with the information of sdate and syear
              latitude = indices(1:10),
              longitude = indices(1:10),
              time = values(dates),  # a 3-dim array (sdate, syear, time)
diff --git a/inst/doc/usecase/ex2_9_mask.R b/inst/doc/usecase/ex2_9_mask.R
new file mode 100644
index 0000000000000000000000000000000000000000..13be87888875d37d123b26d81af25da4b9dc13d5
--- /dev/null
+++ b/inst/doc/usecase/ex2_9_mask.R
@@ -0,0 +1,159 @@
+# Use a mask for applying different methods in different gridpoints
+# Author: Núria Pérez-Zanón
+# ------------------------------------------------------------------
+# This use case has two parts:
+#   A) create a mask
+#   B) the startR workflow using the mask
+# Notice that you can skip step A if you already have a mask (for instance,
+# the sea-land mask of a model) and you want to apply a different analysis
+# over land than over the ocean.
+# Notice too that this strategy allows you to chunk along the spatial dimensions.
+
+library(startR)
+
+#####################################################################
+# Step A)
+
+# This is an example using toy data.
+# Read one file of system5_m1 to get the latitudes and longitudes of a region,
+# in order to create a consistent mask:
+repos <- '/esarchive/exp/ecmwf/system5_m1/monthly_mean/$var$_f6h/$var$_$sdate$.nc'
+data <- Start(dat = repos,
+              var = 'tas',
+              sdate = '20170101',
+              ensemble = indices(1),
+              time = 'first',
+              latitude = indices(300:341),
+              longitude = indices(1:40),
+              return_vars = list(latitude = 'dat',
+                                 longitude = 'dat'),
+              retrieve = FALSE)
+
+lats <- attributes(data)$Variables$dat1$latitude
+lons <- attributes(data)$Variables$dat1$longitude
+
+# Create a fake mask of ones and zeros:
+mask <- rep(c(rep(1, 21), rep(0, 21)), length(lons))
+dim(mask) <- c(latitude = length(lats), longitude = length(lons))
+
+attr(mask, 'longitude') <- lons
+attr(mask, 'latitude') <- lats
+
+# Save the mask in a NetCDF file:
+library(ncdf4)
+dims_var <- NULL
+dim_lon <- ncdim_def(name = 'longitude', units = 'degrees',
+                     vals = as.vector(attributes(data)$Variables$dat1$longitude),
+                     longname = 'longitude')
+dims_var[[1]] <- dim_lon
+dim_lat <- ncdim_def(name = 'latitude', units = 'degrees_north',
+                     vals = as.vector(attributes(data)$Variables$dat1$latitude),
+                     longname = 'latitude')
+dims_var[[2]] <- dim_lat
+datanc <- ncvar_def(name = 'mask',
+                    units = 'adimensional',
+                    dim = dims_var, missval = -99999)
+file_nc <- nc_create("/esarchive/scratch/nperez/mask_system5_m1_harvestmonth.nc",
+                     datanc)
+ncvar_put(file_nc, datanc, mask)
+nc_close(file_nc)
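+
+# Optional check (a sketch): re-open the file to confirm the mask was written
+# as expected before using it in step B.
+check_nc <- nc_open("/esarchive/scratch/nperez/mask_system5_m1_harvestmonth.nc")
+str(ncvar_get(check_nc, 'mask'))  # a 42 x 40 matrix of 1s and 0s
+nc_close(check_nc)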
+
+#####################################################################
+# --------------------------------------------------------------------------------
+#####################################################################
+# Step B)
+
+# Read data:
+repos <- '/esarchive/exp/ecmwf/system5_m1/monthly_mean/$var$_f6h/$var$_$sdate$.nc'
+data <- Start(dat = repos,
+              var = 'tas',
+              sdate = c('20170101', '20180101'),
+              ensemble = indices(1:20),
+              time = 'all',
+              latitude = indices(300:341),
+              longitude = indices(1:40),
+              return_vars = list(latitude = 'dat',
+                                 longitude = 'dat',
+                                 time = 'sdate'),
+              retrieve = FALSE)
+
+# Read mask:
+path <- '/esarchive/scratch/nperez/$var$_system5_m1_harvestmonth.nc'
+mask <- Start(dat = path,
+              var = 'mask',
+              latitude = 'all',
+              longitude = 'all',
+              return_vars = list(latitude = 'dat',
+                                 longitude = 'dat'),
+              retrieve = FALSE)
+
+# The function computes the mean where the mask is 1 and returns NA elsewhere:
+MeanMask <- function(x, mask) {
+  if (mask == 1) {
+    ind <- mean(x)
+  } else {
+    ind <- NA
+  }
+  return(ind)
+}
+
+# It is necessary to specify at least one dimension for mask;
+# that's why the 'dat' dimension has been added:
+stepMask <- Step(fun = MeanMask,
+                 target_dims = list(x = c('dat', 'ensemble', 'sdate', 'time'),
+                                    mask = c('dat')),
+                 output_dims = NULL)
+
+# Now, there are two data inputs:
+wf_mask <- AddStep(list(data, mask), stepMask)
+
+res <- Compute(workflow = wf_mask,
+               chunks = list(latitude = 2,
+                             longitude = 2))
+
+#####################################################################
+# --------------------------------------------------------------------------------
+#####################################################################
+# Extra lines for output verification:
+
+summary(res$output1)
+dim(res$output1)
+head(res$output1)
+
+mask_loaded <- Start(dat = path,
+                     var = 'mask',
+                     latitude = 'all',
+                     longitude = 'all',
+                     return_vars = list(latitude = 'dat',
+                                        longitude = 'dat'),
+                     retrieve = TRUE)
+summary(res$output1[mask_loaded == 0])  # all are NA's: correct
+summary(res$output1[mask_loaded == 1])  # there are no NA's: correct
+sum(mask_loaded == 0)  # the number of NA's is 840: correct
+
+# Compare values:
+data_loaded <- Start(dat = repos,
+                     var = 'tas',
+                     sdate = c('20170101', '20180101'),
+                     ensemble = indices(1:20),
+                     time = 'all',
+                     latitude = indices(300:341),
+                     longitude = indices(1:40),
+                     return_vars = list(latitude = 'dat',
+                                        longitude = 'dat',
+                                        time = 'sdate'),
+                     retrieve = TRUE)
+mean(data_loaded[1, 1, , , , 1, 1])
+res$output1[1, 1, 1]
+mean(data_loaded[1, 1, , , , 1, 2])
+res$output1[1, 1, 2]
+
+out <- mask_loaded
+for (i in 1:(dim(data_loaded)['latitude'])) {
+  for (j in 1:(dim(data_loaded)['longitude'])) {
+    if (mask_loaded[1, 1, i, j] == 1) {
+      out[1, 1, i, j] <- mean(data_loaded[, , , , , i, j])
+    } else {
+      out[1, 1, i, j] <- NA
+    }
+  }
+}
diff --git a/tests/testthat/test-Start-multiple-sdates.R b/tests/testthat/test-Start-multiple-sdates.R
new file mode 100644
index 0000000000000000000000000000000000000000..cd5bac8f86128d86334c985fabfc389c893b1a62
--- /dev/null
+++ b/tests/testthat/test-Start-multiple-sdates.R
@@ -0,0 +1,157 @@
+context("Start() multiple sdates with split + merge dim")
+
+# When certain values in one observation file are required more than once,
+# and 'merge_across_dims' + 'split_multiselected_dims' are used, the values
+# may be misplaced. It can happen when reading experimental data with many
+# start dates, while the corresponding observations are required to have the
+# same data structure.
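+# For example, with the start dates '20161222' and '20161229' below, both
+# hindcasts need December values from the same ERA5 file (201612), so those
+# values must be repeated at the correct positions in the output.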
", { +hcst<-Start(dat = ecmwf_path_hc, + var = var_name, + sdate = sdates.seq, + syear = 'all', + time = 'all', + latitude = indices(1), + longitude = indices(1), + ensemble = 'all', + syear_depends = 'sdate', + return_vars = list(latitude = 'dat', + longitude = 'dat', + time = c('sdate','syear') + ), + retrieve = F) +dates <- attr(hcst, 'Variables')$common$time +file_date <- unique(sapply(dates, format, '%Y%m')) + +obs <- Start(dat = obs_path, + var = var100_name, + latitude = indices(1), + longitude = indices(1), + file_date= file_date, + time = values(dates), # 'sdate' 'syear' 'time' + time_var = 'time', + time_across = 'file_date', + merge_across_dims= TRUE, + merge_across_dims_narm = TRUE, + split_multiselected_dims = TRUE, + synonims = list(latitude=c('lat','latitude'), + longitude=c('lon','longitude')), + return_vars = list(latitude = 'dat', + longitude = 'dat',# + time = c('file_date')), + retrieve = T) + + expect_equal( + dim(obs), + c(dat = 1, var = 1, latitude = 1, longitude = 1, sdate = 4, syear = 20, time = 47) + ) + expect_equal( + obs[1, 1, 1, 1, 1, 1, 8:15], + c(10.727190, 21.909714, 3.220845, 7.524321, 7.308463, 2.337417, 7.127212, 3.592193), + tolerance = 0.0001 + ) + expect_equal( + obs[1, 1, 1, 1, 2, 1, 1:5], + c(10.727190, 21.909714, 3.220845, 7.524321, 7.308463), + tolerance = 0.0001 + ) + expect_equal( + obs[1, 1, 1, 1, 2, 1, 31:38], + c(10.680604, 4.843633, 4.981896, 4.833428, 1.426522, 3.625800, 7.037229, 2.911440), + tolerance = 0.0001 + ) + expect_equal( + obs[1, 1, 1, 1, 1, 2, 9:15], + c(11.189878, 11.198478, 8.868102, 10.766751, 19.929094, 20.872601, 14.304168), + tolerance = 0.0001 + ) + expect_equal( + mean(obs), + 8.627518, + tolerance = 0.0001 + ) + expect_equal( + length(obs[which(is.na(obs))]), + 0 + ) +}) + +test_that("2. 
+
+test_that("2. change the file_date order", {
+  hcst <- Start(dat = ecmwf_path_hc,
+                var = var_name,
+                sdate = sdates.seq,
+                syear = indices(1:20),
+                time = 'all',
+                latitude = indices(1),
+                longitude = indices(1),
+                ensemble = 'all',
+                syear_depends = 'sdate',
+                return_vars = list(latitude = 'dat',
+                                   longitude = 'dat',
+                                   time = c('sdate', 'syear')),
+                retrieve = F)
+  dates <- attr(hcst, 'Variables')$common$time
+  file_date <- sort(unique(sapply(dates, format, '%Y%m')))
+
+  obs <- Start(dat = obs_path,
+               var = var100_name,
+               latitude = indices(1),
+               longitude = indices(1),
+               file_date = file_date,
+               time = values(dates),  # 'sdate' 'syear' 'time'
+               time_var = 'time',
+               time_across = 'file_date',
+               merge_across_dims = TRUE,
+               merge_across_dims_narm = TRUE,
+               split_multiselected_dims = TRUE,
+               synonims = list(latitude = c('lat', 'latitude'),
+                               longitude = c('lon', 'longitude')),
+               return_vars = list(latitude = 'dat',
+                                  longitude = 'dat',
+                                  time = c('file_date')),
+               retrieve = T)
+
+  expect_equal(
+    dim(obs),
+    c(dat = 1, var = 1, latitude = 1, longitude = 1, sdate = 4, syear = 20, time = 47)
+  )
+  expect_equal(
+    obs[1, 1, 1, 1, 1, 1, 8:15],
+    c(10.727190, 21.909714, 3.220845, 7.524321, 7.308463, 2.337417, 7.127212, 3.592193),
+    tolerance = 0.0001
+  )
+  expect_equal(
+    obs[1, 1, 1, 1, 2, 1, 1:5],
+    c(10.727190, 21.909714, 3.220845, 7.524321, 7.308463),
+    tolerance = 0.0001
+  )
+  expect_equal(
+    obs[1, 1, 1, 1, 2, 1, 31:38],
+    c(10.680604, 4.843633, 4.981896, 4.833428, 1.426522, 3.625800, 7.037229, 2.911440),
+    tolerance = 0.0001
+  )
+  expect_equal(
+    obs[1, 1, 1, 1, 1, 2, 9:15],
+    c(11.189878, 11.198478, 8.868102, 10.766751, 19.929094, 20.872601, 14.304168),
+    tolerance = 0.0001
+  )
+  expect_equal(
+    mean(obs),
+    8.627518,
+    tolerance = 0.0001
+  )
+  expect_equal(
+    sum(is.na(obs)),
+    0
+  )
+})