From 91a7cfea543bf3ca61e7c54f689bf3c397ae0a3f Mon Sep 17 00:00:00 2001 From: vagudets Date: Wed, 16 Apr 2025 09:04:33 +0200 Subject: [PATCH 01/22] Add cross-validation anomalies (WIP) --- modules/Crossval/Crossval_anomalies_Apply.R | 363 ++++++++++++++++++++ modules/Crossval/R/tmp/CST_MergeDims.R | 162 +++++++++ modules/Crossval/R/tmp/GetProbs.R | 351 +++++++++++++++++++ recipe_ecvs_ano_seas.yml | 108 ++++++ 4 files changed, 984 insertions(+) create mode 100644 modules/Crossval/Crossval_anomalies_Apply.R create mode 100644 modules/Crossval/R/tmp/CST_MergeDims.R create mode 100644 modules/Crossval/R/tmp/GetProbs.R create mode 100644 recipe_ecvs_ano_seas.yml diff --git a/modules/Crossval/Crossval_anomalies_Apply.R b/modules/Crossval/Crossval_anomalies_Apply.R new file mode 100644 index 00000000..3ba5e6c4 --- /dev/null +++ b/modules/Crossval/Crossval_anomalies_Apply.R @@ -0,0 +1,363 @@ +# Full-cross-val workflow +## This code should be valid for individual months and temporal averages +source("modules/Crossval/R/tmp/GetProbs.R") + +Crossval_anomalies_Apply <- function(recipe, data) { + cross.method <- recipe$Analysis$cross.method + # TODO move check + if (is.null(cross.method)) { + cross.method <- 'leave-one-out' + } + categories <- recipe$Analysis$Workflow$Probabilities$percentiles + categories <- lapply(categories, function (x) { + sapply(x, function(y) { + eval(parse(text = y))})}) + ncores <- recipe$Analysis$ncores + na.rm <- recipe$Analysis$remove_NAs + ## data dimensions + sdate_dim <- dim(data$hcst$data)['syear'] + orig_dims <- names(dim(data$hcst$data)) + # spatial dims + if ('latitude' %in% names(dim(data$hcst$data))) { + nlats <- dim(data$hcst$data)['latitude'] + nlons <- dim(data$hcst$data)['longitude'] + agg = 'global' + } else if ('region' %in% names(dim(data$hcst$data))) { + agg = 'region' + nregions <- dim(data$hcst$data)['region'] + } + # TODO fix it to use new version https://earth.bsc.es/gitlab/external/cstools/-/blob/dev-cross-indices/R/CST_Calibration.R#L570 + cross <- CSTools:::.make.eval.train.dexes(eval.method = cross.method, + amt.points = sdate_dim, + amt.points_cor = NULL) # k = ? 
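+  # Note: a minimal sketch of the expected structure of 'cross' (inferred from
+  # how it is indexed below; the helper itself is internal to CSTools). For
+  # cross.method = 'leave-one-out' and e.g. sdate_dim = 4:
+  #   cross[[1]] <- list(eval.dexes = 1, train.dexes = c(2, 3, 4))
+  #   cross[[2]] <- list(eval.dexes = 2, train.dexes = c(1, 3, 4))
+  #   ... one element per start date.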
+ + .cross_anomalies <- function(indices, exp, obs, cross, categories, recipe) { + # subset years: Subset works at BSC not at Athos + ## training indices + cross <- cross[[indices]] + print(paste("Index:", cross$eval.dexes)) + obs_tr <- Subset(obs, along = 'syear', + indices = cross$train.dexes) + hcst_tr <- Subset(exp, along = 'syear', + indices = cross$train.dexes) + ## evaluation indices + hcst_ev <- Subset(exp, along = 'syear', + indices = cross$eval.dexes, drop = 'selected') + obs_ev <- Subset(obs, along = 'syear', + indices = cross$eval.dexes, drop = 'selected') + if (tolower(recipe$Analysis$Horizon) == 'subseasonal') { + central_day <- (dim(exp)['sday'] + 1)/2 + hcst_tr <- MergeDims(hcst_tr, merge_dims = c('sday', 'syear'), + rename_dim = 'syear', na.rm = FALSE) + + obs_tr <- MergeDims(obs_tr, merge_dims = c('sday', 'syear'), + rename_dim = 'syear', na.rm = FALSE) + # 'sday' dim to select the central day + hcst_ev <- Subset(hcst_ev, along = 'sday', indices = central_day) + + } + # compute climatology: + clim_obs_tr <- MeanDims(obs_tr, 'syear') + clim_hcst_tr <- MeanDims(hcst_tr, c('syear', 'ensemble')) + # compute anomalies: + ano_obs_tr <- s2dv::Ano(obs_tr, clim_obs_tr, + ncores = recipe$Analysis$ncores) + ano_hcst_tr <- s2dv::Ano(hcst_tr, clim_hcst_tr, + ncores = recipe$Analysis$ncores) + ano_hcst_ev <- s2dv::Ano(hcst_ev, clim_hcst_tr, + ncores = recipe$Analysis$ncores) + ano_obs_ev <- s2dv::Ano(obs_ev, clim_obs_tr, + ncores = recipe$Analysis$ncores) + # compute category limits + lims_ano_hcst_tr <- Apply(ano_hcst_tr, target_dims = c('syear', 'ensemble'), + fun = function(x, prob_lims) { + lapply(prob_lims, function(ps) { + as.array(quantile(as.vector(x), ps, + na.rm = na.rm))})}, + prob_lims = categories, + ncores = ncores)$output1 + lims_ano_obs_tr <- Apply(ano_obs_tr, target_dims = c('syear', 'ensemble'), + fun = function(x, prob_lims) { + lapply(prob_lims, function(ps) { + as.array(quantile(as.vector(x), ps, + na.rm = na.rm))})}, + prob_lims = categories, + ncores = ncores)$output1 + if (tolower(recipe$Analysis$Horizon) == 'subseasonal') { + ano_hcst_tr <- SplitDim(ano_hcst_tr, split_dim = 'syear', + new_dim_name = 'sday', + indices = rep(1:dim(data$hcst$data)['sday'], + length(cross[[t]]$train.dexes))) + ano_obs_tr <- SplitDim(obs_tr, split_dim = 'syear', new_dim_name = 'sday', + indices = rep(1:dim(data$hcst$data)['sday'], + length(cross[[t]]$train.dexes))) + browser() + } + ## Objects to return: + # lims_ano_hcst_tr: hcst percentiles (per category) + # lims_ano_obs_tr: obs percentiles (per category) + # ano_hcst_ev: hcst cross-validated anomalies + # ano_obs_ev: obs cross-validated anomalies + # ano_obs_tr: obs training anomalies + return(list(ano_hcst_ev = ano_hcst_ev, + ano_obs_ev = ano_obs_ev, + ano_obs_tr = ano_obs_tr, + lims_ano_hcst_tr = lims_ano_hcst_tr, + lims_ano_obs_tr = lims_ano_obs_tr)) + } + + output_dims <- names(dim(data$hcst$data))[which(names(dim(data$hcst$data)) != "syear")] + lims_output_dims <- c("bin", + names(dim(data$hcst$data))[which(!names(dim(data$hcst$data)) %in% c("syear", "ensemble"))]) + indices <- array(1:length(cross), dim = c(syear = length(cross))) + res <- Apply(data = list(indices = indices, + exp = data$hcst$data, + obs = data$obs$data), + target_dims = list(indices = NULL, + obs = names(dim(data$obs$data)), + exp = names(dim(data$hcst$data))), + output_dims = list(ano_hcst_ev = output_dims, + ano_obs_ev = output_dims, + ano_obs_tr = output_dims, + lims_ano_hcst_tr = lims_output_dims, + lims_ano_obs_tr = lims_output_dims), + fun = 
.cross_anomalies, + ncores = 1, ## TODO: Explore options for core distribution + cross = cross, + categories = categories, + recipe = recipe) + + info(recipe$Run$logger, + "#### Anomalies Cross-validation loop ended #####") + + ## TODO: Sort out ano_obs_tr dims + # To make crps_clim to work the reference forecast need to have same dims as obs: + res$ano_obs_tr <- Subset(res$ano_obs_tr, along = 'unneeded', + indices = 1, drop = 'selected') + + # Make categories rounded number to use as names: + categories <- recipe$Analysis$Workflow$Probabilities$percentiles + categories <- lapply(categories, function (x) { + sapply(x, function(y) { + round(eval(parse(text = y)), 2)})}) + recipe$Run$output_dir <- paste0(recipe$Run$output_dir, + "/outputs/Anomalies/") + + # Forecast anomalies: + if (!is.null(data$fcst)) { + clim_hcst <- Apply(data$hcst$data, + target_dims = c('syear', 'ensemble'), + mean, + na.rm = na.rm, + ncores = ncores)$output1 + data$fcst$data <- Ano(data = data$fcst$data, clim = clim_hcst) + hcst_ano <- Ano(data = data$hcst$data, clim = clim_hcst) + # Terciles limits using the whole hindcast period: + lims_fcst <- Apply(hcst_ano, target_dims = c('syear', 'ensemble'), + fun = function(x, prob_lims) { + lapply(prob_lims, function(ps) { + as.array(quantile(as.vector(x), ps, + na.rm = na.rm))})}, + output_dims = lapply(categories, function(x) {'bin'}), + prob_lims = categories, + ncores = ncores) + clim_obs <- Apply(data$obs$data, + target_dims = c('syear', 'ensemble'), + mean, + na.rm = na.rm, + ncores = ncores)$output1 + obs_ano <- Ano(data = data$obs$data, clim = clim_obs) + lims <- Apply(obs_ano, target_dims = c('syear', 'ensemble'), + fun = function(x, prob_lims) { + lapply(prob_lims, function(ps) { + as.array(quantile(as.vector(x), ps, + na.rm = na.rm))})}, + output_dims = lapply(categories, function(x) {'bin'}), + prob_lims = categories, + ncores = ncores) + tmp_lims2 <- list() + for (ps in 1:length(categories)) { + tmp_lims3 <- .drop_dims(lims[[ps]]) + for (l in 1:dim(lims[[ps]])['bin']) { + tmp_lims <- ClimProjDiags::Subset(x = tmp_lims3, + along = "bin", + indices = l) + tmp_lims2 <- append(tmp_lims2, list(tmp_lims)) + names(tmp_lims2)[length(tmp_lims2)] <- as.character(categories[[ps]][l]) + } + if (recipe$Analysis$Workflow$Probabilities$save == 'yes') { + save_percentiles(recipe = recipe, percentiles = tmp_lims2, + data_cube = data$obs, + agg = "global", outdir = NULL) + } + } + } ## TODO: Should this loop end here? 
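+
+  # A toy sketch of the GetProbs() calls below (hypothetical shapes, mirroring
+  # the examples in R/tmp/GetProbs.R): given absolute category limits along a
+  # 'bin' dimension, GetProbs() returns one probability per category and year:
+  #   dat <- array(rnorm(500), dim = c(ensemble = 25, syear = 20))
+  #   thr <- array(c(-0.4, 0.4), dim = c(bin = 2))
+  #   GetProbs(dat, time_dim = 'syear', memb_dim = 'ensemble',
+  #            prob_thresholds = NULL, abs_thresholds = thr,
+  #            bin_dim_abs = 'bin')  # -> dims [bin = 3, syear = 20]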
+ + # Compute Probabilities + fcst_probs <- lapply(categories, function(x){NULL}) + hcst_probs_ev <- lapply(categories, function(x){NULL}) + obs_probs_ev <- lapply(categories, function(x){NULL}) + + for (ps in 1:length(categories)) { + hcst_probs_ev[[ps]] <- GetProbs(res$ano_hcst_ev, time_dim = 'syear', + prob_thresholds = NULL, + bin_dim_abs = 'bin', + indices_for_quantiles = NULL, + memb_dim = 'ensemble', + abs_thresholds = lims_ano_hcst_tr_res[[ps]], + ncores = ncores) + obs_probs_ev[[ps]] <- GetProbs(res$ano_obs_ev, time_dim = 'syear', + prob_thresholds = NULL, + bin_dim_abs = 'bin', + indices_for_quantiles = NULL, + memb_dim = 'ensemble', + abs_thresholds = lims_ano_obs_tr_res[[ps]], + ncores = ncores) + if (!is.null(data$fcst)) { + fcst_probs[[ps]] <- GetProbs(data$fcst$data, time_dim = 'syear', + prob_thresholds = NULL, + bin_dim_abs = 'bin', + indices_for_quantiles = NULL, + memb_dim = 'ensemble', + abs_thresholds = lims_fcst[[ps]], + ncores = ncores) + } + } + # Convert to s2dv_cubes the resulting anomalies + ano_hcst <- data$hcst + ano_hcst$data <- res$ano_hcst_ev + ano_obs <- data$obs + ano_obs$data <- res$ano_obs_ev + ## TODO: Remove objects? + + info(recipe$Run$logger, + "#### Anomalies and Probabilities Done #####") + if (recipe$Analysis$Workflow$Anomalies$save != 'none') { + info(recipe$Run$logger, "##### START SAVING ANOMALIES #####") + recipe$Run$output_dir <- paste0(recipe$Run$output_dir, + "/outputs/Anomalies/") + # Save forecast + if ((recipe$Analysis$Workflow$Anomalies$save %in% + c('all', 'exp_only', 'fcst_only')) && !is.null(data$fcst)) { + save_forecast(recipe = recipe, data_cube = data$fcst, type = 'fcst') + } + # Save hindcast + if (recipe$Analysis$Workflow$Anomalies$save %in% + c('all', 'exp_only')) { + save_forecast(recipe = recipe, data_cube = ano_hcst, type = 'hcst') + } + # Save observation + if (recipe$Analysis$Workflow$Anomalies$save == 'all') { + save_observations(recipe = recipe, data_cube = ano_obs) + } + } + # Save probability bins + probs_hcst <- list() + probs_fcst <- list() + probs_obs <- list() + all_names <- NULL + + for (ps in 1:length(categories)) { + for (perc in 1:(length(categories[[ps]]) + 1)) { + if (perc == 1) { + name_elem <- paste0("below_", categories[[ps]][perc]) + } else if (perc == length(categories[[ps]]) + 1) { + name_elem <- paste0("above_", categories[[ps]][perc-1]) + } else { + name_elem <- paste0("from_", categories[[ps]][perc-1], + "_to_", categories[[ps]][perc]) + } + probs_hcst <- append(probs_hcst, + list(Subset(hcst_probs_ev[[ps]], + along = 'bin', + indices = perc, + drop = 'selected'))) + probs_obs <- append(probs_obs, + list(Subset(obs_probs_ev[[ps]], + along = 'bin', + indices = perc, + drop = 'selected'))) + if (!is.null(data$fcst)) { + probs_fcst <- append(probs_fcst, + list(Subset(fcst_probs[[ps]], + along = 'bin', + indices = perc, + drop = 'selected'))) + } + all_names <- c(all_names, name_elem) + } + } + ## TODO: Apply .drop_dims() directly to hcst_probs_ev etc; and move + ## reorganizing and renaming to save_probabilities() + names(probs_hcst) <- all_names + probs_hcst <- lapply(probs_hcst, .drop_dims) + names(probs_obs) <- all_names + probs_obs <- lapply(probs_obs, .drop_dims) + if (!is.null(data$fcst)) { + names(probs_fcst) <- all_names + probs_fcst <- lapply(probs_fcst, .drop_dims) + } + + if (recipe$Analysis$Workflow$Probabilities$save %in% + c('all', 'bins_only')) { + save_probabilities(recipe = recipe, probs = probs_hcst, + data_cube = data$hcst, agg = agg, + type = "hcst") + save_probabilities(recipe = 
recipe, probs = probs_obs,
+                       data_cube = data$hcst, agg = agg,
+                       type = "obs")
+    if (!is.null(probs_fcst)) {
+      save_probabilities(recipe = recipe, probs = probs_fcst,
+                         data_cube = data$fcst, agg = agg,
+                         type = "fcst")
+    }
+  }
+  # Save ensemble mean for multimodel option:
+  hcst_EM <- MeanDims(ano_hcst$data, 'ensemble', drop = T)
+#  save_metrics(recipe = recipe,
+#               metrics = list(hcst_EM =
+#               Subset(hcst_EM, along = 'dat', indices = 1, drop = 'selected')),
+#               data_cube = data$hcst, agg = agg,
+#               module = "statistics")
+#  fcst_EM <- NULL
+  if (!is.null(data$fcst)) {
+    fcst_EM <- MeanDims(data$fcst$data, 'ensemble', drop = T)
+#    save_metrics(recipe = recipe,
+#                 metrics = list(fcst_EM =
+#                 Subset(fcst_EM, along = 'dat', indices = 1, drop = 'selected')),
+#                 data_cube = data$fcst, agg = agg,
+#                 module = "statistics")
+  }
+  return(list(hcst = ano_hcst, obs = ano_obs, fcst = data$fcst,
+              hcst.full_val = data$hcst, obs.full_val = data$obs,
+              hcst_EM = hcst_EM, fcst_EM = fcst_EM,
+              #cat_lims = list(hcst_tr = res$lims_ano_hcst_tr,
+              #                obs_tr = res$lims_ano_obs_tr),
+              probs = list(hcst_ev = hcst_probs_ev,
+                           obs_ev = obs_probs_ev,
+                           fcst = fcst_probs),
+              ref_obs_tr = ano_obs_tr_res))
+}
+
+
+## The result contains the inputs for Skill_full_crossval.
+## This is a list with the required elements:
+  ## probs is a list with
+  ## probs$hcst_ev and probs$obs_ev
+  ## probs$hcst_ev will have as many elements as $Probabilities$percentiles
+  ## each element will be an array with a 'cat' dimension
+  ## the same applies to probs$obs_ev
+  ## hcst is a s2dv_cube for the post-processed hindcast for the evaluation indices
+  ## in this case cross-validated anomalies
+  ## obs is a s2dv_cube for the post-processed obs
+  ## in this case cross-validated anomalies
+  ## fcst is a s2dv_cube for the post-processed fcst
+  ## in this case anomalies relative to the full hindcast-period climatology
+  ## this object is not required for skill assessment
+  ## hcst.full_val and obs.full_val are the original data to compute mean bias
+  ## cat_lims are the limits used to compute the probabilities
+  ## this object is not required for skill assessment
+  ## ref_obs_tr is an array with the cross-validated observed anomalies
+  ## to be used as reference forecast in the CRPSS and CRPS_clim
+  ## it is computed from the training indices
+
diff --git a/modules/Crossval/R/tmp/CST_MergeDims.R b/modules/Crossval/R/tmp/CST_MergeDims.R
new file mode 100644
index 00000000..bb06bf60
--- /dev/null
+++ b/modules/Crossval/R/tmp/CST_MergeDims.R
@@ -0,0 +1,162 @@
+#'Function to Merge Dimensions
+#'
+#'@author Nuria Perez-Zanon, \email{nuria.perez@bsc.es}
+#'
+#'@description This function merges two dimensions of the array \code{data} of a
+#''s2dv_cube' object into one. The user can select the dimensions to merge and
+#'provide the final name of the dimension. The user can select to remove NA
+#'values or keep them.
+#'
+#'@param data An 's2dv_cube' object.
+#'@param merge_dims A character vector indicating the names of the dimensions to
+#'  merge.
+#'@param rename_dim A character string indicating the name of the output
+#'  dimension. If left at NULL, the first dimension name provided in parameter
+#'  \code{merge_dims} will be used.
+#'@param na.rm A logical indicating if the NA values should be removed or not.
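+#'  The default value is FALSE.
+#'@return An 's2dv_cube' object with the two selected dimensions merged into
+#'  one; the 'dims', 'coords' and 'attrs$Dates' elements are updated
+#'  accordingly.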
+#'
+#'@examples
+#'data <- 1 : c(2 * 3 * 4 * 5 * 6 * 7)
+#'dim(data) <- c(time = 7, lat = 2, lon = 3, monthly = 4, member = 6,
+#'              dataset = 5, var = 1)
+#'data[2,,,,,,] <- NA
+#'data[c(3,27)] <- NA
+#'data <- list(data = data)
+#'class(data) <- 's2dv_cube'
+#'new_data <- CST_MergeDims(data, merge_dims = c('time', 'monthly'))
+#'new_data <- CST_MergeDims(data, merge_dims = c('lon', 'lat'), rename_dim = 'grid')
+#'new_data <- CST_MergeDims(data, merge_dims = c('time', 'monthly'), na.rm = TRUE)
+#'@export
+CST_MergeDims <- function(data, merge_dims = c('ftime', 'monthly'),
+                          rename_dim = NULL, na.rm = FALSE) {
+  # Check 's2dv_cube'
+  if (!inherits(data, 's2dv_cube')) {
+    stop("Parameter 'data' must be of the class 's2dv_cube'.")
+  }
+  if (is.null(rename_dim)) {
+    rename_dim <- merge_dims[1]
+  }
+  # data
+  data$data <- MergeDims(data$data, merge_dims = merge_dims,
+                         rename_dim = rename_dim, na.rm = na.rm)
+  # dims
+  data$dims <- dim(data$data)
+
+  # rename_dim
+  if (length(rename_dim) > 1) {
+    rename_dim <- as.character(rename_dim[1])
+  }
+  # coords
+  data$coords[merge_dims] <- NULL
+  data$coords[[rename_dim]] <- 1:dim(data$data)[rename_dim]
+  attr(data$coords[[rename_dim]], 'indices') <- TRUE
+
+  # attrs
+  if (all(merge_dims %in% names(dim(data$attrs$Dates)))) {
+    original_timezone <- attr(data$attrs$Dates[1], "tzone")
+    data$attrs$Dates <- MergeDims(data$attrs$Dates, merge_dims = merge_dims,
+                                  rename_dim = rename_dim, na.rm = na.rm)
+    # Transform dates back to POSIXct
+    data$attrs$Dates <- as.POSIXct(data$attrs$Dates,
+                                   origin = "1970-01-01",
+                                   tz = original_timezone)
+
+  } else if (any(merge_dims %in% names(dim(data$attrs$Dates)))) {
+    warning("The dimensions of 'Dates' array will be different from ",
+            "the temporal dimensions in 'data'. Parameter 'merge_dims' ",
+            "only includes one temporal dimension of 'Dates'.")
+  }
+  return(data)
+}
+#'Function to Merge Dimensions
+#'
+#'@author Nuria Perez-Zanon, \email{nuria.perez@bsc.es}
+#'
+#'@description This function merges two dimensions of an array into one. The
+#'user can select the dimensions to merge and provide the final name of the
+#'dimension. The user can select to remove NA values or keep them.
+#'
+#'@param data An n-dimensional array with named dimensions.
+#'@param merge_dims A character vector indicating the names of the dimensions to
+#'  merge.
+#'@param rename_dim A character string indicating the name of the output
+#'  dimension. If left at NULL, the first dimension name provided in parameter
+#'  \code{merge_dims} will be used.
+#'@param na.rm A logical indicating if the NA values should be removed or not.
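+#'  The default value is FALSE.
+#'@return A named numerical array with the two selected dimensions merged into
+#'  one, renamed as \code{rename_dim}.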
+#'
+#'@examples
+#'data <- 1 : 20
+#'dim(data) <- c(time = 10, lat = 2)
+#'new_data <- MergeDims(data, merge_dims = c('time', 'lat'))
+#'@import abind
+#'@importFrom ClimProjDiags Subset
+#'@export
+MergeDims <- function(data, merge_dims = c('time', 'monthly'),
+                      rename_dim = NULL, na.rm = FALSE) {
+  # check data
+  if (is.null(data)) {
+    stop("Parameter 'data' cannot be NULL.")
+  }
+  if (is.null(dim(data))) {
+    stop("Parameter 'data' must have dimensions.")
+  }
+  if (is.null(names(dim(data)))) {
+    stop("Parameter 'data' must have dimension names.")
+  }
+  dims <- dim(data)
+  # check merge_dims
+  if (is.null(merge_dims)) {
+    stop("Parameter 'merge_dims' cannot be NULL.")
+  }
+  if (!is.character(merge_dims)) {
+    stop("Parameter 'merge_dims' must be a character vector ",
+         "indicating the names of the dimensions to be merged.")
+  }
+  if (length(merge_dims) > 2) {
+    warning("Only two dimensions can be merged; only the first two ",
+            "dimensions will be used. To merge further dimensions, ",
+            "consider using this function multiple times.")
+    merge_dims <- merge_dims[1 : 2]
+  } else if (length(merge_dims) < 2) {
+    stop("Parameter 'merge_dims' must be of length two.")
+  }
+  if (is.null(rename_dim)) {
+    rename_dim <- merge_dims[1]
+  }
+  if (length(rename_dim) > 1) {
+    warning("Parameter 'rename_dim' has length greater than 1 ",
+            "and only the first element will be used.")
+    rename_dim <- as.character(rename_dim[1])
+  }
+  if (!any(names(dims) %in% merge_dims)) {
+    stop("Parameter 'merge_dims' must match the dimension ",
+         "names in parameter 'data'.")
+  }
+  pos1 <- which(names(dims) == merge_dims[1])
+  pos2 <- which(names(dims) == merge_dims[2])
+  if (length(pos1) == 0 | length(pos2) == 0) {
+    stop("Parameter 'merge_dims' must match the dimension ",
+         "names in parameter 'data'.")
+  }
+  if (pos1 > pos2) {
+    pos1 <- pos1 - 1
+  }
+  data <- lapply(1:dims[pos2], function(x) {Subset(data, along = pos2,
+                                           indices = x, drop = 'selected')})
+  data <- abind(data, along = pos1)
+  names(dim(data)) <- names(dims)[-pos2]
+  if (!is.null(rename_dim)) {
+    names(dim(data))[pos1] <- rename_dim
+  }
+  if (na.rm) {
+    nas <- which(is.na(Subset(data, along = -pos1, indices = 1)))
+    if (length(nas) != 0) {
+      nas <- unlist(lapply(nas, function(x) {
+               if (all(is.na(Subset(data, along = pos1,
+                                    indices = x)))) {
+                 return(x)}}))
+      data <- Subset(data, along = pos1, indices = -nas)
+    }
+  }
+  return(data)
+}
diff --git a/modules/Crossval/R/tmp/GetProbs.R b/modules/Crossval/R/tmp/GetProbs.R
new file mode 100644
index 00000000..fb2cda0c
--- /dev/null
+++ b/modules/Crossval/R/tmp/GetProbs.R
@@ -0,0 +1,351 @@
+#'Compute probabilistic forecasts or the corresponding observations
+#'
+#'Compute probabilistic forecasts from an ensemble based on the relative
+#'thresholds, or the probabilistic observations (i.e., which probabilistic
+#'category was observed). A reference period can be specified to calculate the
+#'absolute thresholds between each probabilistic category. The absolute
+#'thresholds can be computed in cross-validation mode. If the data is an
+#'ensemble, the probabilities are calculated as the percentage of members that
+#'fall into each category. For observations (or a forecast without member
+#'dimension), 1 means that the event happened, while 0 indicates that the event
+#'did not happen. Weighted probabilities can be computed if the weights are
+#'provided for each ensemble member and time step. The absolute thresholds can
+#'also be provided directly for the calculation of the probabilities.
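+#'The core computation is performed by the internal .GetProbs(), which is also
+#'used by RPS, RPSS and ROCSS.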
+#'
+#'@param data A named numerical array of the forecasts or observations with, at
+#'  least, a time dimension.
+#'@param time_dim A character string indicating the name of the time dimension.
+#'  The default value is 'sdate'.
+#'@param memb_dim A character string indicating the name of the member dimension
+#'  to compute the probabilities of the forecast, or NULL if there is no member
+#'  dimension (e.g., for observations, or for a forecast with only one ensemble
+#'  member). The default value is 'member'.
+#'@param prob_thresholds A numeric vector of the relative thresholds (from 0 to
+#'  1) between the categories. The default value is c(1/3, 2/3), which
+#'  corresponds to tercile equiprobable categories.
+#'@param abs_thresholds A numeric array or vector of the absolute thresholds in
+#'  the same units as \code{data}. If an array is provided, it should have at
+#'  least the 'bin_dim_abs' dimension. If it has more dimensions (e.g. different
+#'  thresholds for different locations, i.e. lon and lat dimensions), they
+#'  should match the dimensions of \code{data}, except the member dimension,
+#'  which should not be included. The default value is NULL and, in this case,
+#'  'prob_thresholds' is used for calculating the probabilities.
+#'@param bin_dim_abs A character string of the dimension name of the
+#'  'abs_thresholds' array in which the category limits are stored. It will also
+#'  be the probabilistic category dimension name in the output. The default
+#'  value is 'bin'.
+#'@param indices_for_quantiles A vector of the indices to be taken along
+#'  'time_dim' for computing the absolute thresholds between the probabilistic
+#'  categories. If NULL (default), the whole period is used. It is only used
+#'  when 'prob_thresholds' is provided.
+#'@param weights A named numerical array of the weights for 'data' with
+#'  dimensions 'time_dim' and 'memb_dim' (if 'data' has them). The default value
+#'  is NULL. The ensemble should have at least 70 members, or span at least 10
+#'  time steps and have more than 45 members, if consistency between the
+#'  weighted and unweighted methodologies is desired.
+#'@param cross.val A logical indicating whether to compute the thresholds
+#'  between probabilistic categories in cross-validation mode. The default value
+#'  is FALSE.
+#'@param ncores An integer indicating the number of cores to use for parallel
+#'  computation. The default value is NULL.
+#'
+#'@return
+#'A numerical array of probabilities with dimensions c(bin_dim_abs, the rest of
+#'the dimensions of 'data' except 'memb_dim'). The 'bin' dimension has the
+#'length of the number of probabilistic categories, i.e.,
+#'\code{length(prob_thresholds) + 1}.
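+#'If 'abs_thresholds' is provided instead, the 'bin' dimension has the length
+#'of the 'bin_dim_abs' dimension of 'abs_thresholds' plus one.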
+#' +#'@examples +#'data <- array(rnorm(2000), dim = c(ensemble = 25, sdate = 20, time = 4)) +#'res <- GetProbs(data = data, time_dim = 'sdate', memb_dim = 'ensemble', +#' indices_for_quantiles = 4:17) +#' +#'# abs_thresholds is provided +#'abs_thr1 <- c(-0.2, 0.3) +#'abs_thr2 <- array(c(-0.2, 0.3) + rnorm(40) * 0.1, dim = c(cat = 2, sdate = 20)) +#'res1 <- GetProbs(data = data, time_dim = 'sdate', memb_dim = 'ensemble', +#' prob_thresholds = NULL, abs_thresholds = abs_thr1) +#'res2 <- GetProbs(data = data, time_dim = 'sdate', memb_dim = 'ensemble', +#' prob_thresholds = NULL, abs_thresholds = abs_thr2, bin_dim_abs = 'cat') +#' +#'@import multiApply +#'@importFrom easyVerification convert2prob +#'@export +GetProbs <- function(data, time_dim = 'sdate', memb_dim = 'member', + indices_for_quantiles = NULL, + prob_thresholds = c(1/3, 2/3), abs_thresholds = NULL, + bin_dim_abs = 'bin', weights = NULL, cross.val = FALSE, ncores = NULL) { + + # Check inputs + ## data + if (is.null(data)) { + stop("Parameter 'data' cannot be NULL.") + } + if (!is.numeric(data)) { + stop("Parameter 'data' must be a numeric array.") + } + if (any(is.null(names(dim(data)))) | any(nchar(names(dim(data))) == 0)) { + stop("Parameter 'data' must have dimension names.") + } + ## time_dim + if (!is.character(time_dim) | length(time_dim) != 1) + stop('Parameter "time_dim" must be a character string.') + if (!time_dim %in% names(dim(data))) { + stop("Parameter 'time_dim' is not found in 'data' dimensions.") + } + ## memb_dim + if (!is.null(memb_dim)) { + if (!is.character(memb_dim) | length(memb_dim) > 1) { + stop("Parameter 'memb_dim' must be a character string.") + } + if (!memb_dim %in% names(dim(data))) { + stop("Parameter 'memb_dim' is not found in 'data' dimensions. If no member ", + "dimension exists, set it as NULL.") + } + } + ## bin_dim_abs + if (!is.character(bin_dim_abs) | length(bin_dim_abs) != 1) { + stop('Parameter "bin_dim_abs" must be a character string.') + } + ## prob_thresholds, abs_thresholds + if (!is.null(abs_thresholds) & !is.null(prob_thresholds)) { + .warning(paste0("Parameters 'prob_thresholds' and 'abs_thresholds' are both provided. 
", + "Only the first one is used.")) + abs_thresholds <- NULL + } else if (is.null(abs_thresholds) & is.null(prob_thresholds)) { + stop("One of the parameters 'prob_thresholds' and 'abs_thresholds' must be provided.") + } + if (!is.null(prob_thresholds)) { + if (!is.numeric(prob_thresholds) | !is.vector(prob_thresholds) | + any(prob_thresholds <= 0) | any(prob_thresholds >= 1)) { + stop("Parameter 'prob_thresholds' must be a numeric vector between 0 and 1.") + } + ## indices_for_quantiles + if (is.null(indices_for_quantiles)) { + indices_for_quantiles <- seq_len(dim(data)[time_dim]) + } else { + if (!is.numeric(indices_for_quantiles) | !is.vector(indices_for_quantiles)) { + stop("Parameter 'indices_for_quantiles' must be NULL or a numeric vector.") + } else if (length(indices_for_quantiles) > dim(data)[time_dim] | + max(indices_for_quantiles) > dim(data)[time_dim] | + any(indices_for_quantiles < 1)) { + stop("Parameter 'indices_for_quantiles' should be the indices of 'time_dim'.") + } + } + + } else { # abs_thresholds + + if (is.null(dim(abs_thresholds))) { # a vector + dim(abs_thresholds) <- length(abs_thresholds) + names(dim(abs_thresholds)) <- bin_dim_abs + } + # bin_dim_abs + if (!(bin_dim_abs %in% names(dim(abs_thresholds)))) { + stop("Parameter abs_thresholds' can be a vector or array with 'bin_dim_abs' dimension.") + } + if (!is.null(memb_dim) && memb_dim %in% names(dim(abs_thresholds))) { + stop("Parameter abs_thresholds' cannot have member dimension.") + } + dim_name_abs <- names(dim(abs_thresholds))[which(names(dim(abs_thresholds)) != bin_dim_abs)] + if (!all(dim_name_abs %in% names(dim(data)))) { + stop("Parameter 'abs_thresholds' dimensions except 'bin_dim_abs' must be in 'data' as well.") + } else { + if (any(dim(abs_thresholds)[dim_name_abs] != dim(data)[dim_name_abs])) { + stop("Parameter 'abs_thresholds' dimensions must have the same length as 'data'.") + } + } + if (!is.null(indices_for_quantiles)) { + warning("Parameter 'indices_for_quantiles' is not used when 'abs_thresholds' are provided.") + } + abs_target_dims <- bin_dim_abs + if (time_dim %in% names(dim(abs_thresholds))) { + abs_target_dims <- c(bin_dim_abs, time_dim) + } + + } + + ## weights + if (!is.null(weights)) { + if (!is.array(weights) | !is.numeric(weights)) + stop("Parameter 'weights' must be a named numeric array.") + +# if (is.null(dat_dim)) { + if (!is.null(memb_dim)) { + lendim_weights <- 2 + namesdim_weights <- c(time_dim, memb_dim) + } else { + lendim_weights <- 1 + namesdim_weights <- c(time_dim) + } + if (length(dim(weights)) != lendim_weights | + !all(names(dim(weights)) %in% namesdim_weights)) { + stop("Parameter 'weights' must have dimension ", + paste0(namesdim_weights, collapse = ' and '), ".") + } + if (any(dim(weights)[namesdim_weights] != dim(data)[namesdim_weights])) { + stop("Parameter 'weights' must have the same dimension length as ", + paste0(namesdim_weights, collapse = ' and '), " dimension in 'data'.") + } + weights <- Reorder(weights, namesdim_weights) + +# } else { +# if (length(dim(weights)) != 3 | +# any(!names(dim(weights)) %in% c(memb_dim, time_dim, dat_dim))) +# stop("Parameter 'weights' must have three dimensions with the names of ", +# "'memb_dim', 'time_dim' and 'dat_dim'.") +# if (dim(weights)[memb_dim] != dim(exp)[memb_dim] | +# dim(weights)[time_dim] != dim(exp)[time_dim] | +# dim(weights)[dat_dim] != dim(exp)[dat_dim]) { +# stop(paste0("Parameter 'weights' must have the same dimension lengths ", +# "as 'memb_dim', 'time_dim' and 'dat_dim' in 'exp'.")) +# } +# weights <- 
Reorder(weights, c(time_dim, memb_dim, dat_dim)) +# } + } + ## cross.val + if (!is.logical(cross.val) | length(cross.val) > 1) { + stop("Parameter 'cross.val' must be either TRUE or FALSE.") + } + ## ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | + length(ncores) > 1) { + stop("Parameter 'ncores' must be either NULL or a positive integer.") + } + } + ############################### + if (is.null(abs_thresholds)) { + res <- Apply(data = list(data = data), + target_dims = c(time_dim, memb_dim), + output_dims = c(bin_dim_abs, time_dim), + fun = .GetProbs, + prob_thresholds = prob_thresholds, + indices_for_quantiles = indices_for_quantiles, + weights = weights, cross.val = cross.val, ncores = ncores)$output1 + } else { + res <- Apply(data = list(data = data, abs_thresholds = abs_thresholds), + target_dims = list(c(time_dim, memb_dim), abs_target_dims), + output_dims = c(bin_dim_abs, time_dim), + fun = .GetProbs, + prob_thresholds = NULL, + indices_for_quantiles = NULL, + weights = NULL, cross.val = FALSE, ncores = ncores)$output1 + } + + return(res) +} + +.GetProbs <- function(data, indices_for_quantiles, + prob_thresholds = c(1/3, 2/3), abs_thresholds = NULL, + weights = NULL, cross.val = FALSE) { + # .GetProbs() is used in RPS, RPSS, ROCSS + # data + ## if data is exp: [sdate, memb] + ## if data is obs: [sdate, (memb)] + # weights: [sdate, (memb)], same as data + # if abs_thresholds is not NULL: [bin, (sdate)] + + # Add dim [memb = 1] to data if it doesn't have memb_dim + if (length(dim(data)) == 1) { + dim(data) <- c(dim(data), 1) + if (!is.null(weights)) dim(weights) <- c(dim(weights), 1) + } + + # Calculate absolute thresholds + if (is.null(abs_thresholds)) { + if (cross.val) { + quantiles <- array(NA, dim = c(bin = length(prob_thresholds), sdate = dim(data)[1])) + for (i_time in seq_len(dim(data)[1])) { + if (is.null(weights)) { + tmp <- which(indices_for_quantiles != i_time) + quantiles[, i_time] <- + quantile(x = as.vector(data[indices_for_quantiles[tmp], ]), + probs = prob_thresholds, type = 8, na.rm = TRUE) + } else { + # weights: [sdate, memb] + tmp <- which(indices_for_quantiles != i_time) + sorted_arrays <- + .sorted_distributions(data[indices_for_quantiles[tmp], ], + weights[indices_for_quantiles[tmp], ]) + sorted_data <- sorted_arrays$data + cumulative_weights <- sorted_arrays$cumulative_weights + quantiles[, i_time] <- approx(cumulative_weights, sorted_data, + prob_thresholds, "linear")$y + } + } + + } else { + if (is.null(weights)) { + quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), + probs = prob_thresholds, type = 8, na.rm = TRUE) + } else { + # weights: [sdate, memb] + sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], + weights[indices_for_quantiles, ]) + sorted_data <- sorted_arrays$data + cumulative_weights <- sorted_arrays$cumulative_weights + quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y + } + quantiles <- array(rep(quantiles, dim(data)[1]), + dim = c(bin = length(quantiles), dim(data)[1])) + } + + } else { # abs_thresholds provided + quantiles <- abs_thresholds + if (length(dim(quantiles)) == 1) { + quantiles <- InsertDim(quantiles, lendim = dim(data)[1], + posdim = 2, name = names(dim(data))[1]) + } + } + # quantiles: [bin-1, sdate] + # Probabilities + probs <- array(dim = c(dim(quantiles)[1] + 1, dim(data)[1])) # [bin, sdate] + for (i_time in seq_len(dim(data)[1])) { + if (anyNA(data[i_time, ])) { + probs[, i_time] <- rep(NA, dim = 
dim(quantiles)[1] + 1) + } else { + if (is.null(weights)) { + probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], + threshold = quantiles[, i_time])) + } else { + sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ]) + sorted_data <- sorted_arrays$data + cumulative_weights <- sorted_arrays$cumulative_weights + # find any quantiles that are outside the data range + integrated_probs <- array(dim = dim(quantiles)) + for (i_quant in seq_len(dim(quantiles)[1])) { + # for thresholds falling under the distribution + if (quantiles[i_quant, i_time] < min(sorted_data)) { + integrated_probs[i_quant, i_time] <- 0 + # for thresholds falling over the distribution + } else if (max(sorted_data) < quantiles[i_quant, i_time]) { + integrated_probs[i_quant, i_time] <- 1 + } else { + integrated_probs[i_quant, i_time] <- approx(sorted_data, cumulative_weights, + quantiles[i_quant, i_time], "linear")$y + } + } + probs[, i_time] <- append(integrated_probs[, i_time], 1) - + append(0, integrated_probs[, i_time]) + if (min(probs[, i_time]) < 0 | max(probs[, i_time]) > 1) { + stop("Probability in i_time = ", i_time, " is out of [0, 1].") + } + } + } + } + + return(probs) +} + +.sorted_distributions <- function(data_vector, weights_vector) { + weights_vector <- as.vector(weights_vector) + data_vector <- as.vector(data_vector) + weights_vector <- weights_vector / sum(weights_vector) # normalize to 1 + sorter <- order(data_vector) + sorted_weights <- weights_vector[sorter] + cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights + cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0 + cumulative_weights <- cumulative_weights / + cumulative_weights[length(cumulative_weights)] # fix the 1 + return(list(data = data_vector[sorter], cumulative_weights = cumulative_weights)) +} + diff --git a/recipe_ecvs_ano_seas.yml b/recipe_ecvs_ano_seas.yml new file mode 100644 index 00000000..890f5366 --- /dev/null +++ b/recipe_ecvs_ano_seas.yml @@ -0,0 +1,108 @@ +Description: + Author: nperez + Info: ECVs Oper ESS ECMWF SEAS5 Seasonal Forecast recipe (monthly mean, tas) + +Analysis: + Horizon: seasonal # Mandatory, str: either subseasonal, seasonal, or decadal + Variables: + - {name: 'prlr', freq: 'monthly_mean', units: 'C'} + #- {name: 'prlr', freq: 'monthly_mean', units: 'mm', flux: yes} + Datasets: + System: + #- {name: 'Meteo-France-System8'} + #- {name: 'CMCC-SPS3.5'} + - {name: 'ECMWF-SEAS5.1'} + #- {name: 'UK-MetOffice-Glosea603'} + ##- {name: 'NCEP-CFSv2'} + #- {name: 'DWD-GCFS2.1'} + #- {name: 'ECCC-GEM5.2-NEMO'} + # name: Meteo-France-System7 #ECMWF-SEAS5.1 #ECMWF-i2o2 #UKMO-System602 #Meteo-France-System8 + Multimodel: + execute: no + approach: pooled # Mandatory, bool: Either yes/true or no/false + createFrom: Anomalies + Reference: + name: ERA5 # Mandatory, str: Reference codename. See docu. + Time: + sdate: '1201' + fcst_year: '2024' + hcst_start: '1993' # Mandatory, int: Hindcast start year 'YYYY' + hcst_end: '2016' # Mandatory, int: Hindcast end year 'YYYY' + ftime_min: 1 # Mandatory, int: First leadtime time step in months + ftime_max: 6 # Mandatory, int: Last leadtime time step in months + Region: + - {name: 'Global', latmin: -90, latmax: 90, lonmin: -180, lonmax: 179.9} + #- {name: 'Global', latmin: 0, latmax: 10, lonmin: 0, lonmax: 10} + Regrid: + method: conservative # Mandatory, str: Interpolation method. See docu. 
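+    # Note (assumed semantics, see docu.): 'to_system' regrids the reference
+    # onto the system grid; 'to_reference' does the opposite.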
+ type: to_system + Workflow: + Anomalies: + compute: yes + cross_validation: no + save: none + Time_aggregation: + execute: yes + method: average + ini: [1, 2, 3, 4] + end: [3, 4, 5, 6] + Calibration: + method: raw #crps_min #evmos # Mandatory, str: Calibration method. See docu. + cross_validation: yes + save: none + Skill: + metric: rpss + save: 'all' + cross_validation: yes + Probabilities: + percentiles: [[1/3, 2/3]] # frac: Quantile thresholds. + save: 'all' + Indicators: + index: no + Visualization: + plots: skill_metrics forecast_ensemble_mean most_likely_terciles + multi_panel: no + mask_terciles: 'both' + mask_ens: 'both' + projection: Robinson + file_format: 'PNG' + Scorecards: + execute: no # yes/no + regions: + Extra-tropical NH: {lon.min: 0, lon.max: 360, lat.min: 30, lat.max: 90} + Tropics: {lon.min: 0, lon.max: 360, lat.min: -30, lat.max: 30} + Extra-tropical SH : {lon.min: 0, lon.max: 360, lat.min: -90, lat.max: -30} + start_months: NULL + metric: mean_bias enscorr rpss crpss EnsSprErr + metric_aggregation: 'skill' + #inf_to_na: yes + table_label: NULL + fileout_label: NULL + col1_width: NULL + col2_width: NULL + calculate_diff: FALSE + ncores: 30 # Optional, int: number of cores, defaults to 1 + remove_NAs: yes # Optional, bool: Whether NAs are removed, defaults to FALSE + alpha: 0.05 + Output_format: scorecards + logo: yes +Run: + Loglevel: INFO + Terminal: yes + filesystem: esarchive + output_dir: /home/Earth/vagudets/sunset-outputs/ # replace with the directory where you want to save the outputs + code_dir: /home/bsc/bsc032339/sunset/ # replace with the directory where your code is + autosubmit: no + # fill only if using autosubmit + auto_conf: + script: /esarchive/scratch/nperez/git3/sunset/full_ecvs_scorecards.R # replace with the path to your script + expid: a68v # replace with your EXPID + hpc_user: bsc32339 # replace with your hpc username + wallclock: 02:00 # hh:mm + processors_per_job: 4 + platform: nord3v2 + email_notifications: yes # enable/disable email notifications. Change it if you want to. 
+ email_address: nuria.perez@bsc.es # replace with your email address + notify_completed: yes # notify me by email when a job finishes + notify_failed: yes # notify me by email when a job fails + -- GitLab From 2ee81a507e17b8fdc17edfbf7ec121558b9812e1 Mon Sep 17 00:00:00 2001 From: vagudets Date: Wed, 16 Apr 2025 15:01:02 +0200 Subject: [PATCH 02/22] Crossval_anomalies_Apply working version --- modules/Crossval/Crossval_anomalies_Apply.R | 50 +++++++++++++++------ 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/modules/Crossval/Crossval_anomalies_Apply.R b/modules/Crossval/Crossval_anomalies_Apply.R index 3ba5e6c4..7bd4546d 100644 --- a/modules/Crossval/Crossval_anomalies_Apply.R +++ b/modules/Crossval/Crossval_anomalies_Apply.R @@ -35,7 +35,7 @@ Crossval_anomalies_Apply <- function(recipe, data) { # subset years: Subset works at BSC not at Athos ## training indices cross <- cross[[indices]] - print(paste("Index:", cross$eval.dexes)) + print(paste("Crossval Index:", cross$eval.dexes)) obs_tr <- Subset(obs, along = 'syear', indices = cross$train.dexes) hcst_tr <- Subset(exp, along = 'syear', @@ -74,14 +74,14 @@ Crossval_anomalies_Apply <- function(recipe, data) { lapply(prob_lims, function(ps) { as.array(quantile(as.vector(x), ps, na.rm = na.rm))})}, - prob_lims = categories, + prob_lims = list(unlist(categories)), ncores = ncores)$output1 lims_ano_obs_tr <- Apply(ano_obs_tr, target_dims = c('syear', 'ensemble'), fun = function(x, prob_lims) { lapply(prob_lims, function(ps) { as.array(quantile(as.vector(x), ps, na.rm = na.rm))})}, - prob_lims = categories, + prob_lims = list(unlist(categories)), ncores = ncores)$output1 if (tolower(recipe$Analysis$Horizon) == 'subseasonal') { ano_hcst_tr <- SplitDim(ano_hcst_tr, split_dim = 'syear', @@ -91,7 +91,6 @@ Crossval_anomalies_Apply <- function(recipe, data) { ano_obs_tr <- SplitDim(obs_tr, split_dim = 'syear', new_dim_name = 'sday', indices = rep(1:dim(data$hcst$data)['sday'], length(cross[[t]]$train.dexes))) - browser() } ## Objects to return: # lims_ano_hcst_tr: hcst percentiles (per category) @@ -118,7 +117,9 @@ Crossval_anomalies_Apply <- function(recipe, data) { exp = names(dim(data$hcst$data))), output_dims = list(ano_hcst_ev = output_dims, ano_obs_ev = output_dims, - ano_obs_tr = output_dims, + ano_obs_tr = c("dat", "var", "sday", "sweek", + "ensemble", "time", "latitude", "longitude", + "unneeded"), lims_ano_hcst_tr = lims_output_dims, lims_ano_obs_tr = lims_output_dims), fun = .cross_anomalies, @@ -130,16 +131,32 @@ Crossval_anomalies_Apply <- function(recipe, data) { info(recipe$Run$logger, "#### Anomalies Cross-validation loop ended #####") + # Rearrange limits into a list: + lims_tmp_hcst <- list() + lims_tmp_obs <- list() + last_index <- 0 + for (ps in 1:length(categories)) { + indices <- last_index + 1:length(categories[[ps]]) + lims_category_hcst <- Subset(res$lims_ano_hcst_tr, along = "bin", + indices = list(indices), + drop = FALSE) + lims_tmp_hcst[[ps]] <- lims_category_hcst + lims_category_obs <- Subset(res$lims_ano_obs_tr, along = "bin", + indices = list(indices), + drop = FALSE) + lims_tmp_obs[[ps]] <- lims_category_obs + last_index <- indices[length(indices)] + } + res$lims_ano_hcst_tr <- lims_tmp_hcst + res$lims_ano_obs_tr <- lims_tmp_obs + rm(lims_tmp_hcst, lims_tmp_obs) + gc() + ## TODO: Sort out ano_obs_tr dims # To make crps_clim to work the reference forecast need to have same dims as obs: res$ano_obs_tr <- Subset(res$ano_obs_tr, along = 'unneeded', indices = 1, drop = 'selected') - # Make categories 
rounded number to use as names: - categories <- recipe$Analysis$Workflow$Probabilities$percentiles - categories <- lapply(categories, function (x) { - sapply(x, function(y) { - round(eval(parse(text = y)), 2)})}) recipe$Run$output_dir <- paste0(recipe$Run$output_dir, "/outputs/Anomalies/") @@ -153,6 +170,7 @@ Crossval_anomalies_Apply <- function(recipe, data) { data$fcst$data <- Ano(data = data$fcst$data, clim = clim_hcst) hcst_ano <- Ano(data = data$hcst$data, clim = clim_hcst) # Terciles limits using the whole hindcast period: + print(categories) lims_fcst <- Apply(hcst_ano, target_dims = c('syear', 'ensemble'), fun = function(x, prob_lims) { lapply(prob_lims, function(ps) { @@ -191,7 +209,11 @@ Crossval_anomalies_Apply <- function(recipe, data) { agg = "global", outdir = NULL) } } - } ## TODO: Should this loop end here? + } + + ## TODO: Should this happen here? + # Make categories rounded number to use as names: + categories <- lapply(categories, round, digits = 2) # Compute Probabilities fcst_probs <- lapply(categories, function(x){NULL}) @@ -204,14 +226,14 @@ Crossval_anomalies_Apply <- function(recipe, data) { bin_dim_abs = 'bin', indices_for_quantiles = NULL, memb_dim = 'ensemble', - abs_thresholds = lims_ano_hcst_tr_res[[ps]], + abs_thresholds = res$lims_ano_hcst_tr[[ps]], ncores = ncores) obs_probs_ev[[ps]] <- GetProbs(res$ano_obs_ev, time_dim = 'syear', prob_thresholds = NULL, bin_dim_abs = 'bin', indices_for_quantiles = NULL, memb_dim = 'ensemble', - abs_thresholds = lims_ano_obs_tr_res[[ps]], + abs_thresholds = res$lims_ano_obs_tr[[ps]], ncores = ncores) if (!is.null(data$fcst)) { fcst_probs[[ps]] <- GetProbs(data$fcst$data, time_dim = 'syear', @@ -336,7 +358,7 @@ Crossval_anomalies_Apply <- function(recipe, data) { probs = list(hcst_ev = hcst_probs_ev, obs_ev = obs_probs_ev, fcst = fcst_probs), - ref_obs_tr = ano_obs_tr_res)) + ref_obs_tr = res$ano_obs_tr)) } -- GitLab From 512a056b1d2f3a63e3749c657b10409b6b0ab98a Mon Sep 17 00:00:00 2001 From: vagudets Date: Wed, 16 Apr 2025 15:07:49 +0200 Subject: [PATCH 03/22] Remove TODOs --- modules/Crossval/Crossval_anomalies_Apply.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/Crossval/Crossval_anomalies_Apply.R b/modules/Crossval/Crossval_anomalies_Apply.R index 7bd4546d..39d66981 100644 --- a/modules/Crossval/Crossval_anomalies_Apply.R +++ b/modules/Crossval/Crossval_anomalies_Apply.R @@ -152,7 +152,6 @@ Crossval_anomalies_Apply <- function(recipe, data) { rm(lims_tmp_hcst, lims_tmp_obs) gc() - ## TODO: Sort out ano_obs_tr dims # To make crps_clim to work the reference forecast need to have same dims as obs: res$ano_obs_tr <- Subset(res$ano_obs_tr, along = 'unneeded', indices = 1, drop = 'selected') @@ -250,7 +249,6 @@ Crossval_anomalies_Apply <- function(recipe, data) { ano_hcst$data <- res$ano_hcst_ev ano_obs <- data$obs ano_obs$data <- res$ano_obs_ev - ## TODO: Remove objects? 
info(recipe$Run$logger, "#### Anomalies and Probabilities Done #####") -- GitLab From c6ac042c4d6aa5a91bbdc8b63041465daaaa31b7 Mon Sep 17 00:00:00 2001 From: vagudets Date: Wed, 16 Apr 2025 15:09:05 +0200 Subject: [PATCH 04/22] Fix .drop_dims(), move Crossval_anomalies --- modules/Crossval/Crossval_anomalies.R | 383 ++++++++++++++++++++++++++ modules/Saving/R/drop_dims.R | 2 +- 2 files changed, 384 insertions(+), 1 deletion(-) create mode 100644 modules/Crossval/Crossval_anomalies.R diff --git a/modules/Crossval/Crossval_anomalies.R b/modules/Crossval/Crossval_anomalies.R new file mode 100644 index 00000000..39d66981 --- /dev/null +++ b/modules/Crossval/Crossval_anomalies.R @@ -0,0 +1,383 @@ +# Full-cross-val workflow +## This code should be valid for individual months and temporal averages +source("modules/Crossval/R/tmp/GetProbs.R") + +Crossval_anomalies_Apply <- function(recipe, data) { + cross.method <- recipe$Analysis$cross.method + # TODO move check + if (is.null(cross.method)) { + cross.method <- 'leave-one-out' + } + categories <- recipe$Analysis$Workflow$Probabilities$percentiles + categories <- lapply(categories, function (x) { + sapply(x, function(y) { + eval(parse(text = y))})}) + ncores <- recipe$Analysis$ncores + na.rm <- recipe$Analysis$remove_NAs + ## data dimensions + sdate_dim <- dim(data$hcst$data)['syear'] + orig_dims <- names(dim(data$hcst$data)) + # spatial dims + if ('latitude' %in% names(dim(data$hcst$data))) { + nlats <- dim(data$hcst$data)['latitude'] + nlons <- dim(data$hcst$data)['longitude'] + agg = 'global' + } else if ('region' %in% names(dim(data$hcst$data))) { + agg = 'region' + nregions <- dim(data$hcst$data)['region'] + } + # TODO fix it to use new version https://earth.bsc.es/gitlab/external/cstools/-/blob/dev-cross-indices/R/CST_Calibration.R#L570 + cross <- CSTools:::.make.eval.train.dexes(eval.method = cross.method, + amt.points = sdate_dim, + amt.points_cor = NULL) # k = ? 
+ + .cross_anomalies <- function(indices, exp, obs, cross, categories, recipe) { + # subset years: Subset works at BSC not at Athos + ## training indices + cross <- cross[[indices]] + print(paste("Crossval Index:", cross$eval.dexes)) + obs_tr <- Subset(obs, along = 'syear', + indices = cross$train.dexes) + hcst_tr <- Subset(exp, along = 'syear', + indices = cross$train.dexes) + ## evaluation indices + hcst_ev <- Subset(exp, along = 'syear', + indices = cross$eval.dexes, drop = 'selected') + obs_ev <- Subset(obs, along = 'syear', + indices = cross$eval.dexes, drop = 'selected') + if (tolower(recipe$Analysis$Horizon) == 'subseasonal') { + central_day <- (dim(exp)['sday'] + 1)/2 + hcst_tr <- MergeDims(hcst_tr, merge_dims = c('sday', 'syear'), + rename_dim = 'syear', na.rm = FALSE) + + obs_tr <- MergeDims(obs_tr, merge_dims = c('sday', 'syear'), + rename_dim = 'syear', na.rm = FALSE) + # 'sday' dim to select the central day + hcst_ev <- Subset(hcst_ev, along = 'sday', indices = central_day) + + } + # compute climatology: + clim_obs_tr <- MeanDims(obs_tr, 'syear') + clim_hcst_tr <- MeanDims(hcst_tr, c('syear', 'ensemble')) + # compute anomalies: + ano_obs_tr <- s2dv::Ano(obs_tr, clim_obs_tr, + ncores = recipe$Analysis$ncores) + ano_hcst_tr <- s2dv::Ano(hcst_tr, clim_hcst_tr, + ncores = recipe$Analysis$ncores) + ano_hcst_ev <- s2dv::Ano(hcst_ev, clim_hcst_tr, + ncores = recipe$Analysis$ncores) + ano_obs_ev <- s2dv::Ano(obs_ev, clim_obs_tr, + ncores = recipe$Analysis$ncores) + # compute category limits + lims_ano_hcst_tr <- Apply(ano_hcst_tr, target_dims = c('syear', 'ensemble'), + fun = function(x, prob_lims) { + lapply(prob_lims, function(ps) { + as.array(quantile(as.vector(x), ps, + na.rm = na.rm))})}, + prob_lims = list(unlist(categories)), + ncores = ncores)$output1 + lims_ano_obs_tr <- Apply(ano_obs_tr, target_dims = c('syear', 'ensemble'), + fun = function(x, prob_lims) { + lapply(prob_lims, function(ps) { + as.array(quantile(as.vector(x), ps, + na.rm = na.rm))})}, + prob_lims = list(unlist(categories)), + ncores = ncores)$output1 + if (tolower(recipe$Analysis$Horizon) == 'subseasonal') { + ano_hcst_tr <- SplitDim(ano_hcst_tr, split_dim = 'syear', + new_dim_name = 'sday', + indices = rep(1:dim(data$hcst$data)['sday'], + length(cross[[t]]$train.dexes))) + ano_obs_tr <- SplitDim(obs_tr, split_dim = 'syear', new_dim_name = 'sday', + indices = rep(1:dim(data$hcst$data)['sday'], + length(cross[[t]]$train.dexes))) + } + ## Objects to return: + # lims_ano_hcst_tr: hcst percentiles (per category) + # lims_ano_obs_tr: obs percentiles (per category) + # ano_hcst_ev: hcst cross-validated anomalies + # ano_obs_ev: obs cross-validated anomalies + # ano_obs_tr: obs training anomalies + return(list(ano_hcst_ev = ano_hcst_ev, + ano_obs_ev = ano_obs_ev, + ano_obs_tr = ano_obs_tr, + lims_ano_hcst_tr = lims_ano_hcst_tr, + lims_ano_obs_tr = lims_ano_obs_tr)) + } + + output_dims <- names(dim(data$hcst$data))[which(names(dim(data$hcst$data)) != "syear")] + lims_output_dims <- c("bin", + names(dim(data$hcst$data))[which(!names(dim(data$hcst$data)) %in% c("syear", "ensemble"))]) + indices <- array(1:length(cross), dim = c(syear = length(cross))) + res <- Apply(data = list(indices = indices, + exp = data$hcst$data, + obs = data$obs$data), + target_dims = list(indices = NULL, + obs = names(dim(data$obs$data)), + exp = names(dim(data$hcst$data))), + output_dims = list(ano_hcst_ev = output_dims, + ano_obs_ev = output_dims, + ano_obs_tr = c("dat", "var", "sday", "sweek", + "ensemble", "time", "latitude", "longitude", 
+ "unneeded"), + lims_ano_hcst_tr = lims_output_dims, + lims_ano_obs_tr = lims_output_dims), + fun = .cross_anomalies, + ncores = 1, ## TODO: Explore options for core distribution + cross = cross, + categories = categories, + recipe = recipe) + + info(recipe$Run$logger, + "#### Anomalies Cross-validation loop ended #####") + + # Rearrange limits into a list: + lims_tmp_hcst <- list() + lims_tmp_obs <- list() + last_index <- 0 + for (ps in 1:length(categories)) { + indices <- last_index + 1:length(categories[[ps]]) + lims_category_hcst <- Subset(res$lims_ano_hcst_tr, along = "bin", + indices = list(indices), + drop = FALSE) + lims_tmp_hcst[[ps]] <- lims_category_hcst + lims_category_obs <- Subset(res$lims_ano_obs_tr, along = "bin", + indices = list(indices), + drop = FALSE) + lims_tmp_obs[[ps]] <- lims_category_obs + last_index <- indices[length(indices)] + } + res$lims_ano_hcst_tr <- lims_tmp_hcst + res$lims_ano_obs_tr <- lims_tmp_obs + rm(lims_tmp_hcst, lims_tmp_obs) + gc() + + # To make crps_clim to work the reference forecast need to have same dims as obs: + res$ano_obs_tr <- Subset(res$ano_obs_tr, along = 'unneeded', + indices = 1, drop = 'selected') + + recipe$Run$output_dir <- paste0(recipe$Run$output_dir, + "/outputs/Anomalies/") + + # Forecast anomalies: + if (!is.null(data$fcst)) { + clim_hcst <- Apply(data$hcst$data, + target_dims = c('syear', 'ensemble'), + mean, + na.rm = na.rm, + ncores = ncores)$output1 + data$fcst$data <- Ano(data = data$fcst$data, clim = clim_hcst) + hcst_ano <- Ano(data = data$hcst$data, clim = clim_hcst) + # Terciles limits using the whole hindcast period: + print(categories) + lims_fcst <- Apply(hcst_ano, target_dims = c('syear', 'ensemble'), + fun = function(x, prob_lims) { + lapply(prob_lims, function(ps) { + as.array(quantile(as.vector(x), ps, + na.rm = na.rm))})}, + output_dims = lapply(categories, function(x) {'bin'}), + prob_lims = categories, + ncores = ncores) + clim_obs <- Apply(data$obs$data, + target_dims = c('syear', 'ensemble'), + mean, + na.rm = na.rm, + ncores = ncores)$output1 + obs_ano <- Ano(data = data$obs$data, clim = clim_obs) + lims <- Apply(obs_ano, target_dims = c('syear', 'ensemble'), + fun = function(x, prob_lims) { + lapply(prob_lims, function(ps) { + as.array(quantile(as.vector(x), ps, + na.rm = na.rm))})}, + output_dims = lapply(categories, function(x) {'bin'}), + prob_lims = categories, + ncores = ncores) + tmp_lims2 <- list() + for (ps in 1:length(categories)) { + tmp_lims3 <- .drop_dims(lims[[ps]]) + for (l in 1:dim(lims[[ps]])['bin']) { + tmp_lims <- ClimProjDiags::Subset(x = tmp_lims3, + along = "bin", + indices = l) + tmp_lims2 <- append(tmp_lims2, list(tmp_lims)) + names(tmp_lims2)[length(tmp_lims2)] <- as.character(categories[[ps]][l]) + } + if (recipe$Analysis$Workflow$Probabilities$save == 'yes') { + save_percentiles(recipe = recipe, percentiles = tmp_lims2, + data_cube = data$obs, + agg = "global", outdir = NULL) + } + } + } + + ## TODO: Should this happen here? 
+ # Make categories rounded number to use as names: + categories <- lapply(categories, round, digits = 2) + + # Compute Probabilities + fcst_probs <- lapply(categories, function(x){NULL}) + hcst_probs_ev <- lapply(categories, function(x){NULL}) + obs_probs_ev <- lapply(categories, function(x){NULL}) + + for (ps in 1:length(categories)) { + hcst_probs_ev[[ps]] <- GetProbs(res$ano_hcst_ev, time_dim = 'syear', + prob_thresholds = NULL, + bin_dim_abs = 'bin', + indices_for_quantiles = NULL, + memb_dim = 'ensemble', + abs_thresholds = res$lims_ano_hcst_tr[[ps]], + ncores = ncores) + obs_probs_ev[[ps]] <- GetProbs(res$ano_obs_ev, time_dim = 'syear', + prob_thresholds = NULL, + bin_dim_abs = 'bin', + indices_for_quantiles = NULL, + memb_dim = 'ensemble', + abs_thresholds = res$lims_ano_obs_tr[[ps]], + ncores = ncores) + if (!is.null(data$fcst)) { + fcst_probs[[ps]] <- GetProbs(data$fcst$data, time_dim = 'syear', + prob_thresholds = NULL, + bin_dim_abs = 'bin', + indices_for_quantiles = NULL, + memb_dim = 'ensemble', + abs_thresholds = lims_fcst[[ps]], + ncores = ncores) + } + } + # Convert to s2dv_cubes the resulting anomalies + ano_hcst <- data$hcst + ano_hcst$data <- res$ano_hcst_ev + ano_obs <- data$obs + ano_obs$data <- res$ano_obs_ev + + info(recipe$Run$logger, + "#### Anomalies and Probabilities Done #####") + if (recipe$Analysis$Workflow$Anomalies$save != 'none') { + info(recipe$Run$logger, "##### START SAVING ANOMALIES #####") + recipe$Run$output_dir <- paste0(recipe$Run$output_dir, + "/outputs/Anomalies/") + # Save forecast + if ((recipe$Analysis$Workflow$Anomalies$save %in% + c('all', 'exp_only', 'fcst_only')) && !is.null(data$fcst)) { + save_forecast(recipe = recipe, data_cube = data$fcst, type = 'fcst') + } + # Save hindcast + if (recipe$Analysis$Workflow$Anomalies$save %in% + c('all', 'exp_only')) { + save_forecast(recipe = recipe, data_cube = ano_hcst, type = 'hcst') + } + # Save observation + if (recipe$Analysis$Workflow$Anomalies$save == 'all') { + save_observations(recipe = recipe, data_cube = ano_obs) + } + } + # Save probability bins + probs_hcst <- list() + probs_fcst <- list() + probs_obs <- list() + all_names <- NULL + + for (ps in 1:length(categories)) { + for (perc in 1:(length(categories[[ps]]) + 1)) { + if (perc == 1) { + name_elem <- paste0("below_", categories[[ps]][perc]) + } else if (perc == length(categories[[ps]]) + 1) { + name_elem <- paste0("above_", categories[[ps]][perc-1]) + } else { + name_elem <- paste0("from_", categories[[ps]][perc-1], + "_to_", categories[[ps]][perc]) + } + probs_hcst <- append(probs_hcst, + list(Subset(hcst_probs_ev[[ps]], + along = 'bin', + indices = perc, + drop = 'selected'))) + probs_obs <- append(probs_obs, + list(Subset(obs_probs_ev[[ps]], + along = 'bin', + indices = perc, + drop = 'selected'))) + if (!is.null(data$fcst)) { + probs_fcst <- append(probs_fcst, + list(Subset(fcst_probs[[ps]], + along = 'bin', + indices = perc, + drop = 'selected'))) + } + all_names <- c(all_names, name_elem) + } + } + ## TODO: Apply .drop_dims() directly to hcst_probs_ev etc; and move + ## reorganizing and renaming to save_probabilities() + names(probs_hcst) <- all_names + probs_hcst <- lapply(probs_hcst, .drop_dims) + names(probs_obs) <- all_names + probs_obs <- lapply(probs_obs, .drop_dims) + if (!is.null(data$fcst)) { + names(probs_fcst) <- all_names + probs_fcst <- lapply(probs_fcst, .drop_dims) + } + + if (recipe$Analysis$Workflow$Probabilities$save %in% + c('all', 'bins_only')) { + save_probabilities(recipe = recipe, probs = probs_hcst, + 
data_cube = data$hcst, agg = agg, + type = "hcst") + save_probabilities(recipe = recipe, probs = probs_obs, + data_cube = data$hcst, agg = agg, + type = "obs") + if (!is.null(probs_fcst)) { + save_probabilities(recipe = recipe, probs = probs_fcst, + data_cube = data$fcst, agg = agg, + type = "fcst") + } + } + # Save ensemble mean for multimodel option: + hcst_EM <- MeanDims(ano_hcst$data, 'ensemble', drop = T) +# save_metrics(recipe = recipe, +# metrics = list(hcst_EM = +# Subset(hcst_EM, along = 'dat', indices = 1, drop = 'selected')), +# data_cube = data$hcst, agg = agg, +# module = "statistics") +# fcst_EM <- NULL + if (!is.null(data$fcst)) { + fcst_EM <- MeanDims(data$fcst$data, 'ensemble', drop = T) +# save_metrics(recipe = recipe, +# metrics = list(fcst_EM = +# Subset(fcst_EM, along = 'dat', indices = 1, drop = 'selected')), +# data_cube = data$fcst, agg = agg, +# module = "statistics") + } + return(list(hcst = ano_hcst, obs = ano_obs, fcst = data$fcst, + hcst.full_val = data$hcst, obs.full_val = data$obs, + hcst_EM = hcst_EM, fcst_EM = fcst_EM, + #cat_lims = list(hcst_tr = res$lims_ano_hcst_tr, + # obs_tr = res$lims_ano_obs_tr), + probs = list(hcst_ev = hcst_probs_ev, + obs_ev = obs_probs_ev, + fcst = fcst_probs), + ref_obs_tr = res$ano_obs_tr)) +} + + +## The result contains the inputs for Skill_full_crossval. +## this is a list with the required elements: + ## probs is a list with + ## probs$hcst_ev and probs$obs_ev + ## probs$hcst_ev will have as many elements in the $Probabilities$percentiles + ## each element will be an array with 'cat' dimension + ## the same for probs$obs_ev + ## hcst is a s2dv_cube for the post-processed hindcast for the evalutaion indices + ## in this case cross validated anomalies + ## obs is a s2dv_cube for the post-processed obs + ## in this case cross validated anomalies + ## fcst is a s2dv_cube for the post-processed fcst + ## in this case cross anomalies with the full hindcast period + ## this object is not required for skill assessment + ## hcst.full_val and obs.full_val are the original data to compute mean bias + ## cat_lims used to compute the probabilities + ## this object is not required for skill assessment + ## ref_obs_tr is an array with the cross-validate observed anomalies + ## to be used as reference forecast in the CRPSS and CRPS_clim + ## it is computed from the training indices + diff --git a/modules/Saving/R/drop_dims.R b/modules/Saving/R/drop_dims.R index b36693be..6578e818 100644 --- a/modules/Saving/R/drop_dims.R +++ b/modules/Saving/R/drop_dims.R @@ -2,7 +2,7 @@ .drop_dims <- function(metric_array) { # Define dimensions that are not essential for saving droppable_dims <- c("dat", "sday", "sweek", "ensemble", "nobs", - "nexp", "exp_memb", "obs_memb", "bin") + "nexp", "exp_memb", "obs_memb") # ,"bin") # Select non-essential dimensions of length 1 dims_to_drop <- intersect(names(which(dim(metric_array) == 1)), droppable_dims) -- GitLab From 867d185efe43092016372041a4046b94080153be Mon Sep 17 00:00:00 2001 From: vagudets Date: Wed, 16 Apr 2025 15:29:01 +0200 Subject: [PATCH 05/22] Remove Crossval_anomalies_Apply.R and fix function name --- modules/Crossval/Crossval_anomalies.R | 17 +- modules/Crossval/Crossval_anomalies_Apply.R | 383 -------------------- 2 files changed, 10 insertions(+), 390 deletions(-) delete mode 100644 modules/Crossval/Crossval_anomalies_Apply.R diff --git a/modules/Crossval/Crossval_anomalies.R b/modules/Crossval/Crossval_anomalies.R index 39d66981..935106ed 100644 --- a/modules/Crossval/Crossval_anomalies.R +++ 
b/modules/Crossval/Crossval_anomalies.R @@ -2,7 +2,7 @@ ## This code should be valid for individual months and temporal averages source("modules/Crossval/R/tmp/GetProbs.R") -Crossval_anomalies_Apply <- function(recipe, data) { +Crossval_anomalies <- function(recipe, data) { cross.method <- recipe$Analysis$cross.method # TODO move check if (is.null(cross.method)) { @@ -31,7 +31,15 @@ Crossval_anomalies_Apply <- function(recipe, data) { amt.points = sdate_dim, amt.points_cor = NULL) # k = ? + # Define function to compute anomalies in full cross-validation .cross_anomalies <- function(indices, exp, obs, cross, categories, recipe) { + ## Objects to return: + # lims_ano_hcst_tr: hcst percentiles (per category) + # lims_ano_obs_tr: obs percentiles (per category) + # ano_hcst_ev: hcst cross-validated anomalies + # ano_obs_ev: obs cross-validated anomalies + # ano_obs_tr: obs training anomalies + # subset years: Subset works at BSC not at Athos ## training indices cross <- cross[[indices]] @@ -92,12 +100,7 @@ Crossval_anomalies_Apply <- function(recipe, data) { indices = rep(1:dim(data$hcst$data)['sday'], length(cross[[t]]$train.dexes))) } - ## Objects to return: - # lims_ano_hcst_tr: hcst percentiles (per category) - # lims_ano_obs_tr: obs percentiles (per category) - # ano_hcst_ev: hcst cross-validated anomalies - # ano_obs_ev: obs cross-validated anomalies - # ano_obs_tr: obs training anomalies + return(list(ano_hcst_ev = ano_hcst_ev, ano_obs_ev = ano_obs_ev, ano_obs_tr = ano_obs_tr, diff --git a/modules/Crossval/Crossval_anomalies_Apply.R b/modules/Crossval/Crossval_anomalies_Apply.R deleted file mode 100644 index 39d66981..00000000 --- a/modules/Crossval/Crossval_anomalies_Apply.R +++ /dev/null @@ -1,383 +0,0 @@ -# Full-cross-val workflow -## This code should be valid for individual months and temporal averages -source("modules/Crossval/R/tmp/GetProbs.R") - -Crossval_anomalies_Apply <- function(recipe, data) { - cross.method <- recipe$Analysis$cross.method - # TODO move check - if (is.null(cross.method)) { - cross.method <- 'leave-one-out' - } - categories <- recipe$Analysis$Workflow$Probabilities$percentiles - categories <- lapply(categories, function (x) { - sapply(x, function(y) { - eval(parse(text = y))})}) - ncores <- recipe$Analysis$ncores - na.rm <- recipe$Analysis$remove_NAs - ## data dimensions - sdate_dim <- dim(data$hcst$data)['syear'] - orig_dims <- names(dim(data$hcst$data)) - # spatial dims - if ('latitude' %in% names(dim(data$hcst$data))) { - nlats <- dim(data$hcst$data)['latitude'] - nlons <- dim(data$hcst$data)['longitude'] - agg = 'global' - } else if ('region' %in% names(dim(data$hcst$data))) { - agg = 'region' - nregions <- dim(data$hcst$data)['region'] - } - # TODO fix it to use new version https://earth.bsc.es/gitlab/external/cstools/-/blob/dev-cross-indices/R/CST_Calibration.R#L570 - cross <- CSTools:::.make.eval.train.dexes(eval.method = cross.method, - amt.points = sdate_dim, - amt.points_cor = NULL) # k = ? 
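## Aside: .make.eval.train.dexes() is internal to CSTools. For
## eval.method = 'leave-one-out' it returns one fold per start date, each
## holding out a single year. A hedged, explicit equivalent (the helper
## name is hypothetical):
make_loo_dexes <- function(n_sdates) {
  lapply(seq_len(n_sdates), function(i) {
    list(eval.dexes = i,                               # year left out
         train.dexes = setdiff(seq_len(n_sdates), i))  # years used to train
  })
}
str(make_loo_dexes(5)[[2]])
## List of 2
##  $ eval.dexes : int 2
##  $ train.dexes: int [1:4] 1 3 4 5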
- - .cross_anomalies <- function(indices, exp, obs, cross, categories, recipe) { - # subset years: Subset works at BSC not at Athos - ## training indices - cross <- cross[[indices]] - print(paste("Crossval Index:", cross$eval.dexes)) - obs_tr <- Subset(obs, along = 'syear', - indices = cross$train.dexes) - hcst_tr <- Subset(exp, along = 'syear', - indices = cross$train.dexes) - ## evaluation indices - hcst_ev <- Subset(exp, along = 'syear', - indices = cross$eval.dexes, drop = 'selected') - obs_ev <- Subset(obs, along = 'syear', - indices = cross$eval.dexes, drop = 'selected') - if (tolower(recipe$Analysis$Horizon) == 'subseasonal') { - central_day <- (dim(exp)['sday'] + 1)/2 - hcst_tr <- MergeDims(hcst_tr, merge_dims = c('sday', 'syear'), - rename_dim = 'syear', na.rm = FALSE) - - obs_tr <- MergeDims(obs_tr, merge_dims = c('sday', 'syear'), - rename_dim = 'syear', na.rm = FALSE) - # 'sday' dim to select the central day - hcst_ev <- Subset(hcst_ev, along = 'sday', indices = central_day) - - } - # compute climatology: - clim_obs_tr <- MeanDims(obs_tr, 'syear') - clim_hcst_tr <- MeanDims(hcst_tr, c('syear', 'ensemble')) - # compute anomalies: - ano_obs_tr <- s2dv::Ano(obs_tr, clim_obs_tr, - ncores = recipe$Analysis$ncores) - ano_hcst_tr <- s2dv::Ano(hcst_tr, clim_hcst_tr, - ncores = recipe$Analysis$ncores) - ano_hcst_ev <- s2dv::Ano(hcst_ev, clim_hcst_tr, - ncores = recipe$Analysis$ncores) - ano_obs_ev <- s2dv::Ano(obs_ev, clim_obs_tr, - ncores = recipe$Analysis$ncores) - # compute category limits - lims_ano_hcst_tr <- Apply(ano_hcst_tr, target_dims = c('syear', 'ensemble'), - fun = function(x, prob_lims) { - lapply(prob_lims, function(ps) { - as.array(quantile(as.vector(x), ps, - na.rm = na.rm))})}, - prob_lims = list(unlist(categories)), - ncores = ncores)$output1 - lims_ano_obs_tr <- Apply(ano_obs_tr, target_dims = c('syear', 'ensemble'), - fun = function(x, prob_lims) { - lapply(prob_lims, function(ps) { - as.array(quantile(as.vector(x), ps, - na.rm = na.rm))})}, - prob_lims = list(unlist(categories)), - ncores = ncores)$output1 - if (tolower(recipe$Analysis$Horizon) == 'subseasonal') { - ano_hcst_tr <- SplitDim(ano_hcst_tr, split_dim = 'syear', - new_dim_name = 'sday', - indices = rep(1:dim(data$hcst$data)['sday'], - length(cross[[t]]$train.dexes))) - ano_obs_tr <- SplitDim(obs_tr, split_dim = 'syear', new_dim_name = 'sday', - indices = rep(1:dim(data$hcst$data)['sday'], - length(cross[[t]]$train.dexes))) - } - ## Objects to return: - # lims_ano_hcst_tr: hcst percentiles (per category) - # lims_ano_obs_tr: obs percentiles (per category) - # ano_hcst_ev: hcst cross-validated anomalies - # ano_obs_ev: obs cross-validated anomalies - # ano_obs_tr: obs training anomalies - return(list(ano_hcst_ev = ano_hcst_ev, - ano_obs_ev = ano_obs_ev, - ano_obs_tr = ano_obs_tr, - lims_ano_hcst_tr = lims_ano_hcst_tr, - lims_ano_obs_tr = lims_ano_obs_tr)) - } - - output_dims <- names(dim(data$hcst$data))[which(names(dim(data$hcst$data)) != "syear")] - lims_output_dims <- c("bin", - names(dim(data$hcst$data))[which(!names(dim(data$hcst$data)) %in% c("syear", "ensemble"))]) - indices <- array(1:length(cross), dim = c(syear = length(cross))) - res <- Apply(data = list(indices = indices, - exp = data$hcst$data, - obs = data$obs$data), - target_dims = list(indices = NULL, - obs = names(dim(data$obs$data)), - exp = names(dim(data$hcst$data))), - output_dims = list(ano_hcst_ev = output_dims, - ano_obs_ev = output_dims, - ano_obs_tr = c("dat", "var", "sday", "sweek", - "ensemble", "time", "latitude", "longitude", 
- "unneeded"), - lims_ano_hcst_tr = lims_output_dims, - lims_ano_obs_tr = lims_output_dims), - fun = .cross_anomalies, - ncores = 1, ## TODO: Explore options for core distribution - cross = cross, - categories = categories, - recipe = recipe) - - info(recipe$Run$logger, - "#### Anomalies Cross-validation loop ended #####") - - # Rearrange limits into a list: - lims_tmp_hcst <- list() - lims_tmp_obs <- list() - last_index <- 0 - for (ps in 1:length(categories)) { - indices <- last_index + 1:length(categories[[ps]]) - lims_category_hcst <- Subset(res$lims_ano_hcst_tr, along = "bin", - indices = list(indices), - drop = FALSE) - lims_tmp_hcst[[ps]] <- lims_category_hcst - lims_category_obs <- Subset(res$lims_ano_obs_tr, along = "bin", - indices = list(indices), - drop = FALSE) - lims_tmp_obs[[ps]] <- lims_category_obs - last_index <- indices[length(indices)] - } - res$lims_ano_hcst_tr <- lims_tmp_hcst - res$lims_ano_obs_tr <- lims_tmp_obs - rm(lims_tmp_hcst, lims_tmp_obs) - gc() - - # To make crps_clim to work the reference forecast need to have same dims as obs: - res$ano_obs_tr <- Subset(res$ano_obs_tr, along = 'unneeded', - indices = 1, drop = 'selected') - - recipe$Run$output_dir <- paste0(recipe$Run$output_dir, - "/outputs/Anomalies/") - - # Forecast anomalies: - if (!is.null(data$fcst)) { - clim_hcst <- Apply(data$hcst$data, - target_dims = c('syear', 'ensemble'), - mean, - na.rm = na.rm, - ncores = ncores)$output1 - data$fcst$data <- Ano(data = data$fcst$data, clim = clim_hcst) - hcst_ano <- Ano(data = data$hcst$data, clim = clim_hcst) - # Terciles limits using the whole hindcast period: - print(categories) - lims_fcst <- Apply(hcst_ano, target_dims = c('syear', 'ensemble'), - fun = function(x, prob_lims) { - lapply(prob_lims, function(ps) { - as.array(quantile(as.vector(x), ps, - na.rm = na.rm))})}, - output_dims = lapply(categories, function(x) {'bin'}), - prob_lims = categories, - ncores = ncores) - clim_obs <- Apply(data$obs$data, - target_dims = c('syear', 'ensemble'), - mean, - na.rm = na.rm, - ncores = ncores)$output1 - obs_ano <- Ano(data = data$obs$data, clim = clim_obs) - lims <- Apply(obs_ano, target_dims = c('syear', 'ensemble'), - fun = function(x, prob_lims) { - lapply(prob_lims, function(ps) { - as.array(quantile(as.vector(x), ps, - na.rm = na.rm))})}, - output_dims = lapply(categories, function(x) {'bin'}), - prob_lims = categories, - ncores = ncores) - tmp_lims2 <- list() - for (ps in 1:length(categories)) { - tmp_lims3 <- .drop_dims(lims[[ps]]) - for (l in 1:dim(lims[[ps]])['bin']) { - tmp_lims <- ClimProjDiags::Subset(x = tmp_lims3, - along = "bin", - indices = l) - tmp_lims2 <- append(tmp_lims2, list(tmp_lims)) - names(tmp_lims2)[length(tmp_lims2)] <- as.character(categories[[ps]][l]) - } - if (recipe$Analysis$Workflow$Probabilities$save == 'yes') { - save_percentiles(recipe = recipe, percentiles = tmp_lims2, - data_cube = data$obs, - agg = "global", outdir = NULL) - } - } - } - - ## TODO: Should this happen here? 
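## Aside: the GetProbs() calls below convert anomalies into category
## probabilities. With prob_thresholds = NULL and abs_thresholds set, each
## member is binned against the precomputed limits and the ensemble
## fraction per bin is returned. A minimal sketch of that binning for one
## grid point and one year (illustrative values, not the GetProbs API):
members_demo <- c(-0.9, -0.4, -0.2, 0.0, 0.1, 0.3, 0.5, 0.8, 1.1, 1.4)
lims_demo <- c(-0.3, 0.4) # tercile limits from the training years
counts <- table(cut(members_demo, breaks = c(-Inf, lims_demo, Inf)))
probs_demo <- as.numeric(counts) / length(members_demo)
probs_demo # 0.2 0.4 0.4: P(below), P(normal), P(above)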
- # Make categories rounded number to use as names: - categories <- lapply(categories, round, digits = 2) - - # Compute Probabilities - fcst_probs <- lapply(categories, function(x){NULL}) - hcst_probs_ev <- lapply(categories, function(x){NULL}) - obs_probs_ev <- lapply(categories, function(x){NULL}) - - for (ps in 1:length(categories)) { - hcst_probs_ev[[ps]] <- GetProbs(res$ano_hcst_ev, time_dim = 'syear', - prob_thresholds = NULL, - bin_dim_abs = 'bin', - indices_for_quantiles = NULL, - memb_dim = 'ensemble', - abs_thresholds = res$lims_ano_hcst_tr[[ps]], - ncores = ncores) - obs_probs_ev[[ps]] <- GetProbs(res$ano_obs_ev, time_dim = 'syear', - prob_thresholds = NULL, - bin_dim_abs = 'bin', - indices_for_quantiles = NULL, - memb_dim = 'ensemble', - abs_thresholds = res$lims_ano_obs_tr[[ps]], - ncores = ncores) - if (!is.null(data$fcst)) { - fcst_probs[[ps]] <- GetProbs(data$fcst$data, time_dim = 'syear', - prob_thresholds = NULL, - bin_dim_abs = 'bin', - indices_for_quantiles = NULL, - memb_dim = 'ensemble', - abs_thresholds = lims_fcst[[ps]], - ncores = ncores) - } - } - # Convert to s2dv_cubes the resulting anomalies - ano_hcst <- data$hcst - ano_hcst$data <- res$ano_hcst_ev - ano_obs <- data$obs - ano_obs$data <- res$ano_obs_ev - - info(recipe$Run$logger, - "#### Anomalies and Probabilities Done #####") - if (recipe$Analysis$Workflow$Anomalies$save != 'none') { - info(recipe$Run$logger, "##### START SAVING ANOMALIES #####") - recipe$Run$output_dir <- paste0(recipe$Run$output_dir, - "/outputs/Anomalies/") - # Save forecast - if ((recipe$Analysis$Workflow$Anomalies$save %in% - c('all', 'exp_only', 'fcst_only')) && !is.null(data$fcst)) { - save_forecast(recipe = recipe, data_cube = data$fcst, type = 'fcst') - } - # Save hindcast - if (recipe$Analysis$Workflow$Anomalies$save %in% - c('all', 'exp_only')) { - save_forecast(recipe = recipe, data_cube = ano_hcst, type = 'hcst') - } - # Save observation - if (recipe$Analysis$Workflow$Anomalies$save == 'all') { - save_observations(recipe = recipe, data_cube = ano_obs) - } - } - # Save probability bins - probs_hcst <- list() - probs_fcst <- list() - probs_obs <- list() - all_names <- NULL - - for (ps in 1:length(categories)) { - for (perc in 1:(length(categories[[ps]]) + 1)) { - if (perc == 1) { - name_elem <- paste0("below_", categories[[ps]][perc]) - } else if (perc == length(categories[[ps]]) + 1) { - name_elem <- paste0("above_", categories[[ps]][perc-1]) - } else { - name_elem <- paste0("from_", categories[[ps]][perc-1], - "_to_", categories[[ps]][perc]) - } - probs_hcst <- append(probs_hcst, - list(Subset(hcst_probs_ev[[ps]], - along = 'bin', - indices = perc, - drop = 'selected'))) - probs_obs <- append(probs_obs, - list(Subset(obs_probs_ev[[ps]], - along = 'bin', - indices = perc, - drop = 'selected'))) - if (!is.null(data$fcst)) { - probs_fcst <- append(probs_fcst, - list(Subset(fcst_probs[[ps]], - along = 'bin', - indices = perc, - drop = 'selected'))) - } - all_names <- c(all_names, name_elem) - } - } - ## TODO: Apply .drop_dims() directly to hcst_probs_ev etc; and move - ## reorganizing and renaming to save_probabilities() - names(probs_hcst) <- all_names - probs_hcst <- lapply(probs_hcst, .drop_dims) - names(probs_obs) <- all_names - probs_obs <- lapply(probs_obs, .drop_dims) - if (!is.null(data$fcst)) { - names(probs_fcst) <- all_names - probs_fcst <- lapply(probs_fcst, .drop_dims) - } - - if (recipe$Analysis$Workflow$Probabilities$save %in% - c('all', 'bins_only')) { - save_probabilities(recipe = recipe, probs = probs_hcst, - 
data_cube = data$hcst, agg = agg, - type = "hcst") - save_probabilities(recipe = recipe, probs = probs_obs, - data_cube = data$hcst, agg = agg, - type = "obs") - if (!is.null(probs_fcst)) { - save_probabilities(recipe = recipe, probs = probs_fcst, - data_cube = data$fcst, agg = agg, - type = "fcst") - } - } - # Save ensemble mean for multimodel option: - hcst_EM <- MeanDims(ano_hcst$data, 'ensemble', drop = T) -# save_metrics(recipe = recipe, -# metrics = list(hcst_EM = -# Subset(hcst_EM, along = 'dat', indices = 1, drop = 'selected')), -# data_cube = data$hcst, agg = agg, -# module = "statistics") -# fcst_EM <- NULL - if (!is.null(data$fcst)) { - fcst_EM <- MeanDims(data$fcst$data, 'ensemble', drop = T) -# save_metrics(recipe = recipe, -# metrics = list(fcst_EM = -# Subset(fcst_EM, along = 'dat', indices = 1, drop = 'selected')), -# data_cube = data$fcst, agg = agg, -# module = "statistics") - } - return(list(hcst = ano_hcst, obs = ano_obs, fcst = data$fcst, - hcst.full_val = data$hcst, obs.full_val = data$obs, - hcst_EM = hcst_EM, fcst_EM = fcst_EM, - #cat_lims = list(hcst_tr = res$lims_ano_hcst_tr, - # obs_tr = res$lims_ano_obs_tr), - probs = list(hcst_ev = hcst_probs_ev, - obs_ev = obs_probs_ev, - fcst = fcst_probs), - ref_obs_tr = res$ano_obs_tr)) -} - - -## The result contains the inputs for Skill_full_crossval. -## this is a list with the required elements: - ## probs is a list with - ## probs$hcst_ev and probs$obs_ev - ## probs$hcst_ev will have as many elements in the $Probabilities$percentiles - ## each element will be an array with 'cat' dimension - ## the same for probs$obs_ev - ## hcst is a s2dv_cube for the post-processed hindcast for the evalutaion indices - ## in this case cross validated anomalies - ## obs is a s2dv_cube for the post-processed obs - ## in this case cross validated anomalies - ## fcst is a s2dv_cube for the post-processed fcst - ## in this case cross anomalies with the full hindcast period - ## this object is not required for skill assessment - ## hcst.full_val and obs.full_val are the original data to compute mean bias - ## cat_lims used to compute the probabilities - ## this object is not required for skill assessment - ## ref_obs_tr is an array with the cross-validate observed anomalies - ## to be used as reference forecast in the CRPSS and CRPS_clim - ## it is computed from the training indices - -- GitLab From e5b9cd1073ce7264c47ce4690ef43def987de9a5 Mon Sep 17 00:00:00 2001 From: vagudets Date: Wed, 16 Apr 2025 15:41:00 +0200 Subject: [PATCH 06/22] Remove unneeded print() --- modules/Crossval/Crossval_anomalies.R | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/Crossval/Crossval_anomalies.R b/modules/Crossval/Crossval_anomalies.R index 935106ed..83189e2c 100644 --- a/modules/Crossval/Crossval_anomalies.R +++ b/modules/Crossval/Crossval_anomalies.R @@ -172,7 +172,6 @@ Crossval_anomalies <- function(recipe, data) { data$fcst$data <- Ano(data = data$fcst$data, clim = clim_hcst) hcst_ano <- Ano(data = data$hcst$data, clim = clim_hcst) # Terciles limits using the whole hindcast period: - print(categories) lims_fcst <- Apply(hcst_ano, target_dims = c('syear', 'ensemble'), fun = function(x, prob_lims) { lapply(prob_lims, function(ps) { -- GitLab From f332a18146c7f810985d759dd824b15c2d2117ed Mon Sep 17 00:00:00 2001 From: vagudets Date: Wed, 16 Apr 2025 16:03:21 +0200 Subject: [PATCH 07/22] Adjust after change in drop_dims() --- modules/Skill/Skill.R | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git 
a/modules/Skill/Skill.R b/modules/Skill/Skill.R index 60589b5e..b607b419 100644 --- a/modules/Skill/Skill.R +++ b/modules/Skill/Skill.R @@ -466,14 +466,22 @@ Probabilities <- function(recipe, data) { } # Rearrange dimensions and return probabilities named_probs <- lapply(named_probs, function(x) {.drop_dims(x)}) + named_probs <- lapply(named_probs, function(x) { + Subset(x, along = "bin", + indices = 1, + drop = "selected")}) named_quantiles <- lapply(named_quantiles, function(x) {.drop_dims(x)}) + named_quantiles <- lapply(named_quantiles, function(x) { + Subset(x, along = "bin", + indices = 1, + drop = "selected")}) if (!is.null(data$fcst)) { fcst_years <- dim(data$fcst$data)[['syear']] named_probs_fcst <- lapply(named_probs_fcst, function(x) {.drop_dims(x)}) - # function(x) {Subset(x, - # along = 'syear', - # indices = 1:fcst_years, - # drop = 'non-selected')}) + named_probs_fcst <- lapply(named_probs_fcst, function(x) { + Subset(x, along = "bin", + indices = 1, + drop = "selected")}) results <- list(probs = named_probs, probs_fcst = named_probs_fcst, percentiles = named_quantiles) -- GitLab From c411696b66893c6f89d5098c535acc06fd8e9ffb Mon Sep 17 00:00:00 2001 From: vagudets Date: Wed, 16 Apr 2025 16:21:05 +0200 Subject: [PATCH 08/22] Comment outdir change --- modules/Crossval/Crossval_anomalies.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/Crossval/Crossval_anomalies.R b/modules/Crossval/Crossval_anomalies.R index 935106ed..c30ad595 100644 --- a/modules/Crossval/Crossval_anomalies.R +++ b/modules/Crossval/Crossval_anomalies.R @@ -257,8 +257,8 @@ Crossval_anomalies <- function(recipe, data) { "#### Anomalies and Probabilities Done #####") if (recipe$Analysis$Workflow$Anomalies$save != 'none') { info(recipe$Run$logger, "##### START SAVING ANOMALIES #####") - recipe$Run$output_dir <- paste0(recipe$Run$output_dir, - "/outputs/Anomalies/") + # recipe$Run$output_dir <- paste0(recipe$Run$output_dir, + # "/", "outputs/Anomalies/") # Save forecast if ((recipe$Analysis$Workflow$Anomalies$save %in% c('all', 'exp_only', 'fcst_only')) && !is.null(data$fcst)) { -- GitLab From 846555879db538cada762cee162f089b4de6afd3 Mon Sep 17 00:00:00 2001 From: vagudets Date: Tue, 22 Apr 2025 11:00:03 +0200 Subject: [PATCH 09/22] Test relative path --- modules/Crossval/Crossval_anomalies.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/Crossval/Crossval_anomalies.R b/modules/Crossval/Crossval_anomalies.R index bdfe76fc..dab15896 100644 --- a/modules/Crossval/Crossval_anomalies.R +++ b/modules/Crossval/Crossval_anomalies.R @@ -256,8 +256,8 @@ Crossval_anomalies <- function(recipe, data) { "#### Anomalies and Probabilities Done #####") if (recipe$Analysis$Workflow$Anomalies$save != 'none') { info(recipe$Run$logger, "##### START SAVING ANOMALIES #####") - # recipe$Run$output_dir <- paste0(recipe$Run$output_dir, - # "/", "outputs/Anomalies/") + recipe$Run$output_dir <- paste0(recipe$Run$output_dir, + "/", "outputs/Anomalies/") # Save forecast if ((recipe$Analysis$Workflow$Anomalies$save %in% c('all', 'exp_only', 'fcst_only')) && !is.null(data$fcst)) { -- GitLab From 42850e3bf38e8891ba3d7d1c7d87dcc16ad4b672 Mon Sep 17 00:00:00 2001 From: vagudets Date: Wed, 23 Apr 2025 14:11:26 +0200 Subject: [PATCH 10/22] Fix pipeline --- modules/Crossval/Crossval_anomalies.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/Crossval/Crossval_anomalies.R b/modules/Crossval/Crossval_anomalies.R index dab15896..fe0a2e05 100644 --- 
a/modules/Crossval/Crossval_anomalies.R +++ b/modules/Crossval/Crossval_anomalies.R @@ -160,7 +160,7 @@ Crossval_anomalies <- function(recipe, data) { indices = 1, drop = 'selected') recipe$Run$output_dir <- paste0(recipe$Run$output_dir, - "/outputs/Anomalies/") + "/", "outputs/Anomalies/") # Forecast anomalies: if (!is.null(data$fcst)) { @@ -256,8 +256,8 @@ Crossval_anomalies <- function(recipe, data) { "#### Anomalies and Probabilities Done #####") if (recipe$Analysis$Workflow$Anomalies$save != 'none') { info(recipe$Run$logger, "##### START SAVING ANOMALIES #####") - recipe$Run$output_dir <- paste0(recipe$Run$output_dir, - "/", "outputs/Anomalies/") + # recipe$Run$output_dir <- paste0(recipe$Run$output_dir, + # "/", "outputs/Anomalies/") # Save forecast if ((recipe$Analysis$Workflow$Anomalies$save %in% c('all', 'exp_only', 'fcst_only')) && !is.null(data$fcst)) { -- GitLab From 68bca01dfca67e572dd7518f10179983a71e512f Mon Sep 17 00:00:00 2001 From: vagudets Date: Mon, 28 Apr 2025 11:37:42 +0200 Subject: [PATCH 11/22] Add check for remove_NAs, fix bug in check for most_likely_terciles plot --- tools/check_recipe.R | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tools/check_recipe.R b/tools/check_recipe.R index 4ae1299b..7adb02de 100644 --- a/tools/check_recipe.R +++ b/tools/check_recipe.R @@ -316,6 +316,21 @@ check_recipe <- function(recipe) { # "have a 'name'".) # # are numeric? class list mode list # } + + + # --------------------------------------------------------------------- + # GENERAL ANALYSIS CHECKS + # --------------------------------------------------------------------- + # NA Values + if (is.null(recipe$Analysis$remove_NAs)) { + recipe$Analysis$remove_NAs <- FALSE + warn(recipe$Run$logger, + "Parameter 'remove_NAs' was empty, setting default value 'no/False'") + } else if (!is.logical(recipe$Analysis$remove_NAs)) { + error(recipe$Run$logger, + "Parameter 'remove_NAs' should be 'yes/True' or 'no/False'") + error_status <- TRUE + } # --------------------------------------------------------------------- # WORKFLOW CHECKS # --------------------------------------------------------------------- @@ -755,9 +770,9 @@ check_recipe <- function(recipe) { binary_indices <- which(vapply(recipe$Analysis$Workflow$Probabilities$percentiles, length, integer(1)) == 1) if ("extreme_probabilities" %in% plots && length(binary_indices) == 0) { - stop(recipe$Run$logger, - paste("'extreme_probabilities' plot has been requested, but no", - "binary categories were found in 'Probabilities:percentiles'")) + error(recipe$Run$logger, + paste("'extreme_probabilities' plot has been requested, but no", + "binary categories were found in 'Probabilities:percentiles'")) error_status <- TRUE } } -- GitLab From 15c28d0437da83d60c38a5077f2100d8bd0fa102 Mon Sep 17 00:00:00 2001 From: vagudets Date: Mon, 5 May 2025 09:43:59 +0200 Subject: [PATCH 12/22] Recipe checker: add new metrics --- tools/check_recipe.R | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tools/check_recipe.R b/tools/check_recipe.R index 7adb02de..7b20ee45 100644 --- a/tools/check_recipe.R +++ b/tools/check_recipe.R @@ -622,13 +622,16 @@ check_recipe <- function(recipe) { } # Skill - AVAILABLE_METRICS <- c("enscorr", "corr_individual_members", "rps", "rps_syear", - "rpss", "frps", "frpss", "crps", "crps_syear", + AVAILABLE_METRICS <- c("enscorr", "corr_individual_members", "rps", + "rps_syear", "cov", "covariance", "std", + "standard_deviation", "n_eff", "spread", + "rpss", "frps", 
"frpss", "crps", "crps_syear", "crpss", "bss10", "bss90", "rms", "mean_bias", "mean_bias_ss", "enssprerr", "rps_clim", "rps_clim_syear", "crps_clim", "crps_clim_syear", - "enscorr_specs", "frps_specs", "rpss_specs", - "frpss_specs", "bss10_specs", "bss90_specs", "rms") + "enscorr_specs", "frps_specs", "rpss_specs", + "frpss_specs", "bss10_specs", "bss90_specs", "rms", "rmss") + if ("Skill" %in% names(recipe$Analysis$Workflow)) { if (is.null(recipe$Analysis$Workflow$Skill$metric)) { error(recipe$Run$logger, -- GitLab From 71ef47984b7f400c9387bcdfecbc938a5562b625 Mon Sep 17 00:00:00 2001 From: vagudets Date: Fri, 16 May 2025 10:44:46 +0200 Subject: [PATCH 13/22] Add necessary s2dv functions --- modules/Crossval/R/CRPS_clim.R | 35 + modules/Crossval/R/RPS_clim.R | 39 + modules/Crossval/R/tmp/Bias.R | 216 +++ modules/Crossval/R/tmp/Corr.R | 484 ++++++ modules/Crossval/R/tmp/EOF.R | 293 ++++ modules/Crossval/R/tmp/Eno.R | 103 ++ modules/Crossval/R/tmp/NAO.R | 574 +++++++ modules/Crossval/R/tmp/ProjectField.R | 272 ++++ modules/Crossval/R/tmp/RPS.R | 408 +++++ modules/Crossval/R/tmp/RPSS.R | 638 ++++++++ modules/Crossval/R/tmp/RandomWalkTest.R | 184 +++ modules/Crossval/R/tmp/SprErr.R | 227 +++ modules/Crossval/R/tmp/Utils.R | 1885 +++++++++++++++++++++++ 13 files changed, 5358 insertions(+) create mode 100644 modules/Crossval/R/CRPS_clim.R create mode 100644 modules/Crossval/R/RPS_clim.R create mode 100644 modules/Crossval/R/tmp/Bias.R create mode 100644 modules/Crossval/R/tmp/Corr.R create mode 100644 modules/Crossval/R/tmp/EOF.R create mode 100644 modules/Crossval/R/tmp/Eno.R create mode 100644 modules/Crossval/R/tmp/NAO.R create mode 100644 modules/Crossval/R/tmp/ProjectField.R create mode 100644 modules/Crossval/R/tmp/RPS.R create mode 100644 modules/Crossval/R/tmp/RPSS.R create mode 100644 modules/Crossval/R/tmp/RandomWalkTest.R create mode 100644 modules/Crossval/R/tmp/SprErr.R create mode 100644 modules/Crossval/R/tmp/Utils.R diff --git a/modules/Crossval/R/CRPS_clim.R b/modules/Crossval/R/CRPS_clim.R new file mode 100644 index 00000000..0e6bef65 --- /dev/null +++ b/modules/Crossval/R/CRPS_clim.R @@ -0,0 +1,35 @@ +# CRPS version for climatology +CRPS_clim <- function(obs, memb_dim ='ensemble', return_mean = TRUE, clim.cross.val= TRUE){ + time_dim <- names(dim(obs)) + obs_time_len <- dim(obs)[time_dim] + + if (isFALSE(clim.cross.val)) { + # Without cross-validation + ref <- array(data = rep(obs, each = obs_time_len), + dim = c(obs_time_len, obs_time_len)) + } else if (isTRUE(clim.cross.val)) { + # With cross-validation (excluding the value of that year to create ref for that year) + ref <- array(data = NA, + dim = c(obs_time_len, obs_time_len - 1)) + for (i in 1:obs_time_len) { + ref[i, ] <- obs[-i] + } + } + + names(dim(ref)) <- c(time_dim, memb_dim) + # ref: [sdate, memb] + # obs: [sdate] + crps_ref <- s2dv:::.CRPS(exp = ref, obs = obs, + time_dim = time_dim, + memb_dim = memb_dim, + dat_dim = NULL, + Fair = FALSE) + + # crps_ref should be [sdate] + if (return_mean == TRUE) { + return(mean(crps_ref)) + } else { + return(crps_ref) + } +} + diff --git a/modules/Crossval/R/RPS_clim.R b/modules/Crossval/R/RPS_clim.R new file mode 100644 index 00000000..6deab3ec --- /dev/null +++ b/modules/Crossval/R/RPS_clim.R @@ -0,0 +1,39 @@ +# RPS version for climatology +RPS_clim <- function(obs, indices_for_clim = NULL, + prob_thresholds = c(1/3, 2/3), cross.val = TRUE, + Fair = FALSE, bin_dim_abs = NULL, return_mean = TRUE) { + if (is.null(indices_for_clim)){ + indices_for_clim <- 1:length(obs) + } + 
if (is.null(bin_dim_abs)) { + obs_probs <- .GetProbs(data = obs, indices_for_quantiles = indices_for_clim, ## temporarily removed s2dv::: + prob_thresholds = prob_thresholds, weights = NULL, + cross.val = cross.val) + } else { + obs_probs <- obs + } + # clim_probs: [bin, sdate] + clim_probs <- c(prob_thresholds[1], + diff(prob_thresholds), 1 - prob_thresholds[length(prob_thresholds)]) + clim_probs <- array(clim_probs, dim = dim(obs_probs)) + + # Calculate RPS for each time step + probs_clim_cumsum <- apply(clim_probs, 2, cumsum) + probs_obs_cumsum <- apply(obs_probs, 2, cumsum) + rps_ref <- apply((probs_clim_cumsum - probs_obs_cumsum)^2, 2, sum) + if (Fair) { # FairRPS + ## adjustment <- rowSums(-1 * (1/R - 1/R.new) * ens.cum * (R - ens.cum)/R/(R - 1)) + ## [formula taken from SpecsVerification::EnsRps] + ## See explanation in https://freva.met.fu-berlin.de/about/problems/ + R <- dim(obs)[2] #years + adjustment <- (-1) / (R - 1) * probs_clim_cumsum * (1 - probs_clim_cumsum) + adjustment <- colSums(adjustment) + rps_ref <- rps_ref + adjustment + } + + if (return_mean == TRUE) { + return(mean(rps_ref)) + } else { + return(rps_ref) + } +} diff --git a/modules/Crossval/R/tmp/Bias.R b/modules/Crossval/R/tmp/Bias.R new file mode 100644 index 00000000..b9292cae --- /dev/null +++ b/modules/Crossval/R/tmp/Bias.R @@ -0,0 +1,216 @@ +#'Compute the Mean Bias +#' +#'The Mean Bias or Mean Error (Wilks, 2011) is defined as the mean difference +#'between the ensemble mean forecast and the observations. It is a deterministic +#'metric. Positive values indicate that the forecasts are on average too high +#'and negative values indicate that the forecasts are on average too low. +#'It also allows to compute the Absolute Mean Bias or bias without temporal +#'mean. If there is more than one dataset, the result will be computed for each +#'pair of exp and obs data. +#' +#'@param exp A named numerical array of the forecast with at least time +#' dimension. +#'@param obs A named numerical array of the observation with at least time +#' dimension. The dimensions must be the same as 'exp' except 'memb_dim' and +#' 'dat_dim'. +#'@param time_dim A character string indicating the name of the time dimension. +#' The default value is 'sdate'. +#'@param dat_dim A character string indicating the name of dataset dimension. +#' The length of this dimension can be different between 'exp' and 'obs'. +#' The default value is NULL. +#'@param memb_dim A character string indicating the name of the member dimension +#' to compute the ensemble mean; it should be set to NULL if the parameter +#' 'exp' is already the ensemble mean. The default value is NULL. +#'@param na.rm A logical value indicating if NAs should be removed (TRUE) or +#' kept (FALSE) for computation. The default value is FALSE. +#'@param absolute A logical value indicating whether to compute the absolute +#' bias. The default value is FALSE. +#'@param time_mean A logical value indicating whether to compute the temporal +#' mean of the bias. The default value is TRUE. +#'@param alpha A numeric or NULL (default) to indicate the significance level using Weltch test. Only available when absolute is FALSE. +#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. +#' +#'@return +#'A numerical array of bias with dimensions c(nexp, nobs, the rest dimensions of +#''exp' except 'time_dim' (if time_mean = T) and 'memb_dim'). 
nexp is the number +#'of experiment (i.e., 'dat_dim' in exp), and nobs is the number of observation +#'(i.e., 'dat_dim' in obs). If dat_dim is NULL, nexp and nobs are omitted. If alpha is specified, and absolute is FALSE, the result is a list with two elements, the bias as describe above and the significance as logical array with the same dimensions. +#' +#'@references +#'Wilks, 2011; https://doi.org/10.1016/B978-0-12-385022-5.00008-7 +#' +#'@examples +#'exp <- array(rnorm(1000), dim = c(dat = 1, lat = 3, lon = 5, member = 10, sdate = 50)) +#'obs <- array(rnorm(1000), dim = c(dat = 1, lat = 3, lon = 5, sdate = 50)) +#'bias <- Bias(exp = exp, obs = obs, memb_dim = 'member') +#' +#'@import multiApply +#'@importFrom ClimProjDiags Subset +#'@export +Bias <- function(exp, obs, time_dim = 'sdate', memb_dim = NULL, dat_dim = NULL, na.rm = FALSE, + absolute = FALSE, time_mean = TRUE, alpha = NULL, ncores = NULL) { + + # Check inputs + ## exp and obs (1) + if (!is.array(exp) | !is.numeric(exp)) + stop("Parameter 'exp' must be a numeric array.") + if (!is.array(obs) | !is.numeric(obs)) + stop("Parameter 'obs' must be a numeric array.") + if (any(is.null(names(dim(exp)))) | any(nchar(names(dim(exp))) == 0) | + any(is.null(names(dim(obs)))) | any(nchar(names(dim(obs))) == 0)) { + stop("Parameter 'exp' and 'obs' must have dimension names.") + } + ## time_dim + if (!is.character(time_dim) | length(time_dim) != 1) + stop("Parameter 'time_dim' must be a character string.") + if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) { + stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.") + } + ## memb_dim + if (!is.null(memb_dim)) { + if (!is.character(memb_dim) | length(memb_dim) > 1) { + stop("Parameter 'memb_dim' must be a character string.") + } + if (!memb_dim %in% names(dim(exp))) { + stop("Parameter 'memb_dim' is not found in 'exp' dimension.") + } + if (memb_dim %in% names(dim(obs))) { + if (identical(as.numeric(dim(obs)[memb_dim]), 1)) { + obs <- ClimProjDiags::Subset(x = obs, along = memb_dim, indices = 1, drop = 'selected') + } else { + stop("Not implemented for observations with members ('obs' can have 'memb_dim', ", + "but it should be of length = 1).") + } + } + } + ## dat_dim + if (!is.null(dat_dim)) { + if (!is.character(dat_dim) | length(dat_dim) > 1) { + stop("Parameter 'dat_dim' must be a character string.") + } + if (!dat_dim %in% names(dim(exp)) | !dat_dim %in% names(dim(obs))) { + stop("Parameter 'dat_dim' is not found in 'exp' or 'obs' dimension.", + " Set it as NULL if there is no dataset dimension.") + } + } + ## exp and obs (2) + name_exp <- sort(names(dim(exp))) + name_obs <- sort(names(dim(obs))) + if (!is.null(memb_dim)) { + name_exp <- name_exp[-which(name_exp == memb_dim)] + } + if (!is.null(dat_dim)) { + name_exp <- name_exp[-which(name_exp == dat_dim)] + name_obs <- name_obs[-which(name_obs == dat_dim)] + } + if (!identical(length(name_exp), length(name_obs)) | + !identical(dim(exp)[name_exp], dim(obs)[name_obs])) { + stop("Parameter 'exp' and 'obs' must have same length of ", + "all dimensions except 'memb_dim' and 'dat_dim'.") + } + ## na.rm + if (!is.logical(na.rm) | length(na.rm) > 1) { + stop("Parameter 'na.rm' must be one logical value.") + } + ## absolute + if (!is.logical(absolute) | length(absolute) > 1) { + stop("Parameter 'absolute' must be one logical value.") + } + ## time_mean + if (!is.logical(time_mean) | length(time_mean) > 1) { + stop("Parameter 'time_mean' must be one logical value.") + } + ## alpha + if (!is.null(alpha)) { + 
if (!is.numeric(alpha) | length(alpha) > 1) { + stop("Parameter 'alpha' must be null or a numeric value.") + } + } + ## ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | + length(ncores) > 1) { + stop("Parameter 'ncores' must be either NULL or a positive integer.") + } + } + + ############################### + + ## Ensemble mean + if (!is.null(memb_dim)) { + exp <- MeanDims(exp, memb_dim, na.rm = na.rm) + } + + ## (Mean) Bias + bias <- Apply(data = list(exp, obs), + target_dims = c(time_dim, dat_dim), + fun = .Bias, + time_dim = time_dim, + dat_dim = dat_dim, + na.rm = na.rm, + absolute = absolute, + time_mean = time_mean, + alpha = alpha, + ncores = ncores) + + if (is.null(alpha)) { + bias <- bias$output1 + } + return(bias) +} + + +.Bias <- function(exp, obs, time_dim = 'sdate', dat_dim = NULL, na.rm = FALSE, + absolute = FALSE, time_mean = TRUE, alpha = NULL) { + # exp and obs: [sdate, (dat)] + if (is.null(dat_dim)) { + bias <- exp - obs + + if (isTRUE(absolute)) { + bias <- abs(bias) + } + + if (isTRUE(time_mean)) { + bias <- mean(bias, na.rm = na.rm) + } + + if (!is.null(alpha)) { + if (!absolute) { + pval <- t.test(x = obs, y = exp, alternative = "two.sided")$p.value + sig <- pval <= alpha + } + } + } else { + nexp <- as.numeric(dim(exp)[dat_dim]) + nobs <- as.numeric(dim(obs)[dat_dim]) + bias <- array(dim = c(dim(exp)[time_dim], nexp = nexp, nobs = nobs)) + pval <- array(dim = c(nexp = nexp, nobs = nobs)) + sig <- array(dim = c(nexp = nexp, nobs = nobs)) + for (i in 1:nexp) { + for (j in 1:nobs) { + bias[, i, j] <- exp[, i] - obs[, j] + if (!is.null(alpha)) { + if (!absolute) { + pval[i,j] <- t.test(x = obs[,j], y = exp[,i], + alternative = "two.sided")$p.value + sig[i,j] <- pval <= alpha + } + } + } + } + + if (isTRUE(absolute)) { + bias <- abs(bias) + } + + if (isTRUE(time_mean)) { + bias <- MeanDims(bias, time_dim, na.rm = na.rm) + } + } + if (!is.null(alpha) && !absolute) { + return(list(bias = bias, sig = sig)) + } else { + return(bias) + } +} diff --git a/modules/Crossval/R/tmp/Corr.R b/modules/Crossval/R/tmp/Corr.R new file mode 100644 index 00000000..744ff109 --- /dev/null +++ b/modules/Crossval/R/tmp/Corr.R @@ -0,0 +1,484 @@ +#'Compute the correlation coefficient between an array of forecast and their corresponding observation +#' +#'Calculate the correlation coefficient (Pearson, Kendall or Spearman) for +#'an array of forecast and an array of observation. The correlations are +#'computed along 'time_dim' that usually refers to the start date dimension. If +#''comp_dim' is given, the correlations are computed only if obs along comp_dim +#'dimension are complete between limits[1] and limits[2], i.e., there is no NA +#'between limits[1] and limits[2]. This option can be activated if the user +#'wants to account only for the forecasts which the corresponding observations +#'are available at all leadtimes.\cr +#'The confidence interval is computed by the Fisher transformation and the +#'significance level relies on an one-sided student-T distribution.\cr +#'The function can calculate ensemble mean before correlation by 'memb_dim' +#'specified and 'memb = F'. If ensemble mean is not calculated, correlation will +#'be calculated for each member. +#'If there is only one dataset for exp and obs, you can simply use cor() to +#'compute the correlation. +#' +#'@param exp A named numeric array of experimental data, with at least dimension +#' 'time_dim'. 
+#'@param obs A named numeric array of observational data, same dimensions as +#' parameter 'exp' except along 'dat_dim' and 'memb_dim'. +#'@param time_dim A character string indicating the name of dimension along +#' which the correlations are computed. The default value is 'sdate'. +#'@param dat_dim A character string indicating the name of dataset (nobs/nexp) +#' dimension. The default value is NULL (no dataset). +#'@param comp_dim A character string indicating the name of dimension along which +#' obs is taken into account only if it is complete. The default value +#' is NULL. +#'@param limits A vector of two integers indicating the range along comp_dim to +#' be completed. The default is c(1, length(comp_dim dimension)). +#'@param method A character string indicating the type of correlation: +#' 'pearson', 'spearman', or 'kendall'. The default value is 'pearson'. +#'@param memb_dim A character string indicating the name of the member +#' dimension. It must be one dimension in 'exp' and 'obs'. If there is no +#' member dimension, set NULL. The default value is NULL. +#'@param memb A logical value indicating whether to remain 'memb_dim' dimension +#' (TRUE) or do ensemble mean over 'memb_dim' (FALSE). Only functional when +#' 'memb_dim' is not NULL. The default value is TRUE. +#'@param pval A logical value indicating whether to return or not the p-value +#' of the test Ho: Corr = 0. The default value is TRUE. +#'@param conf A logical value indicating whether to return or not the confidence +#' intervals. The default value is TRUE. +#'@param sign A logical value indicating whether to retrieve the statistical +#' significance of the test Ho: Corr = 0 based on 'alpha'. The default value is +#' FALSE. +#'@param alpha A numeric indicating the significance level for the statistical +#' significance test. The default value is 0.05. +#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. +#' +#'@return +#'A list containing the numeric arrays with dimension:\cr +#' c(nexp, nobs, exp_memb, obs_memb, all other dimensions of exp except +#' time_dim and memb_dim).\cr +#'nexp is the number of experiment (i.e., 'dat_dim' in exp), and nobs is the +#'number of observation (i.e., 'dat_dim' in obs). If dat_dim is NULL, nexp and +#'nobs are omitted. exp_memb is the number of member in experiment (i.e., +#''memb_dim' in exp) and obs_memb is the number of member in observation (i.e., +#''memb_dim' in obs). If memb = F, exp_memb and obs_memb are omitted.\cr\cr +#'\item{$corr}{ +#' The correlation coefficient. +#'} +#'\item{$p.val}{ +#' The p-value. Only present if \code{pval = TRUE}. +#'} +#'\item{$conf.lower}{ +#' The lower confidence interval. Only present if \code{conf = TRUE}. +#'} +#'\item{$conf.upper}{ +#' The upper confidence interval. Only present if \code{conf = TRUE}. +#'} +#'\item{$sign}{ +#' The statistical significance. Only present if \code{sign = TRUE}. 
+#'} +#' +#'@examples +#'# Case 1: Load sample data as in Load() example: +#'example(Load) +#'clim <- Clim(sampleData$mod, sampleData$obs) +#'ano_exp <- Ano(sampleData$mod, clim$clim_exp) +#'ano_obs <- Ano(sampleData$obs, clim$clim_obs) +#'runmean_months <- 12 +#' +#'# Smooth along lead-times +#'smooth_ano_exp <- Smoothing(ano_exp, runmeanlen = runmean_months) +#'smooth_ano_obs <- Smoothing(ano_obs, runmeanlen = runmean_months) +#'required_complete_row <- 3 # Discard start dates which contain any NA lead-times +#'leadtimes_per_startdate <- 60 +#'corr <- Corr(MeanDims(smooth_ano_exp, 'member'), +#' MeanDims(smooth_ano_obs, 'member'), +#' comp_dim = 'ftime', dat_dim = 'dataset', +#' limits = c(ceiling((runmean_months + 1) / 2), +#' leadtimes_per_startdate - floor(runmean_months / 2))) +#' +#'# Case 2: Keep member dimension +#'corr <- Corr(smooth_ano_exp, smooth_ano_obs, memb_dim = 'member', dat_dim = 'dataset') +#'# ensemble mean +#'corr <- Corr(smooth_ano_exp, smooth_ano_obs, memb_dim = 'member', memb = FALSE, +#' dat_dim = 'dataset') +#' +#'@import multiApply +#'@importFrom ClimProjDiags Subset +#'@importFrom stats cor pt qnorm +#'@export +Corr <- function(exp, obs, time_dim = 'sdate', dat_dim = NULL, + comp_dim = NULL, limits = NULL, method = 'pearson', + memb_dim = NULL, memb = TRUE, + pval = TRUE, conf = TRUE, sign = FALSE, + alpha = 0.05, ncores = NULL) { + + # Check inputs + ## exp and obs (1) + if (is.null(exp) | is.null(obs)) { + stop("Parameter 'exp' and 'obs' cannot be NULL.") + } + if (!is.numeric(exp) | !is.numeric(obs)) { + stop("Parameter 'exp' and 'obs' must be a numeric array.") + } + if (is.null(dim(exp)) | is.null(dim(obs))) { + stop(paste0("Parameter 'exp' and 'obs' must be at least two dimensions ", + "containing time_dim and dat_dim.")) + } + if(any(is.null(names(dim(exp))))| any(nchar(names(dim(exp))) == 0) | + any(is.null(names(dim(obs))))| any(nchar(names(dim(obs))) == 0)) { + stop("Parameter 'exp' and 'obs' must have dimension names.") + } + ## time_dim + if (!is.character(time_dim) | length(time_dim) > 1) { + stop("Parameter 'time_dim' must be a character string.") + } + if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) { + stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.") + } + ## dat_dim + if (!is.null(dat_dim)) { + if (!is.character(dat_dim) | length(dat_dim) > 1) { + stop("Parameter 'dat_dim' must be a character string or NULL.") + } + if (!dat_dim %in% names(dim(exp)) | !dat_dim %in% names(dim(obs))) { + stop("Parameter 'dat_dim' is not found in 'exp' or 'obs' dimension.", + " Set it as NULL if there is no dataset dimension.") + } + } + ## comp_dim + if (!is.null(comp_dim)) { + if (!is.character(comp_dim) | length(comp_dim) > 1) { + stop("Parameter 'comp_dim' must be a character string.") + } + if (!comp_dim %in% names(dim(exp)) | !comp_dim %in% names(dim(obs))) { + stop("Parameter 'comp_dim' is not found in 'exp' or 'obs' dimension.") + } + } + ## limits + if (!is.null(limits)) { + if (is.null(comp_dim)) { + stop("Paramter 'comp_dim' cannot be NULL if 'limits' is assigned.") + } + if (!is.numeric(limits) | any(limits %% 1 != 0) | any(limits < 0) | + length(limits) != 2 | any(limits > dim(exp)[comp_dim])) { + stop(paste0("Parameter 'limits' must be a vector of two positive ", + "integers smaller than the length of paramter 'comp_dim'.")) + } + } + ## method + if (!(method %in% c("kendall", "spearman", "pearson"))) { + stop("Parameter 'method' must be one of 'kendall', 'spearman' or 'pearson'.") + } + ## memb_dim + if 
(!is.null(memb_dim)) { + if (!is.character(memb_dim) | length(memb_dim) > 1) { + stop("Parameter 'memb_dim' must be a character string.") + } + if (!memb_dim %in% names(dim(exp)) & !memb_dim %in% names(dim(obs))) { + stop("Parameter 'memb_dim' is not found in 'exp' nor 'obs' dimension. Set it as NULL if there is no member dimension.") + } + # Add [member = 1] + if (memb_dim %in% names(dim(exp)) & !memb_dim %in% names(dim(obs))) { + dim(obs) <- c(dim(obs), 1) + names(dim(obs))[length(dim(obs))] <- memb_dim + } + if (!memb_dim %in% names(dim(exp)) & memb_dim %in% names(dim(obs))) { + dim(exp) <- c(dim(exp), 1) + names(dim(exp))[length(dim(exp))] <- memb_dim + } + } + ## memb + if (!is.logical(memb) | length(memb) > 1) { + stop("Parameter 'memb' must be one logical value.") + } + ## pval + if (!is.logical(pval) | length(pval) > 1) { + stop("Parameter 'pval' must be one logical value.") + } + ## conf + if (!is.logical(conf) | length(conf) > 1) { + stop("Parameter 'conf' must be one logical value.") + } + ## sign + if (!is.logical(sign) | length(sign) > 1) { + stop("Parameter 'sign' must be one logical value.") + } + ## alpha + if (!is.numeric(alpha) | alpha < 0 | alpha > 1 | length(alpha) > 1) { + stop("Parameter 'alpha' must be a numeric number between 0 and 1.") + } + ## ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | + length(ncores) > 1) { + stop("Parameter 'ncores' must be a positive integer.") + } + } + ## exp and obs (2) + name_exp <- sort(names(dim(exp))) + name_obs <- sort(names(dim(obs))) + if (!is.null(dat_dim)) { + name_exp <- name_exp[-which(name_exp == dat_dim)] + name_obs <- name_obs[-which(name_obs == dat_dim)] + } + if (!is.null(memb_dim)) { + name_exp <- name_exp[-which(name_exp == memb_dim)] + name_obs <- name_obs[-which(name_obs == memb_dim)] + } + if (!identical(dim(exp)[name_exp], dim(obs)[name_obs])) { + stop(paste0("Parameter 'exp' and 'obs' must have same length of ", + "all dimension except 'dat_dim' and 'memb_dim'.")) + } + if (dim(exp)[time_dim] < 3) { + stop("The length of time_dim must be at least 3 to compute correlation.") + } + + + ############################### + # Sort dimension + name_exp <- names(dim(exp)) + name_obs <- names(dim(obs)) + order_obs <- match(name_exp, name_obs) + obs <- Reorder(obs, order_obs) + + + ############################### + # Calculate Corr + + # Remove data along comp_dim dim if there is at least one NA between limits + if (!is.null(comp_dim)) { + pos <- which(names(dim(obs)) == comp_dim) + if (is.null(limits)) { + obs_sub <- obs + } else { + obs_sub <- ClimProjDiags::Subset(obs, pos, list(limits[1]:limits[2])) + } + outrows <- is.na(MeanDims(obs_sub, pos, na.rm = FALSE)) + outrows <- InsertDim(outrows, pos, dim(obs)[comp_dim]) + obs[which(outrows)] <- NA + rm(obs_sub, outrows) + } + if (!is.null(memb_dim)) { + if (!memb) { #ensemble mean + exp <- MeanDims(exp, memb_dim, na.rm = TRUE) + obs <- MeanDims(obs, memb_dim, na.rm = TRUE) +# name_exp <- names(dim(exp)) +# margin_dims_ind <- c(1:length(name_exp))[-which(name_exp == memb_dim)] +# exp <- apply(exp, margin_dims_ind, mean, na.rm = TRUE) #NOTE: remove NAs here +# obs <- apply(obs, margin_dims_ind, mean, na.rm = TRUE) + memb_dim <- NULL + } + } + + res <- Apply(list(exp, obs), + target_dims = list(c(time_dim, dat_dim, memb_dim), + c(time_dim, dat_dim, memb_dim)), + fun = .Corr, + dat_dim = dat_dim, memb_dim = memb_dim, + time_dim = time_dim, method = method, + pval = pval, conf = conf, sign = sign, alpha = alpha, + ncores = ncores) + 
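  ## The returned 'res' is a named list of arrays: $corr always, plus
  ## $p.val, $conf.lower/$conf.upper and $sign depending on 'pval', 'conf'
  ## and 'sign'. For hypothetical inputs exp and obs of dimension
  ## c(sdate = 20, lat = 3) with dat_dim and memb_dim NULL, res$corr has
  ## dimension c(lat = 3).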
+ return(res) +} + +.Corr <- function(exp, obs, dat_dim = NULL, memb_dim = 'member', + time_dim = 'sdate', method = 'pearson', + conf = TRUE, pval = TRUE, sign = FALSE, alpha = 0.05) { + + if (is.null(dat_dim)) { + nexp <- 1 + nobs <- 1 + } else { + nexp <- as.numeric(dim(exp)[dat_dim]) + nobs <- as.numeric(dim(obs)[dat_dim]) + } + + if (is.null(memb_dim)) { + CORR <- array(dim = c(nexp = nexp, nobs = nobs)) + + if (is.null(dat_dim)) { + # exp: [sdate] + # obs: [sdate] + if (any(!is.na(exp)) && sum(!is.na(obs)) > 2) { + CORR[, ] <- cor(exp, obs, use = "pairwise.complete.obs", method = method) + } + } else { + # exp: [sdate, dat_exp] + # obs: [sdate, dat_obs] + for (j in 1:nobs) { + for (y in 1:nexp) { + if (any(!is.na(exp[, y])) && sum(!is.na(obs[, j])) > 2) { + CORR[y, j] <- cor(exp[, y], obs[, j], + use = "pairwise.complete.obs", + method = method) + } + } + } +#---------------------------------------- +# Same as above calculation. +#TODO: Compare which is faster. +# CORR <- sapply(1:nobs, function(i) { +# sapply(1:nexp, function (x) { +# if (any(!is.na(exp[, x])) && sum(!is.na(obs[, i])) > 2) { +# cor(exp[, x], obs[, i], +# use = "pairwise.complete.obs", +# method = method) +# } else { +# NA +# } +# }) +# }) +#----------------------------------------- + } + + } else { # memb_dim != NULL + exp_memb <- as.numeric(dim(exp)[memb_dim]) # memb_dim + obs_memb <- as.numeric(dim(obs)[memb_dim]) + + CORR <- array(dim = c(nexp = nexp, nobs = nobs, exp_memb = exp_memb, obs_memb = obs_memb)) + + if (is.null(dat_dim)) { + # exp: [sdate, memb_exp] + # obs: [sdate, memb_obs] + for (j in 1:obs_memb) { + for (y in 1:exp_memb) { + + if (any(!is.na(exp[,y])) && sum(!is.na(obs[, j])) > 2) { + CORR[, , y, j] <- cor(exp[, y], obs[, j], + use = "pairwise.complete.obs", + method = method) + } + + } + } + } else { + # exp: [sdate, dat_exp, memb_exp] + # obs: [sdate, dat_obs, memb_obs] + for (j in 1:obs_memb) { + for (y in 1:exp_memb) { + CORR[, , y, j] <- sapply(1:nobs, function(i) { + sapply(1:nexp, function (x) { + if (any(!is.na(exp[, x, y])) && sum(!is.na(obs[, i, j])) > 2) { + cor(exp[, x, y], obs[, i, j], + use = "pairwise.complete.obs", + method = method) + } else { + NA + } + }) + }) + + } + } + } + + } + + +# if (pval) { +# for (i in 1:nobs) { +# p.val[, i] <- try(sapply(1:nexp, +# function(x) {(cor.test(exp[, x], obs[, i], +# use = "pairwise.complete.obs", +# method = method)$p.value)/2}), silent = TRUE) +# if (class(p.val[, i]) == 'character') { +# p.val[, i] <- NA +# } +# } +# } + + if (pval || conf || sign) { + if (method == "kendall" | method == "spearman") { + if (!is.null(dat_dim) | !is.null(memb_dim)) { + tmp <- apply(obs, c(1:length(dim(obs)))[-1], rank) # for memb_dim = NULL, 2; for memb_dim, c(2, 3) + names(dim(tmp))[1] <- time_dim + eno <- Eno(tmp, time_dim) + } else { + tmp <- rank(obs) + tmp <- array(tmp) + names(dim(tmp)) <- time_dim + eno <- Eno(tmp, time_dim) + } + } else if (method == "pearson") { + eno <- Eno(obs, time_dim) + } + + if (is.null(memb_dim)) { + eno_expand <- array(dim = c(nexp = nexp, nobs = nobs)) + for (i in 1:nexp) { + eno_expand[i, ] <- eno + } + } else { #member + eno_expand <- array(dim = c(nexp = nexp, nobs = nobs, exp_memb = exp_memb, obs_memb = obs_memb)) + for (i in 1:nexp) { + for (j in 1:exp_memb) { + eno_expand[i, , j, ] <- eno + } + } + } + + } + +#############old################# +#This doesn't return error but it's diff from cor.test() when method is spearman and kendall + if (pval || sign) { + t <- sqrt(CORR * CORR * (eno_expand - 2) / (1 - (CORR ^ 2))) 
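    # Significance of r under H0: r = 0, using the effective sample size:
    #   t = |r| * sqrt((n_eff - 2) / (1 - r^2)) ~ Student-t(n_eff - 2),
    # with a one-sided p-value taken from the upper tail below.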
+ p.val <- pt(t, eno_expand - 2, lower.tail = FALSE) + if (sign) signif <- !is.na(p.val) & p.val <= alpha + } +################################### + if (conf) { + conf.lower <- alpha / 2 + conf.upper <- 1 - conf.lower + suppressWarnings({ + conflow <- tanh(atanh(CORR) + qnorm(conf.lower) / sqrt(eno_expand - 3)) + confhigh <- tanh(atanh(CORR) + qnorm(conf.upper) / sqrt(eno_expand - 3)) + }) + } + +################################### + # Remove nexp and nobs if dat_dim = NULL + if (is.null(dat_dim)) { +# if (is.null(dat_dim) & !is.null(memb_dim)) { + + if (length(dim(CORR)) == 2) { + dim(CORR) <- NULL + if (pval) { + dim(p.val) <- NULL + } + if (conf) { + dim(conflow) <- NULL + dim(confhigh) <- NULL + } + if (sign) { + dim(signif) <- NULL + } + } else { + dim(CORR) <- dim(CORR)[3:length(dim(CORR))] + if (pval) { + dim(p.val) <- dim(p.val)[3:length(dim(p.val))] + } + if (conf) { + dim(conflow) <- dim(conflow)[3:length(dim(conflow))] + dim(confhigh) <- dim(confhigh)[3:length(dim(confhigh))] + } + if (sign) { + dim(signif) <- dim(signif)[3:length(dim(signif))] + } + } + } + +################################### + + res <- list(corr = CORR) + if (pval) { + res <- c(res, list(p.val = p.val)) + } + if (conf) { + res <- c(res, list(conf.lower = conflow, conf.upper = confhigh)) + } + if (sign) { + res <- c(res, list(sign = signif)) + } + + return(res) + +} diff --git a/modules/Crossval/R/tmp/EOF.R b/modules/Crossval/R/tmp/EOF.R new file mode 100644 index 00000000..87795b66 --- /dev/null +++ b/modules/Crossval/R/tmp/EOF.R @@ -0,0 +1,293 @@ +#'Area-weighted empirical orthogonal function analysis using SVD +#' +#'Perform an area-weighted EOF analysis using single value decomposition (SVD) +#'based on a covariance matrix or a correlation matrix if parameter 'corr' is +#'set to TRUE. +#' +#'@param ano A numerical array of anomalies with named dimensions to calculate +#' EOF. The dimensions must have at least 'time_dim' and 'space_dim'. NAs +#' could exist but it should be consistent along time_dim. That is, if one grid +#' point has NAs, all the time steps at this point should be NAs. +#'@param lat A vector of the latitudes of 'ano'. +#'@param lon A vector of the longitudes of 'ano'. +#'@param time_dim A character string indicating the name of the time dimension +#' of 'ano'. The default value is 'sdate'. +#'@param space_dim A vector of two character strings. The first is the dimension +#' name of latitude of 'ano' and the second is the dimension name of longitude +#' of 'ano'. The default value is c('lat', 'lon'). +#'@param neofs A positive integer of the modes to be kept. The default value is +#' 15. If time length or the product of the length of space_dim is smaller than +#' neofs, neofs will be changed to the minimum of the three values. +#'@param corr A logical value indicating whether to base on a correlation (TRUE) +#' or on a covariance matrix (FALSE). The default value is FALSE. +#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. +#' +#'@return +#'A list containing: +#'\item{EOFs}{ +#' An array of EOF patterns normalized to 1 (unitless) with dimensions +#' (number of modes, rest of the dimensions of 'ano' except 'time_dim'). +#' Multiplying \code{EOFs} by \code{PCs} gives the original reconstructed +#' field. +#'} +#'\item{PCs}{ +#' An array of principal components with the units of the original field to +#' the power of 2, with dimensions (time_dim, number of modes, rest of the +#' dimensions of 'ano' except 'space_dim'). 
+#' 'PCs' contains already the percentage of explained variance so, +#' to reconstruct the original field it's only needed to multiply 'EOFs' +#' by 'PCs'. +#'} +#'\item{var}{ +#' An array of the percentage (%) of variance fraction of total variance +#' explained by each mode (number of modes). The dimensions are (number of +#' modes, rest of the dimensions of 'ano' except 'time_dim' and 'space_dim'). +#'} +#'\item{mask}{ +#' An array of the mask with dimensions (space_dim, rest of the dimensions of +#' 'ano' except 'time_dim'). It is made from 'ano', 1 for the positions that +#' 'ano' has value and NA for the positions that 'ano' has NA. It is used to +#' replace NAs with 0s for EOF calculation and mask the result with NAs again +#' after the calculation. +#'} +#'\item{wght}{ +#' An array of the area weighting with dimensions 'space_dim'. It is calculated +#' by cosine of 'lat' and used to compute the fraction of variance explained by +#' each EOFs. +#'} +#'\item{tot_var}{ +#' A number or a numeric array of the total variance explained by all the modes. +#' The dimensions are same as 'ano' except 'time_dim' and 'space_dim'. +#'} +#' +#'@seealso ProjectField, NAO, PlotBoxWhisker +#'@examples +#'# This example computes the EOFs along forecast horizons and plots the one +#'# that explains the greatest amount of variability. The example data has low +#'# resolution so the result may not be explanatory, but it displays how to +#'# use this function. +#'\dontshow{ +#'startDates <- c('19851101', '19901101', '19951101', '20001101', '20051101') +#'sampleData <- s2dv:::.LoadSampleData('tos', c('experiment'), +#' c('observation'), startDates, +#' leadtimemin = 1, +#' leadtimemax = 4, +#' output = 'lonlat', +#' latmin = 27, latmax = 48, +#' lonmin = -12, lonmax = 40) +#'} +#'ano <- Ano_CrossValid(sampleData$mod, sampleData$obs) +#'tmp <- MeanDims(ano$exp, c('dataset', 'member')) +#'ano <- tmp[1, , ,] +#'names(dim(ano)) <- names(dim(tmp))[-2] +#'eof <- EOF(ano, sampleData$lat, sampleData$lon) +#'\dontrun{ +#'PlotEquiMap(eof$EOFs[1, , ], sampleData$lon, sampleData$lat) +#'} +#' +#'@import multiApply +#'@importFrom stats sd +#'@export +EOF <- function(ano, lat, lon, time_dim = 'sdate', space_dim = c('lat', 'lon'), + neofs = 15, corr = FALSE, ncores = NULL) { + + # Check inputs + ## ano + if (is.null(ano)) { + stop("Parameter 'ano' cannot be NULL.") + } + if (!is.numeric(ano)) { + stop("Parameter 'ano' must be a numeric array.") + } + if (any(is.null(names(dim(ano)))) | any(nchar(names(dim(ano))) == 0)) { + stop("Parameter 'ano' must have dimension names.") + } + ## time_dim + if (!is.character(time_dim) | length(time_dim) > 1) { + stop("Parameter 'time_dim' must be a character string.") + } + if (!time_dim %in% names(dim(ano))) { + stop("Parameter 'time_dim' is not found in 'ano' dimension.") + } + ## space_dim + if (!is.character(space_dim) | length(space_dim) != 2) { + stop("Parameter 'space_dim' must be a character vector of 2.") + } + if (!all(space_dim %in% names(dim(ano)))) { + stop("Parameter 'space_dim' is not found in 'ano' dimension.") + } + ## lat + if (!is.numeric(lat) | length(lat) != dim(ano)[space_dim[1]]) { + stop("Parameter 'lat' must be a numeric vector with the same ", + "length as the latitude dimension of 'ano'.") + } + if (any(lat > 90 | lat < -90)) { + stop("Parameter 'lat' must contain values within the range [-90, 90].") + } + ## lon + if (!is.numeric(lon) | length(lon) != dim(ano)[space_dim[2]]) { + stop("Parameter 'lon' must be a numeric vector with the same ", + "length as 
the longitude dimension of 'ano'.")
+  }
+  if (any(lon > 360 | lon < -360)) {
+    .warning("Some 'lon' is out of the range [-360, 360].")
+  }
+  ## neofs
+  if (!is.numeric(neofs) | neofs %% 1 != 0 | neofs <= 0 | length(neofs) > 1) {
+    stop("Parameter 'neofs' must be a positive integer.")
+  }
+  ## corr
+  if (!is.logical(corr) | length(corr) > 1) {
+    stop("Parameter 'corr' must be one logical value.")
+  }
+  ## ncores
+  if (!is.null(ncores)) {
+    if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 |
+        length(ncores) > 1) {
+      stop("Parameter 'ncores' must be a positive integer.")
+    }
+  }
+
+  ###############################
+  # Calculate EOF
+
+#  # Replace mask of NAs with 0s for EOF analysis.
+#  ano[!is.finite(ano)] <- 0
+
+  # Area weighting. Weights for EOF; needed to compute the
+  # fraction of variance explained by each EOF
+  space_ind <- sapply(space_dim, function(a) which(names(dim(ano)) == a))
+  wght <- array(cos(lat * pi / 180), dim = dim(ano)[space_ind])
+
+  # We want the covariance matrix to be weighted by the grid
+  # cell area so the anomaly field is weighted by its square
+  # root since the covariance matrix equals transpose(ano)
+  # times ano.
+  wght <- sqrt(wght)
+
+  # neofs is bounded
+  if (neofs != min(dim(ano)[time_dim], prod(dim(ano)[space_dim]), neofs)) {
+    neofs <- min(dim(ano)[time_dim], prod(dim(ano)[space_dim]), neofs)
+    .warning(paste0("Parameter 'neofs' is changed to ", neofs, ", the minimum among ",
+                    "the length of time_dim, the product of the lengths of space_dim, ",
+                    "and neofs."))
+  }
+
+  res <- Apply(ano,
+               target_dims = c(time_dim, space_dim),
+               output_dims = list(EOFs = c('mode', space_dim),
+                                  PCs = c(time_dim, 'mode'),
+                                  var = 'mode',
+                                  tot_var = NULL,
+                                  mask = space_dim),
+               fun = .EOF,
+               corr = corr, neofs = neofs,
+               wght = wght,
+               ncores = ncores)
+
+  return(c(res, wght = list(wght)))
+
+}
+
+.EOF <- function(ano, neofs = 15, corr = FALSE, wght = wght) {
+  # ano: [time, lat, lon]
+
+  # Dimensions
+  nt <- dim(ano)[1]
+  ny <- dim(ano)[2]
+  nx <- dim(ano)[3]
+
+  # Check that the time steps at each grid point are NA-consistent:
+  # a grid point should have either all NAs or no NA along the time dim.
+  if (anyNA(ano)) {
+    ano_latlon <- array(ano, dim = c(nt, ny * nx))  # [time, lat*lon]
+    na_ind <- which(is.na(ano_latlon), arr.ind = T)
+    if (dim(na_ind)[1] != nt * length(unique(na_ind[, 2]))) {
+      stop("Detected grid points with NAs that are not consistent across the time ",
+           "dimension. If a grid point has NA, it must be NA at all time steps.")
+    }
+  }
+
+  # Build the mask
+  mask <- ano[1, , ]
+  mask[!is.finite(mask)] <- NA
+  mask[is.finite(mask)] <- 1
+  dim(mask) <- c(ny, nx)
+
+  # Replace mask of NAs with 0s for EOF analysis.
+  ano[!is.finite(ano)] <- 0
+
+  ano <- ano * InsertDim(wght, 1, nt)
+
+  # The use of the correlation matrix is done under the option corr.
+  if (corr) {
+    stdv <- apply(ano, c(2, 3), sd, na.rm = T)
+    ano <- ano / InsertDim(stdv, 1, nt)
+  }
+
+  # Time/space matrix for SVD
+  dim(ano) <- c(nt, ny * nx)
+  dim.dat <- dim(ano)
+
+  # 'transpose' means the array needs to be transposed before
+  # calling La.svd for computational efficiency because the
+  # spatial dimension is larger than the time dimension. This
+  # goes with transposing the outputs of La.svd also.
+  if (dim.dat[2] > dim.dat[1]) {
+    transpose <- TRUE
+  } else {
+    transpose <- FALSE
+  }
+  if (transpose) {
+    pca <- La.svd(t(ano))
+  } else {
+    pca <- La.svd(ano)
+  }
+
+  # La.svd conventions for the decomposition X = U D t(V):
+  # La.svd$u returns U, La.svd$d returns the diagonal values of D, and
+  # La.svd$v returns t(V). The usual convention is PC = U and EOF = V.
+  # If La.svd is called on ano (transpose = FALSE case): EOFs: $v, PCs: $u.
+  # If La.svd is called on t(ano) (transpose = TRUE case): EOFs: t($u), PCs: t($v).
+
+  if (transpose) {
+    pca.EOFs <- t(pca$u)
+    pca.PCs <- t(pca$v)
+  } else {
+    pca.EOFs <- pca$v
+    pca.PCs <- pca$u
+  }
+
+  # The number of retained modes is limited to neofs
+  PC <- pca.PCs[, 1:neofs]
+  EOF <- pca.EOFs[1:neofs, ]
+  dim(EOF) <- c(neofs, ny, nx)
+
+  # Restore the matrix structure when neofs = 1, since subsetting drops the
+  # mode dimension.
+  if (neofs == 1) {
+    PC <- InsertDim(PC, 2, 1, name = 'new')
+  }
+
+  # Computation of the % of variance associated with each mode
+  W <- pca$d[1:neofs]
+  tot.var <- sum(pca$d^2)
+  var.eof <- 100 * pca$d[1:neofs]^2 / tot.var
+
+  for (e in 1:neofs) {
+    # Set all masked grid points to NA in the EOFs
+    # Divide patterns by area weights so that EOF * PC gives unweighted (original) data
+    EOF[e, , ] <- EOF[e, , ] * mask / wght
+    # PC is multiplied by the explained variance,
+    # so that the reconstruction is only EOF * PC
+    PC[, e] <- PC[, e] * W[e]
+  }
+
+  if (neofs == 1) {
+    var.eof <- as.array(var.eof)
+  }
+
+  return(invisible(list(EOFs = EOF, PCs = PC, var = var.eof, tot_var = tot.var, mask = mask)))
+}
+
diff --git a/modules/Crossval/R/tmp/Eno.R b/modules/Crossval/R/tmp/Eno.R
new file mode 100644
index 00000000..cb927602
--- /dev/null
+++ b/modules/Crossval/R/tmp/Eno.R
@@ -0,0 +1,103 @@
+#'Compute effective sample size with classical method
+#'
+#'Compute the number of effective samples along one dimension of an array. This
+#'effective number of independent observations can be used in
+#'statistical/inference tests.\cr
+#'The calculation is based on the 'eno' function by Caio Coelho from rclim.txt.
+#'
+#'@param data A numeric array with named dimensions.
+#'@param time_dim A character string indicating the dimension along which to
+#' compute the effective sample size. The default value is 'sdate'.
+#'@param na.action A function. It can be na.pass (missing values are allowed)
+#' or na.fail (no missing values are allowed). See details in stats::acf().
+#' The default value is na.pass.
+#'@param ncores An integer indicating the number of cores to use for parallel
+#' computation. The default value is NULL.
+#'
+#'@return An array with the same dimensions as parameter 'data' except the
+#' time_dim dimension, which is removed after the computation. The array
+#' indicates the number of effective samples along time_dim.
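+#'
+#'@details The effective sample size implemented in .Eno() below follows the
+#' classical large-lag estimate,
+#' \deqn{n_{eff} = \min\left(\frac{n}{1 + 2 \sum_{k=1}^{n-1} \frac{n-k}{n} \rho_k}, n\right)}
+#' where \eqn{\rho_k} is the lag-k autocorrelation of the series and n is the
+#' number of non-NA values.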
+#'
+#'@examples
+#'set.seed(1)
+#'data <- array(rnorm(800), dim = c(dataset = 1, member = 2, sdate = 4,
+#'                                  ftime = 4, lat = 10, lon = 10))
+#'na <- floor(runif(40, min = 1, max = 800))
+#'data[na] <- NA
+#'res <- Eno(data)
+#'
+#'@importFrom stats acf na.pass na.fail
+#'@import multiApply
+#'@export
+Eno <- function(data, time_dim = 'sdate', na.action = na.pass, ncores = NULL) {
+
+  # Check inputs
+  ## data
+  if (is.null(data)) {
+    stop("Parameter 'data' cannot be NULL.")
+  }
+  if (!is.numeric(data)) {
+    stop("Parameter 'data' must be a numeric array.")
+  }
+  if (is.null(dim(data))) { #is vector
+    dim(data) <- c(length(data))
+    names(dim(data)) <- time_dim
+  }
+  if (any(is.null(names(dim(data)))) | any(nchar(names(dim(data))) == 0)) {
+    stop("Parameter 'data' must have dimension names.")
+  }
+  ## time_dim
+  if (!is.character(time_dim) | length(time_dim) > 1) {
+    stop("Parameter 'time_dim' must be a character string.")
+  }
+  if (!time_dim %in% names(dim(data))) {
+    stop("Parameter 'time_dim' is not found in 'data' dimension.")
+  }
+  ## na.action
+  if (as.character(substitute(na.action)) != "na.pass" &
+      as.character(substitute(na.action)) != "na.fail") {
+    stop("Parameter 'na.action' must be a function either na.pass or na.fail.")
+  }
+  if (as.character(substitute(na.action)) == "na.fail" && anyNA(data)) {
+    stop("Calculation fails because NA is found in parameter 'data', ",
+         "which is not accepted when ",
+         "parameter 'na.action' = na.fail.")
+  }
+  ## ncores
+  if (!is.null(ncores)) {
+    if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 |
+        length(ncores) > 1) {
+      stop("Parameter 'ncores' must be a positive integer.")
+    }
+  }
+
+  ###############################
+  # Calculate Eno
+
+  eno <- Apply(data = list(data),
+               target_dims = time_dim,
+               output_dims = NULL,
+               fun = .Eno,
+               na.action = na.action,
+               ncores = ncores)$output1
+
+  return(eno)
+}
+
+.Eno <- function(x, na.action) {
+  n <- length(sort(x))  # number of non-NA values (sort() drops NAs)
+  if (n > 1) {
+    a <- acf(x, lag.max = n - 1, plot = FALSE,
+             na.action = na.action)$acf[2:n, 1, 1]
+    s <- 0
+    for (k in 1:(n - 1)) {
+      s <- s + (((n - k) / n) * a[k])
+    }
+    eno <- min(n / (1 + (2 * s)), n)
+  } else {
+    eno <- NA
+  }
+
+  return(eno)
+}
+
diff --git a/modules/Crossval/R/tmp/NAO.R b/modules/Crossval/R/tmp/NAO.R
new file mode 100644
index 00000000..255e2a9f
--- /dev/null
+++ b/modules/Crossval/R/tmp/NAO.R
@@ -0,0 +1,574 @@
+#'Compute the North Atlantic Oscillation (NAO) Index
+#'
+#'Compute the North Atlantic Oscillation (NAO) index based on the leading EOF
+#'of the sea level pressure (SLP) anomalies over the north Atlantic region
+#'(20N-80N, 80W-40E). The PCs are obtained by projecting the forecast and
+#'observed anomalies onto the observed EOF pattern or the forecast
+#'anomalies onto the EOF pattern of the other years of the forecast.
+#'By default (ftime_avg = 2:4), NAO() computes the NAO index for 1-month
+#'lead seasonal forecasts that can be plotted with PlotBoxWhisker(). It returns
+#'cross-validated PCs of the NAO index for hindcast (exp) and observations
+#'(obs) based on the leading EOF pattern, or, if forecast (exp_cor) is provided,
+#'the NAO index for forecast and the corresponding data (exp and obs).
+#'
+#'@param exp A named numeric array of North Atlantic SLP (20N-80N, 80W-40E)
+#' hindcast anomalies from \code{Ano()} or \code{Ano_CrossValid()} with
+#' dimensions 'time_dim', 'memb_dim', 'ftime_dim', and 'space_dim' at least.
+#' If only NAO of observational data needs to be computed, this parameter can
+#' be left as NULL. The default value is NULL.
+#'@param obs A named numeric array of North Atlantic SLP (20N-80N, 80W-40E)
+#' observed anomalies from \code{Ano()} or \code{Ano_CrossValid()} with
+#' dimensions 'time_dim', 'ftime_dim', and 'space_dim' at least.
+#' If only NAO of experimental data needs to be computed, this parameter can
+#' be left as NULL. The default value is NULL.
+#'@param exp_cor A named numeric array of the North Atlantic SLP (20N-80N, 80W-40E)
+#' forecast anomalies from \code{Ano()} or \code{Ano_CrossValid()} with
+#' dimension 'time_dim' of length 1 (as in the case of an operational
+#' forecast), 'memb_dim', 'ftime_dim', and 'space_dim' at least.
+#' If only NAO of the reference period needs to be computed, this parameter can
+#' be left as NULL. The default value is NULL.
+#'@param lat A vector of the latitudes of 'exp' and 'obs'.
+#'@param lon A vector of the longitudes of 'exp' and 'obs'.
+#'@param time_dim A character string indicating the name of the time dimension
+#' of 'exp' and 'obs'. The default value is 'sdate'.
+#'@param memb_dim A character string indicating the name of the member
+#' dimension of 'exp' (and 'obs', optional). If 'obs' has memb_dim, the length
+#' must be 1. The default value is 'member'.
+#'@param space_dim A vector of two character strings. The first is the dimension
+#' name of latitude of 'exp' and 'obs' and the second is the dimension name of
+#' longitude of 'exp' and 'obs'. The default value is c('lat', 'lon').
+#'@param ftime_dim A character string indicating the name of the forecast time
+#' dimension of 'exp' and 'obs'. The default value is 'ftime'.
+#'@param ftime_avg A numeric vector of the forecast time steps to average
+#' across the target period. If average is not needed, set NULL. The default
+#' value is 2:4, i.e., from 2nd to 4th forecast time steps.
+#'@param obsproj A logical value indicating whether to compute the NAO index by
+#' projecting the forecast anomalies onto the leading EOF of observational
+#' reference (TRUE, default) or compute the NAO by first computing the leading
+#' EOF of the forecast anomalies (in cross-validation mode, i.e. leave the
+#' evaluated year out), then projecting forecast anomalies onto this EOF
+#' (FALSE). If 'exp_cor' is provided, 'obs' will be used when obsproj is TRUE
+#' and 'exp' will be used when obsproj is FALSE, and no cross-validation is
+#' applied.
+#'@param ncores An integer indicating the number of cores to use for parallel
+#' computation. The default value is NULL.
+#'
+#'@return
+#'A list which contains some of the following items depending on the data inputs:
+#'\item{exp}{
+#'  A numeric array of hindcast NAO index in verification format with the same
+#'  dimensions as 'exp' except space_dim and ftime_dim. If ftime_avg is NULL,
+#'  ftime_dim remains.
+#'}
+#'\item{obs}{
+#'  A numeric array of observation NAO index in verification format with the same
+#'  dimensions as 'obs' except space_dim and ftime_dim. If ftime_avg is NULL,
+#'  ftime_dim remains.
+#'}
+#'\item{exp_cor}{
+#'  A numeric array of forecast NAO index in verification format with the same
+#'  dimensions as 'exp_cor' except space_dim and ftime_dim. If ftime_avg is NULL,
+#'  ftime_dim remains.
+#'}
+#'
+#'@references
+#'Doblas-Reyes, F.J., Pavan, V. and Stephenson, D. (2003). The skill of
+#' multi-model seasonal forecasts of the wintertime North Atlantic
+#' Oscillation. Climate Dynamics, 21, 501-514.
+#' DOI: 10.1007/s00382-003-0350-4
+#'
+#'@examples
+#'# Make up synthetic data
+#'set.seed(1)
+#'exp <- array(rnorm(1620), dim = c(member = 2, sdate = 3, ftime = 5, lat = 6, lon = 9))
+#'set.seed(2)
+#'obs <- array(rnorm(810), dim = c(member = 1, sdate = 3, ftime = 5, lat = 6, lon = 9))
+#'lat <- seq(20, 80, length.out = 6)
+#'lon <- seq(-80, 40, length.out = 9)
+#'nao <- NAO(exp = exp, obs = obs, lat = lat, lon = lon)
+#'
+#'exp_cor <- array(rnorm(540), dim = c(member = 2, sdate = 1, ftime = 5, lat = 6, lon = 9))
+#'nao <- NAO(exp = exp, obs = obs, exp_cor = exp_cor, lat = lat, lon = lon, obsproj = TRUE)
+#'# plot the NAO index
+#' \dontrun{
+#'nao$exp <- Reorder(nao$exp, c(2, 1))
+#'nao$obs <- Reorder(nao$obs, c(2, 1))
+#'PlotBoxWhisker(nao$exp, nao$obs, "NAO index, DJF", "NAO index (PC1) TOS",
+#'               monini = 12, yearini = 1985, freq = 1, "Exp. A", "Obs. X")
+#' }
+#'
+#'@import multiApply
+#'@importFrom ClimProjDiags Subset
+#'@export
+NAO <- function(exp = NULL, obs = NULL, exp_cor = NULL, lat, lon, time_dim = 'sdate',
+                memb_dim = 'member', space_dim = c('lat', 'lon'),
+                ftime_dim = 'ftime', ftime_avg = 2:4,
+                obsproj = TRUE, ncores = NULL) {
+  # Check inputs
+  ## exp, obs, and exp_cor (1)
+  if (is.null(obs) & is.null(exp)) {
+    stop("Parameter 'exp' and 'obs' cannot both be NULL.")
+  }
+  if (!is.null(exp)) {
+    if (!is.numeric(exp)) {
+      stop("Parameter 'exp' must be a numeric array.")
+    }
+    if (is.null(dim(exp))) {
+      stop("Parameter 'exp' must have at least dimensions ",
+           "time_dim, memb_dim, space_dim, and ftime_dim.")
+    }
+    if (any(is.null(names(dim(exp)))) | any(nchar(names(dim(exp))) == 0)) {
+      stop("Parameter 'exp' must have dimension names.")
+    }
+  }
+  if (!is.null(obs)) {
+    if (!is.numeric(obs)) {
+      stop("Parameter 'obs' must be a numeric array.")
+    }
+    if (is.null(dim(obs))) {
+      stop("Parameter 'obs' must have at least dimensions ",
+           "time_dim, space_dim, and ftime_dim.")
+    }
+    if (any(is.null(names(dim(obs)))) | any(nchar(names(dim(obs))) == 0)) {
+      stop("Parameter 'obs' must have dimension names.")
+    }
+  }
+  if (!is.null(exp_cor)) {
+    if (!is.numeric(exp_cor)) {
+      stop("Parameter 'exp_cor' must be a numeric array.")
+    }
+    if (is.null(dim(exp_cor))) {
+      stop(paste0("Parameter 'exp_cor' must have at least dimensions ",
+                  "time_dim, memb_dim, space_dim, and ftime_dim."))
+    }
+    if (any(is.null(names(dim(exp_cor)))) | any(nchar(names(dim(exp_cor))) == 0)) {
+      stop("Parameter 'exp_cor' must have dimension names.")
+    }
+    if (is.null(exp) || is.null(obs)) {
+      stop("Parameters 'exp' and 'obs' are required when 'exp_cor' is provided.")
+    }
+  }
+  ## time_dim
+  if (!is.character(time_dim) | length(time_dim) > 1) {
+    stop("Parameter 'time_dim' must be a character string.")
+  }
+  if (!is.null(exp)) {
+    if (!time_dim %in% names(dim(exp))) {
+      stop("Parameter 'time_dim' is not found in 'exp' dimension.")
+    }
+  }
+  if (!is.null(obs)) {
+    if (!time_dim %in% names(dim(obs))) {
+      stop("Parameter 'time_dim' is not found in 'obs' dimension.")
+    }
+  }
+  if (!is.null(exp_cor)) {
+    if (!time_dim %in% names(dim(exp_cor))) {
+      stop("Parameter 'time_dim' is not found in 'exp_cor' dimension.")
+    }
+    if (dim(exp_cor)[time_dim] > 1) {
+      stop("Parameter 'exp_cor' is expected to have length 1 in ",
+           time_dim, " dimension.")
+    }
+  }
+
+  ## memb_dim
+  if (!is.character(memb_dim) | length(memb_dim) > 1) {
+    stop("Parameter 'memb_dim' must be a character string.")
+  }
+  if (!is.null(exp)) {
+    if (!memb_dim %in% names(dim(exp))) {
+      stop("Parameter 'memb_dim' is not found in 'exp' dimension.")
+    }
+  }
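+  ## 'obs' may carry a degenerate member dimension; it is dropped here for the
+  ## computation and, when 'add_member_back' is TRUE, re-inserted in the output.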
add_member_back <- FALSE + if (!is.null(obs)) { + if (memb_dim %in% names(dim(obs))) { + if (dim(obs)[memb_dim] != 1) { + stop("The length of parameter 'memb_dim' in 'obs' must be 1.") + } else { + add_member_back <- TRUE + obs <- ClimProjDiags::Subset(obs, memb_dim, 1, drop = 'selected') + } + } + } + if (!is.null(exp_cor)) { + if (!memb_dim %in% names(dim(exp_cor))) { + stop("Parameter 'memb_dim' is not found in 'exp_cor' dimension.") + } + } + ## space_dim + if (!is.character(space_dim) | length(space_dim) != 2) { + stop("Parameter 'space_dim' must be a character vector of 2.") + } + if (!is.null(exp)) { + if (!all(space_dim %in% names(dim(exp)))) { + stop("Parameter 'space_dim' is not found in 'exp' or 'obs' dimension.") + } + } + if (!is.null(obs)) { + if (!all(space_dim %in% names(dim(obs)))) { + stop("Parameter 'space_dim' is not found in 'exp' or 'obs' dimension.") + } + } + if (!is.null(exp_cor)) { + if (any(!space_dim %in% names(dim(exp_cor)))) { + stop("Parameter 'space_dim' is not found in 'exp_cor' dimensions.") + } + } + ## ftime_dim + if (!is.character(ftime_dim) | length(ftime_dim) > 1) { + stop("Parameter 'ftime_dim' must be a character string.") + } + if (!is.null(exp)) { + if (!ftime_dim %in% names(dim(exp))) { + stop("Parameter 'ftime_dim' is not found in 'exp' or 'obs' dimension.") + } + } + if (!is.null(obs)) { + if (!ftime_dim %in% names(dim(obs))) { + stop("Parameter 'ftime_dim' is not found in 'exp' or 'obs' dimension.") + } + } + if (!is.null(exp_cor)) { + if (!ftime_dim %in% names(dim(exp_cor))) { + stop("Parameter 'ftime_dim' is not found in 'exp_cor' dimensions.") + } + } + ## exp and obs (2) + #TODO: Add checks for exp_cor + if (!is.null(exp) & !is.null(obs)) { + name_exp <- sort(names(dim(exp))) + name_obs <- sort(names(dim(obs))) + name_exp <- name_exp[-which(name_exp == memb_dim)] + throw_error <- FALSE + if (length(name_exp) != length(name_obs)) { + throw_error <- TRUE + } else if (any(name_exp != name_obs)) { + throw_error <- TRUE + } else if (!all(dim(exp)[name_exp] == dim(obs)[name_obs])) { + throw_error <- TRUE + } + if (throw_error) { + stop("Parameter 'exp' and 'obs' must have the same names and lengths ", + "of all the dimensions except 'memb_dim'.") + } + } + ## ftime_avg + if (!is.null(ftime_avg)) { + if (!is.vector(ftime_avg) | !is.numeric(ftime_avg)) { + stop("Parameter 'ftime_avg' must be an integer vector.") + } + if (!is.null(exp)) { + if (max(ftime_avg) > dim(exp)[ftime_dim] | min(ftime_avg) < 1) { + stop("Parameter 'ftime_avg' must be within the range of ftime_dim length.") + } + } + if (!is.null(obs)) { + if (max(ftime_avg) > dim(obs)[ftime_dim] | min(ftime_avg) < 1) { + stop("Parameter 'ftime_avg' must be within the range of ftime_dim length.") + } + } + if (!is.null(exp_cor)) { + if (max(ftime_avg) > dim(exp_cor)[ftime_dim] | min(ftime_avg) < 1) { + stop("Parameter 'ftime_avg' must be within the range of ftime_dim length.") + } + } + } + ## sdate >= 2 + if (!is.null(exp)) { + if (dim(exp)[time_dim] < 2) { + stop("The length of time_dim must be at least 2.") + } + } else { + if (dim(obs)[time_dim] < 2) { + stop("The length of time_dim must be at least 2.") + } + } + ## lat and lon + if (!is.null(exp)) { + if (!is.numeric(lat) | length(lat) != dim(exp)[space_dim[1]]) { + stop("Parameter 'lat' must be a numeric vector with the same ", + "length as the latitude dimension of 'exp' and 'obs'.") + } + if (!is.numeric(lon) | length(lon) != dim(exp)[space_dim[2]]) { + stop("Parameter 'lon' must be a numeric vector with the same ", + "length as 
the longitude dimension of 'exp' and 'obs'.") + } + } + if (!is.null(obs)) { + if (!is.numeric(lat) | length(lat) != dim(obs)[space_dim[1]]) { + stop("Parameter 'lat' must be a numeric vector with the same ", + "length as the latitude dimension of 'exp' and 'obs'.") + } + if (!is.numeric(lon) | length(lon) != dim(obs)[space_dim[2]]) { + stop("Parameter 'lon' must be a numeric vector with the same ", + "length as the longitude dimension of 'exp' and 'obs'.") + } + } + if (!is.null(exp_cor)) { + if (!is.numeric(lat) | length(lat) != dim(exp_cor)[space_dim[1]]) { + stop("Parameter 'lat' must be a numeric vector with the same ", + "length as the latitude dimension of 'exp_cor'.") + } + if (!is.numeric(lon) | length(lon) != dim(exp_cor)[space_dim[2]]) { + stop("Parameter 'lon' must be a numeric vector with the same ", + "length as the longitude dimension of 'exp_cor'.") + } + } + stop_needed <- FALSE + if (max(lat) > 80 | min(lat) < 20) { + stop_needed <- TRUE + } + #NOTE: different from s2dverification + # lon is not used in the calculation actually. EOF only uses lat to do the + # weight. So we just need to ensure the data is in this region, regardless + # the order. + if (any(lon < 0)) { #[-180, 180] + if (!(min(lon) > -90 & min(lon) < -70 & max(lon) < 50 & max(lon) > 30)) { + stop_needed <- TRUE + } + } else { #[0, 360] + if (any(lon >= 50 & lon <= 270)) { + stop_needed <- TRUE + } else { + lon_E <- lon[which(lon < 50)] + lon_W <- lon[-which(lon < 50)] + if (max(lon_E) < 30 | min(lon_W) > 290) { + stop_needed <- TRUE + } + } + } + if (stop_needed) { + stop("The typical domain used to compute the NAO is 20N-80N, ", + "80W-40E. 'lat' or 'lon' is out of range.") + } + ## obsproj + if (!is.logical(obsproj) | length(obsproj) > 1) { + stop("Parameter 'obsproj' must be either TRUE or FALSE.") + } + if (obsproj) { + if (is.null(obs)) { + stop("Parameter 'obsproj' set to TRUE but no 'obs' provided.") + } + if (is.null(exp) & is.null(exp_cor)) { + .warning("parameter 'obsproj' set to TRUE but no 'exp' nor 'exp_cor' provided.") + } + } + ## ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores == 0 | + length(ncores) > 1) { + stop("Parameter 'ncores' must be a positive integer.") + } + } + + # Average ftime + if (!is.null(ftime_avg)) { + if (!is.null(exp)) { + exp_sub <- ClimProjDiags::Subset(exp, ftime_dim, ftime_avg, drop = FALSE) + exp <- MeanDims(exp_sub, ftime_dim, na.rm = TRUE) + ## Cross-validated PCs. Fabian. This should be extended to + ## nmod and nlt by simple loops. 
Virginie + } + if (!is.null(obs)) { + obs_sub <- ClimProjDiags::Subset(obs, ftime_dim, ftime_avg, drop = FALSE) + obs <- MeanDims(obs_sub, ftime_dim, na.rm = TRUE) + } + if (!is.null(exp_cor)) { + exp_cor_sub <- ClimProjDiags::Subset(exp_cor, ftime_dim, ftime_avg, drop = FALSE) + exp_cor <- MeanDims(exp_cor_sub, ftime_dim, na.rm = TRUE) + } + } + + # wght + wght <- array(sqrt(cos(lat * pi / 180)), dim = c(length(lat), length(lon))) + if (is.null(exp_cor)) { + if (!is.null(exp) & !is.null(obs)) { + res <- Apply(list(exp, obs), + target_dims = list(exp = c(memb_dim, time_dim, space_dim), + obs = c(time_dim, space_dim)), + fun = .NAO, + lat = lat, wght = wght, + obsproj = obsproj, add_member_back = add_member_back, + ncores = ncores) + } else if (!is.null(exp)) { + res <- Apply(list(exp = exp), + target_dims = list(exp = c(memb_dim, time_dim, space_dim)), + fun = .NAO, + lat = lat, wght = wght, obs = NULL, + obsproj = obsproj, add_member_back = FALSE, + ncores = ncores) + } else if (!is.null(obs)) { + if (add_member_back) { + output_dims <- list(obs = c(time_dim, memb_dim)) + } else { + output_dims <- list(obs = time_dim) + } + res <- Apply(list(obs = obs), + target_dims = list(obs = c(time_dim, space_dim)), + output_dims = output_dims, + fun = .NAO, + lat = lat, wght = wght, exp = NULL, + obsproj = obsproj, add_member_back = add_member_back, + ncores = ncores) + } + } else { # exp_cor provided + res <- Apply(list(exp = exp, obs = obs, exp_cor = exp_cor), + target_dims = list(exp = c(memb_dim, time_dim, space_dim), + obs = c(time_dim, space_dim), + exp_cor = c(memb_dim, time_dim, space_dim)), + fun = .NAO, + lat = lat, wght = wght, + obsproj = obsproj, add_member_back = add_member_back, + ncores = ncores) + } + + return(res) +} + +.NAO <- function(exp = NULL, obs = NULL, exp_cor = NULL, lat, wght, obsproj = TRUE, + add_member_back = FALSE) { + # exp: [memb_exp, sdate, lat, lon] + # obs: [sdate, lat, lon] + # exp_cor: [memb, sdate = 1, lat, lon] + # wght: [lat, lon] + + if (!is.null(exp)) { + ntime <- dim(exp)[2] + nlat <- dim(exp)[3] + nlon <- dim(exp)[4] + nmemb_exp <- dim(exp)[1] + } else { + ntime <- dim(obs)[1] + nlat <- dim(obs)[2] + nlon <- dim(obs)[3] + } + if (!is.null(exp_cor)) { + ntime_exp_cor <- dim(exp_cor)[2] # should be 1 + nmemb_exp_cor <- dim(exp_cor)[1] + } + + if (!is.null(obs)) nao_obs <- array(NA, dim = ntime) + if (!is.null(exp)) nao_exp <- array(NA, dim = c(ntime, nmemb_exp)) + if (!is.null(exp_cor)) { + nao_exp_cor <- array(NA, dim = c(ntime_exp_cor, nmemb_exp_cor)) + #NOTE: The dimensions are flipped to fill in data correctly. Need to flip it back later. + } + + if (is.null(exp_cor)) { + + for (tt in 1:ntime) { # cross-validation + + if (!is.null(obs)) { + ## Calculate observation EOF. Excluding one forecast start year. + obs_sub <- obs[(1:ntime)[-tt], , , drop = FALSE] + EOF_obs <- .EOF(obs_sub, neofs = 1, wght = wght)$EOFs # [mode = 1, lat, lon] + ## Correct polarity of pattern + # EOF_obs: [mode = 1, lat, lon] + if (0 < mean(EOF_obs[1, which.min(abs(lat - 65)), ], na.rm = T)) { + EOF_obs <- EOF_obs * (-1) + } + ## Project observed anomalies. + PF <- .ProjectField(obs, eof_mode = EOF_obs[1, , ], wght = wght) # [sdate] + ## Keep PCs of excluded forecast start year. Fabian. 
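+      ## .ProjectField() returns PCs for all start dates; keeping only element
+      ## tt makes the index strictly out-of-sample, since year tt was excluded
+      ## when computing EOF_obs above.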
+ nao_obs[tt] <- PF[tt] + } + + if (!is.null(exp)) { + if (!obsproj) { + exp_sub <- exp[, (1:ntime)[-tt], , , drop = FALSE] + # Combine 'memb' and 'sdate' to calculate EOF + dim(exp_sub) <- c(nmemb_exp * (ntime - 1), nlat, nlon) + EOF_exp <- .EOF(exp_sub, neofs = 1, wght = wght)$EOFs # [mode = 1, lat, lon] + + ## Correct polarity of pattern + ##NOTE: different from s2dverification, which doesn't use mean(). +# if (0 < EOF_exp[1, which.min(abs(lat - 65)), ]) { + if (0 < mean(EOF_exp[1, which.min(abs(lat - 65)), ], na.rm = T)) { + EOF_exp <- EOF_exp * (-1) + } + + ### Lines below could be simplified further by computing + ### ProjectField() only on the year of interest... (though this is + ### not vital). Lauriane + for (imemb in 1:nmemb_exp) { + PF <- .ProjectField(exp[imemb, , , ], eof_mode = EOF_exp[1, , ], wght = wght) # [sdate, memb] + nao_exp[tt, imemb] <- PF[tt] + } + } else { + ## Project forecast anomalies on obs EOF + for (imemb in 1:nmemb_exp) { + PF <- .ProjectField(exp[imemb, , , ], eof_mode = EOF_obs[1, , ], wght = wght) # [sdate] + nao_exp[tt, imemb] <- PF[tt] + } + } + } + + } # for loop sdate + + } else { # exp_cor provided + + ## Calculate observation EOF. Without cross-validation + EOF_obs <- .EOF(obs, neofs = 1, wght = wght)$EOFs # [mode = 1, lat, lon] + ## Correct polarity of pattern + # EOF_obs: [mode, lat, lon] + if (0 < mean(EOF_obs[1, which.min(abs(lat - 65)), ], na.rm = T)) { + EOF_obs <- EOF_obs * (-1) + } + ## Project observed anomalies + PF <- .ProjectField(obs, eof_mode = EOF_obs, wght = wght) # [mode = 1, sdate] + nao_obs[] <- PF[1, ] + + if (!obsproj) { + # Calculate EOF_exp + tmp <- array(exp, dim = c(nmemb_exp * ntime, nlat, nlon)) + EOF_exp <- .EOF(tmp, neofs = 1, wght = wght)$EOFs # [mode = 1, lat, lon] + ## Correct polarity of pattern + if (0 < mean(EOF_exp[1, which.min(abs(lat - 65)), ], na.rm = T)) { + EOF_exp <- EOF_exp * (-1) + } + eof_mode_input <- EOF_exp[1, , ] + } else { + eof_mode_input <- EOF_obs[1, , ] + } + + # Calculate NAO_exp + for (imemb in 1:dim(exp)[1]) { + exp_sub <- ClimProjDiags::Subset(exp, along = 1, indices = imemb, + drop = 'selected') + PF <- .ProjectField(exp_sub, eof_mode = eof_mode_input, wght = wght) # [sdate] + nao_exp[ , imemb] <- PF + } + + # Calculate NAO_exp_cor + for (imemb in 1:dim(exp_cor)[1]) { + exp_sub <- ClimProjDiags::Subset(exp_cor, along = 1, indices = imemb, + drop = 'selected') + PF <- .ProjectField(exp_sub, eof_mode = eof_mode_input, wght = wght) # [sdate] + nao_exp_cor[, imemb] <- PF + } + + } + # add_member_back + if (add_member_back) { + memb_dim_name <- ifelse(!is.null(names(dim(exp))[1]), names(dim(exp))[1], 'member') + nao_obs <- InsertDim(nao_obs, 2, 1, name = memb_dim_name) + } + + # Return results + if (is.null(exp_cor)) { + res <- NULL + if (!is.null(exp)) { + res <- c(res, list(exp = nao_exp)) + } + if (!is.null(obs)) { + res <- c(res, list(obs = nao_obs)) + } + return(res) + + } else { + return(list(exp = nao_exp, obs = nao_obs, exp_cor = nao_exp_cor)) + } +} + diff --git a/modules/Crossval/R/tmp/ProjectField.R b/modules/Crossval/R/tmp/ProjectField.R new file mode 100644 index 00000000..efa35dc3 --- /dev/null +++ b/modules/Crossval/R/tmp/ProjectField.R @@ -0,0 +1,272 @@ +#'Project anomalies onto modes of variability +#' +#'Project anomalies onto modes of variability to get the temporal evolution of +#'the EOF mode selected. It returns principal components (PCs) by area-weighted +#'projection onto EOF pattern (from \code{EOF()}) or REOF pattern (from +#'\code{REOF()} or \code{EuroAtlanticTC()}). 
The calculation removes NA and +#'returns NA if the whole spatial pattern is NA. +#' +#'@param ano A numerical array of anomalies with named dimensions. The +#' dimensions must have at least 'time_dim' and 'space_dim'. It can be +#' generated by Ano(). +#'@param eof A list that contains at least 'EOFs' or 'REOFs' and 'wght', which +#' are both arrays. 'EOFs' or 'REOFs' must have dimensions 'mode' and +#' 'space_dim' at least. 'wght' has dimensions space_dim. It can be generated +#' by EOF() or REOF(). +#'@param time_dim A character string indicating the name of the time dimension +#' of 'ano'. The default value is 'sdate'. +#'@param space_dim A vector of two character strings. The first is the dimension +#' name of latitude of 'ano' and the second is the dimension name of longitude +#' of 'ano'. The default value is c('lat', 'lon'). +#'@param mode An integer of the variability mode number in the EOF to be +#' projected on. The default value is NULL, which means all the modes of 'eof' +#' is calculated. +#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. +#' +#'@return A numerical array of the principal components in the verification +#' format. The dimensions are the same as 'ano' except 'space_dim'. +#' +#'@seealso EOF, NAO, PlotBoxWhisker +#'@examples +#'\dontshow{ +#'startDates <- c('19851101', '19901101', '19951101', '20001101', '20051101') +#'sampleData <- s2dv:::.LoadSampleData('tos', c('experiment'), +#' c('observation'), startDates, +#' leadtimemin = 1, +#' leadtimemax = 4, +#' output = 'lonlat', +#' latmin = 27, latmax = 48, +#' lonmin = -12, lonmax = 40) +#'} +#'ano <- Ano_CrossValid(sampleData$mod, sampleData$obs) +#'eof_exp <- EOF(ano$exp, sampleData$lat, sampleData$lon) +#'eof_obs <- EOF(ano$obs, sampleData$lat, sampleData$lon) +#'mode1_exp <- ProjectField(ano$exp, eof_exp, mode = 1) +#'mode1_obs <- ProjectField(ano$obs, eof_obs, mode = 1) +#' +#'\dontrun{ +#' # Plot the forecast and the observation of the first mode for the last year +#' # of forecast +#' sdate_dim_length <- dim(mode1_obs)['sdate'] +#' plot(mode1_obs[sdate_dim_length, 1, 1, ], type = "l", ylim = c(-1, 1), +#' lwd = 2) +#' for (i in 1:dim(mode1_exp)['member']) { +#' par(new = TRUE) +#' plot(mode1_exp[sdate_dim_length, 1, i, ], type = "l", col = rainbow(10)[i], +#' ylim = c(-15000, 15000)) +#' } +#'} +#' +#'@import multiApply +#'@export +ProjectField <- function(ano, eof, time_dim = 'sdate', space_dim = c('lat', 'lon'), + mode = NULL, ncores = NULL) { + + # Check inputs + ## ano (1) + if (is.null(ano)) { + stop("Parameter 'ano' cannot be NULL.") + } + if (!is.numeric(ano)) { + stop("Parameter 'ano' must be a numeric array.") + } + if (any(is.null(names(dim(ano)))) | any(nchar(names(dim(ano))) == 0)) { + stop("Parameter 'ano' must have dimension names.") + } + ## eof (1) + if (is.null(eof)) { + stop("Parameter 'eof' cannot be NULL.") + } + if (!is.list(eof)) { + stop("Parameter 'eof' must be a list generated by EOF() or REOF().") + } + if ('EOFs' %in% names(eof)) { + EOFs <- "EOFs" + } else if ('REOFs' %in% names(eof)) { + EOFs <- "REOFs" + } else if ('patterns' %in% names(eof)) { + EOFs <- "patterns" + } else { + stop("Parameter 'eof' must be a list that contains 'EOFs', 'REOFs', ", + "or 'patterns'. It can be generated by EOF(), REOF(), or EuroAtlanticTC().") + } + if (!'wght' %in% names(eof)) { + stop("Parameter 'eof' must be a list that contains 'wght'. 
", + "It can be generated by EOF() or REOF().") + } + if (!is.numeric(eof[[EOFs]]) || !is.array(eof[[EOFs]])) { + stop("The component 'EOFs' or 'REOFs' of parameter 'eof' must be a numeric array.") + } + if (!is.numeric(eof$wght) || !is.array(eof$wght)) { + stop("The component 'wght' of parameter 'eof' must be a numeric array.") + } + ## time_dim + if (!is.character(time_dim) | length(time_dim) > 1) { + stop("Parameter 'time_dim' must be a character string.") + } + if (!time_dim %in% names(dim(ano))) { + stop("Parameter 'time_dim' is not found in 'ano' dimension.") + } + ## space_dim + if (!is.character(space_dim) | length(space_dim) != 2) { + stop("Parameter 'space_dim' must be a character vector of 2.") + } + if (!all(space_dim %in% names(dim(ano)))) { + stop("Parameter 'space_dim' is not found in 'ano' dimension.") + } + ## ano (2) + if (!all(space_dim %in% names(dim(ano))) | !time_dim %in% names(dim(ano))) { + stop("Parameter 'ano' must be an array with dimensions named as ", + "parameter 'space_dim' and 'time_dim'.") + } + ## eof (2) + if (!all(space_dim %in% names(dim(eof[[EOFs]]))) | + !'mode' %in% names(dim(eof[[EOFs]]))) { + stop("The component 'EOFs' or 'REOFs' of parameter 'eof' must be an array ", + "with dimensions named as parameter 'space_dim' and 'mode'.") + } + if (length(dim(eof$wght)) != 2 | !all(names(dim(eof$wght)) %in% space_dim)) { + stop("The component 'wght' of parameter 'eof' must be an array ", + "with dimensions named as parameter 'space_dim'.") + } + ## mode + if (!is.null(mode)) { + if (!is.numeric(mode) | mode %% 1 != 0 | mode < 0 | length(mode) > 1) { + stop("Parameter 'mode' must be NULL or a positive integer.") + } + if (mode > dim(eof[[EOFs]])['mode']) { + stop("Parameter 'mode' is greater than the number of available ", + "modes in 'eof'.") + } + } + ## ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | + length(ncores) > 1) { + stop("Parameter 'ncores' must be a positive integer.") + } + } + +#------------------------------------------------------- + + # Keep the chosen mode + if (!is.null(mode)) { + eof_mode <- ClimProjDiags::Subset(eof[[EOFs]], 'mode', mode, drop = 'selected') + } else { + eof_mode <- eof[[EOFs]] + } + + if ('mode' %in% names(dim(eof_mode))) { + dimnames_without_mode <- names(dim(eof_mode))[-which(names(dim(eof_mode)) == 'mode')] + } else { + dimnames_without_mode <- names(dim(eof_mode)) + } + + if (all(dimnames_without_mode %in% space_dim)) { # eof_mode: [lat, lon] or [mode, lat, lon] + if ('mode' %in% names(dim(eof_mode))) { + eof_mode_target <- c('mode', space_dim) + output_dims <- c('mode', time_dim) + } else { + eof_mode_target <- space_dim + output_dims <- time_dim + } + res <- Apply(list(ano, eof_mode), + target_dims = list(c(time_dim, space_dim), + eof_mode_target), + output_dims = output_dims, + wght = eof$wght, + fun = .ProjectField, + ncores = ncores)$output1 + + } else { + + if (!all(dimnames_without_mode %in% names(dim(ano)))) { + stop("The array 'EOF' in parameter 'eof' has dimension not in parameter ", + "'ano'. Check if 'ano' and 'eof' are compatible.") + } + + common_dim_ano <- dim(ano)[which(names(dim(ano)) %in% dimnames_without_mode)] + if (any(common_dim_ano[match(dimnames_without_mode, names(common_dim_ano))] != + dim(eof_mode)[dimnames_without_mode])) { + stop("Found paramter 'ano' and 'EOF' in parameter 'eof' have common dimensions ", + "with different length. Check if 'ano' and 'eof' are compatible.") + } + + # Enlarge eof/ano is needed. 
The margin_dims of Apply() must be consistent + # between ano and eof. + additional_dims <- dim(ano)[-which(names(dim(ano)) %in% names(dim(eof_mode)))] + additional_dims <- additional_dims[-which(names(additional_dims) == time_dim)] + if (length(additional_dims) != 0) { + for (i in seq_along(additional_dims)) { + eof_mode <- InsertDim(eof_mode, posdim = (length(dim(eof_mode)) + 1), + lendim = additional_dims[i], name = names(additional_dims)[i]) + } + } + if ('mode' %in% names(dim(eof_mode))) { + eof_mode_target <- c('mode', space_dim) + output_dims <- c('mode', time_dim) + } else { + eof_mode_target <- space_dim + output_dims <- time_dim + } + res <- Apply(list(ano, eof_mode), + target_dims = list(c(time_dim, space_dim), + eof_mode_target), + output_dims = output_dims, + wght = eof$wght, + fun = .ProjectField, + ncores = ncores)$output1 + } + + return(res) +} + + +.ProjectField <- function(ano, eof_mode, wght) { + # ano: [sdate, lat, lon] + # eof_mode: [lat, lon] or [mode, lat, lon] + # wght: [lat, lon] + + ntime <- dim(ano)[1] + if (length(dim(eof_mode)) == 2) { # mode != NULL + # Initialization of pc.ver. + pc.ver <- array(NA, dim = ntime) #[sdate] + + # Weight + e.1 <- eof_mode * wght + ano <- ano * InsertDim(wght, 1, ntime) + #ano <- aaply(ano, 1, '*', wght) # much heavier + + na <- rowMeans(ano, na.rm = TRUE) # if [lat, lon] all NA, it's NA + #na <- apply(ano, 1, mean, na.rm = TRUE) # much heavier + tmp <- ano * InsertDim(e.1, 1, ntime) # [sdate, lat, lon] + rm(ano) + #pc.ver <- apply(tmp, 1, sum, na.rm = TRUE) # much heavier + pc.ver <- rowSums(tmp, na.rm = TRUE) + pc.ver[which(is.na(na))] <- NA + + } else { # mode = NULL + # Weight + e.1 <- eof_mode * InsertDim(wght, 1, dim(eof_mode)[1]) + dim(e.1) <- c(dim(eof_mode)[1], prod(dim(eof_mode)[2:3])) # [mode, lat*lon] + ano <- ano * InsertDim(wght, 1, ntime) + dim(ano) <- c(ntime, prod(dim(ano)[2:3])) # [sdate, lat*lon] + + na <- rowMeans(ano, na.rm = TRUE) # if [lat, lon] all NA, it's NA + na <- aperm(array(na, dim = c(ntime, dim(e.1)[1])), c(2, 1)) + + # Matrix multiplication e.1 [mode, lat*lon] by ano [lat*lon, sdate] + # Result: [mode, sdate] + pc.ver <- e.1 %*% t(ano) + pc.ver[which(is.na(na))] <- NA + +# # Change back dimensions to feet original input +# dim(projection) <- c(moredims, mode = unname(neofs)) +# return(projection) + } + + return(pc.ver) +} + + diff --git a/modules/Crossval/R/tmp/RPS.R b/modules/Crossval/R/tmp/RPS.R new file mode 100644 index 00000000..0ed599ac --- /dev/null +++ b/modules/Crossval/R/tmp/RPS.R @@ -0,0 +1,408 @@ +#'Compute the Ranked Probability Score +#' +#'The Ranked Probability Score (RPS; Wilks, 2011) is defined as the sum of the +#'squared differences between the cumulative forecast probabilities (computed +#'from the ensemble members) and the observations (defined as 0% if the category +#'did not happen and 100% if it happened). It can be used to evaluate the skill +#'of multi-categorical probabilistic forecasts. The RPS ranges between 0 +#'(perfect forecast) and n-1 (worst possible forecast), where n is the number of +#'categories. In the case of a forecast divided into two categories (the lowest +#'number of categories that a probabilistic forecast can have), the RPS +#'corresponds to the Brier Score (BS; Wilks, 2011), therefore ranging between 0 +#'and 1.\cr +#'The function first calculates the probabilities for forecasts and observations, +#'then use them to calculate RPS. Or, the probabilities of exp and obs can be +#'provided directly to compute the score. 
If there is more than one dataset, RPS +#'will be computed for each pair of exp and obs data. The fraction of acceptable +#'NAs can be adjusted. +#' +#'@param exp A named numerical array of either the forecasts with at least time +#' and member dimensions, or the probabilities with at least time and category +#' dimensions. The probabilities can be generated by \code{s2dv::GetProbs}. +#'@param obs A named numerical array of either the observation with at least +#' time dimension, or the probabilities with at least time and category +#' dimensions. The probabilities can be generated by \code{s2dv::GetProbs}. The +#' dimensions must be the same as 'exp' except 'memb_dim' and 'dat_dim'. +#'@param time_dim A character string indicating the name of the time dimension. +#' The default value is 'sdate'. +#'@param memb_dim A character string indicating the name of the member dimension +#' to compute the probabilities of the forecast. The default value is 'member'. +#' If the data are probabilities, set memb_dim as NULL. +#'@param cat_dim A character string indicating the name of the category +#' dimension that is needed when the exp and obs are probabilities. The default +#' value is NULL, which means that the data are not probabilities. +#'@param dat_dim A character string indicating the name of dataset dimension. +#' The length of this dimension can be different between 'exp' and 'obs'. +#' The default value is NULL. +#'@param prob_thresholds A numeric vector of the relative thresholds (from 0 to +#' 1) between the categories. The default value is c(1/3, 2/3), which +#' corresponds to tercile equiprobable categories. +#'@param indices_for_clim A vector of the indices to be taken along 'time_dim' +#' for computing the thresholds between the probabilistic categories. If NULL, +#' the whole period is used. The default value is NULL. +#'@param Fair A logical indicating whether to compute the FairRPS (the +#' potential RPS that the forecast would have with an infinite ensemble size). +#' The default value is FALSE. +#'@param nmemb A numeric value indicating the number of members used to compute the probabilities. This parameter is necessary when calculating FairRPS from probabilities. Default is NULL. +#'@param weights A named numerical array of the weights for 'exp' probability +#' calculation. If 'dat_dim' is NULL, the dimensions should include 'memb_dim' +#' and 'time_dim'. Else, the dimension should also include 'dat_dim'. The +#' default value is NULL. The ensemble should have at least 70 members or span +#' at least 10 time steps and have more than 45 members if consistency between +#' the weighted and unweighted methodologies is desired. +#'@param cross.val A logical indicating whether to compute the thresholds +#' between probabilistic categories in cross-validation. The default value is +#' FALSE. +#'@param return_mean A logical indicating whether to return the temporal mean +#' of the RPS or not. If TRUE, the temporal mean is calculated along time_dim, +#' if FALSE the time dimension is not aggregated. The default is TRUE. +#'@param na.rm A logical or numeric value between 0 and 1. If it is numeric, it +#' means the lower limit for the fraction of the non-NA values. 1 is equal to +#' FALSE (no NA is acceptable), 0 is equal to TRUE (all NAs are acceptable). +# The function returns NA if the fraction of non-NA values in the data is less +#' than na.rm. Otherwise, RPS will be calculated. The default value is FALSE. 
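+#'@details When \code{Fair = TRUE}, the ensemble-size adjustment taken from
+#'  SpecsVerification::EnsRps is applied at each time step,
+#'  \deqn{RPS_{fair} = RPS - \sum_{k=1}^{K} \frac{F_k (1 - F_k)}{R - 1},}
+#'  where \eqn{F_k} is the cumulative forecast probability of category k and R
+#'  is the ensemble size ('nmemb' when probabilities are provided directly).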
+#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. +#' +#'@return +#'A numerical array of RPS with dimensions c(nexp, nobs, the rest dimensions of +#''exp' except 'time_dim' and 'memb_dim' dimensions). nexp is the number of +#'experiment (i.e., dat_dim in exp), and nobs is the number of observation +#'(i.e., dat_dim in obs). If dat_dim is NULL, nexp and nobs are omitted. +#' +#'@references +#'Wilks, 2011; https://doi.org/10.1016/B978-0-12-385022-5.00008-7 +#' +#'@examples +#'# Use synthetic data +#'exp <- array(rnorm(1000), dim = c(lat = 3, lon = 2, member = 10, sdate = 50)) +#'obs <- array(rnorm(1000), dim = c(lat = 3, lon = 2, sdate = 50)) +#'res <- RPS(exp = exp, obs = obs) +#'# Use probabilities as inputs +#'exp_probs <- GetProbs(exp, time_dim = 'sdate', memb_dim = 'member') +#'obs_probs <- GetProbs(obs, time_dim = 'sdate', memb_dim = NULL) +#'res2 <- RPS(exp = exp_probs, obs = obs_probs, memb_dim = NULL, cat_dim = 'bin') +#' +#' +#'@import multiApply +#'@importFrom easyVerification convert2prob +#'@export +RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NULL, + dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, + Fair = FALSE, nmemb = NULL, weights = NULL, + cross.val = FALSE, return_mean = TRUE, + na.rm = FALSE, ncores = NULL) { + + # Check inputs + ## exp and obs (1) + if (!is.array(exp) | !is.numeric(exp)) + stop('Parameter "exp" must be a numeric array.') + if (!is.array(obs) | !is.numeric(obs)) + stop('Parameter "obs" must be a numeric array.') + if (any(is.null(names(dim(exp)))) | any(nchar(names(dim(exp))) == 0) | + any(is.null(names(dim(obs)))) | any(nchar(names(dim(obs))) == 0)) { + stop("Parameter 'exp' and 'obs' must have dimension names.") + } + ## time_dim + if (!is.character(time_dim) | length(time_dim) != 1) + stop('Parameter "time_dim" must be a character string.') + if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) { + stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.") + } + ## memb_dim & cat_dim + if (is.null(memb_dim) + is.null(cat_dim) != 1) { + stop("Only one of the two parameters 'memb_dim' and 'cat_dim' can have value.") + } + ## memb_dim + if (!is.null(memb_dim)) { + if (!is.character(memb_dim) | length(memb_dim) > 1) { + stop("Parameter 'memb_dim' must be a character string.") + } + if (!memb_dim %in% names(dim(exp))) { + stop("Parameter 'memb_dim' is not found in 'exp' dimension.") + } + } + ## cat_dim + if (!is.null(cat_dim)) { + if (!is.character(cat_dim) | length(cat_dim) > 1) { + stop("Parameter 'cat_dim' must be a character string.") + } + if (!cat_dim %in% names(dim(exp)) | !cat_dim %in% names(dim(obs))) { + stop("Parameter 'cat_dim' is not found in 'exp' or 'obs' dimension.") + } + } + ## dat_dim + if (!is.null(dat_dim)) { + if (!is.character(dat_dim) | length(dat_dim) > 1) { + stop("Parameter 'dat_dim' must be a character string.") + } + if (!dat_dim %in% names(dim(exp)) | !dat_dim %in% names(dim(obs))) { + stop("Parameter 'dat_dim' is not found in 'exp' or 'obs' dimension.", + " Set it as NULL if there is no dataset dimension.") + } + } + ## exp and obs (2) + name_exp <- sort(names(dim(exp))) + name_obs <- sort(names(dim(obs))) + if (!is.null(memb_dim)) { + name_exp <- name_exp[-which(name_exp == memb_dim)] + if (memb_dim %in% name_obs) { + name_obs <- name_obs[-which(name_obs == memb_dim)] + } + } + if (!is.null(dat_dim)) { + name_exp <- name_exp[-which(name_exp == dat_dim)] + name_obs <- 
name_obs[-which(name_obs == dat_dim)]
+  }
+  if (!identical(length(name_exp), length(name_obs)) |
+      !identical(dim(exp)[name_exp], dim(obs)[name_obs])) {
+    stop("Parameter 'exp' and 'obs' must have same length of ",
+         "all dimensions except 'memb_dim' and 'dat_dim'.")
+  }
+  ## prob_thresholds
+  if (!is.numeric(prob_thresholds) | !is.vector(prob_thresholds) |
+      any(prob_thresholds <= 0) | any(prob_thresholds >= 1)) {
+    stop("Parameter 'prob_thresholds' must be a numeric vector between 0 and 1.")
+  }
+  ## indices_for_clim
+  if (is.null(indices_for_clim)) {
+    indices_for_clim <- seq_len(dim(obs)[time_dim])
+  } else {
+    if (!is.numeric(indices_for_clim) | !is.vector(indices_for_clim)) {
+      stop("Parameter 'indices_for_clim' must be NULL or a numeric vector.")
+    } else if (length(indices_for_clim) > dim(obs)[time_dim] |
+               max(indices_for_clim) > dim(obs)[time_dim] |
+               min(indices_for_clim) < 1) {
+      stop("Parameter 'indices_for_clim' should be the indices of 'time_dim'.")
+    }
+  }
+  ## Fair
+  if (!is.logical(Fair) | length(Fair) > 1) {
+    stop("Parameter 'Fair' must be either TRUE or FALSE.")
+  }
+  if (Fair) {
+    if (!is.null(cat_dim)) {
+      if (cat_dim %in% names(dim(exp))) {
+        if (is.null(nmemb)) {
+          stop("Parameter 'nmemb' is necessary to compute the Fair ",
+               "score from probabilities.")
+        }
+      }
+    }
+  }
+  ## return_mean
+  if (!is.logical(return_mean) | length(return_mean) > 1) {
+    stop("Parameter 'return_mean' must be either TRUE or FALSE.")
+  }
+  ## cross.val
+  if (!is.logical(cross.val) | length(cross.val) > 1) {
+    stop("Parameter 'cross.val' must be either TRUE or FALSE.")
+  }
+  ## weights
+  if (!is.null(weights) & is.null(cat_dim)) {
+    if (!is.array(weights) | !is.numeric(weights))
+      stop("Parameter 'weights' must be a named numeric array.")
+    if (is.null(dat_dim)) {
+      if (length(dim(weights)) != 2 | !all(names(dim(weights)) %in% c(memb_dim, time_dim)))
+        stop("Parameter 'weights' must have two dimensions with the names of ",
+             "'memb_dim' and 'time_dim'.")
+      if (dim(weights)[memb_dim] != dim(exp)[memb_dim] |
+          dim(weights)[time_dim] != dim(exp)[time_dim]) {
+        stop("Parameter 'weights' must have the same dimension lengths ",
+             "as 'memb_dim' and 'time_dim' in 'exp'.")
+      }
+      weights <- Reorder(weights, c(time_dim, memb_dim))
+
+    } else {
+      if (length(dim(weights)) != 3 | !all(names(dim(weights)) %in% c(memb_dim, time_dim, dat_dim)))
+        stop("Parameter 'weights' must have three dimensions with the names of ",
+             "'memb_dim', 'time_dim' and 'dat_dim'.")
+      if (dim(weights)[memb_dim] != dim(exp)[memb_dim] |
+          dim(weights)[time_dim] != dim(exp)[time_dim] |
+          dim(weights)[dat_dim] != dim(exp)[dat_dim]) {
+        stop("Parameter 'weights' must have the same dimension lengths ",
+             "as 'memb_dim', 'time_dim' and 'dat_dim' in 'exp'.")
+      }
+      weights <- Reorder(weights, c(time_dim, memb_dim, dat_dim))
+
+    }
+  } else if (!is.null(weights) & !is.null(cat_dim)) {
+    .warning(paste0("Parameter 'exp' and 'obs' are probabilities already, so parameter ",
+                    "'weights' is not used. Change 'weights' to NULL."))
+    weights <- NULL
+  }
+  ## na.rm
+  if (!isTRUE(na.rm) & !isFALSE(na.rm) & !(is.numeric(na.rm) & na.rm >= 0 & na.rm <= 1)) {
+    stop('"na.rm" should be TRUE, FALSE or a numeric between 0 and 1')
+  }
+  ## ncores
+  if (!is.null(ncores)) {
+    if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 |
+        length(ncores) > 1) {
+      stop("Parameter 'ncores' must be either NULL or a positive integer.")
+    }
+  }
+
+  ###############################
+
+  # Compute RPS
+
+  ## Decide target_dims
+  if (!is.null(memb_dim)) {
+    target_dims_exp <- c(time_dim, memb_dim, dat_dim)
+    if (!memb_dim %in% names(dim(obs))) {
+      target_dims_obs <- c(time_dim, dat_dim)
+    } else {
+      target_dims_obs <- c(time_dim, memb_dim, dat_dim)
+    }
+  } else {  # cat_dim
+    target_dims_exp <- target_dims_obs <- c(time_dim, cat_dim, dat_dim)
+  }
+
+  rps <- Apply(data = list(exp = exp, obs = obs),
+               target_dims = list(exp = target_dims_exp,
+                                  obs = target_dims_obs),
+               fun = .RPS,
+               dat_dim = dat_dim, time_dim = time_dim,
+               memb_dim = memb_dim, cat_dim = cat_dim,
+               prob_thresholds = prob_thresholds, nmemb = nmemb,
+               indices_for_clim = indices_for_clim, Fair = Fair,
+               weights = weights, cross.val = cross.val,
+               na.rm = na.rm, ncores = ncores)$output1
+
+  if (return_mean) {
+    rps <- MeanDims(rps, time_dim, na.rm = TRUE)
+  }
+
+  return(rps)
+}
+
+
+.RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NULL,
+                 dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL,
+                 Fair = FALSE, nmemb = NULL, weights = NULL,
+                 cross.val = FALSE, na.rm = FALSE) {
+  #--- if memb_dim:
+  # exp: [sdate, memb, (dat)]
+  # obs: [sdate, (memb), (dat)]
+  # weights: NULL or same as exp
+  #--- if cat_dim:
+  # exp: [sdate, bin, (dat)]
+  # obs: [sdate, bin, (dat)]
+
+  # Adjust dimensions to be [sdate, memb, dat] for both exp and obs
+  if (!is.null(memb_dim)) {
+    if (!memb_dim %in% names(dim(obs))) {
+      obs <- InsertDim(obs, posdim = 2, lendim = 1, name = memb_dim)
+    }
+  }
+
+  if (is.null(dat_dim)) {
+    nexp <- 1
+    nobs <- 1
+    dim(exp) <- c(dim(exp), nexp = nexp)
+    dim(obs) <- c(dim(obs), nobs = nobs)
+    if (!is.null(weights)) dim(weights) <- c(dim(weights), nexp = nexp)
+  } else {
+    nexp <- as.numeric(dim(exp)[dat_dim])
+    nobs <- as.numeric(dim(obs)[dat_dim])
+  }
+
+  rps <- array(dim = c(dim(exp)[time_dim], nexp = nexp, nobs = nobs))
+
+  for (i in 1:nexp) {
+    for (j in 1:nobs) {
+      exp_data <- exp[, , i]
+      obs_data <- obs[, , j]
+
+      if (is.null(dim(exp_data))) dim(exp_data) <- c(dim(exp)[1:2])
+      if (is.null(dim(obs_data))) dim(obs_data) <- c(dim(obs)[1:2])
+
+      # Find the fraction of NAs
+      ## If any member/bin is NA at a given time step, that time step is not a good value.
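+      ## rowMeans() is NA whenever any member/bin at a time step is NA, so
+      ## 'good_values' flags the time steps where both exp and obs are complete.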
+ exp_mean <- rowMeans(exp_data) + obs_mean <- rowMeans(obs_data) + good_values <- !is.na(exp_mean) & !is.na(obs_mean) + + if (isTRUE(na.rm)) { + f_NAs <- 0 + } else if (isFALSE(na.rm)) { + f_NAs <- 1 + } else { + f_NAs <- na.rm + } + + if (f_NAs <= sum(good_values) / length(obs_mean)) { + + exp_data <- exp_data[good_values, , drop = F] + obs_data <- obs_data[good_values, , drop = F] + + # If the data inputs are forecast/observation, calculate probabilities + if (is.null(cat_dim)) { + if (!is.null(weights)) { + weights_data <- weights[which(good_values), , i] + if (is.null(dim(weights_data))) dim(weights_data) <- c(dim(weights)[1:2]) + } else { + weights_data <- weights #NULL + } + + # Subset indices_for_clim + dum <- match(indices_for_clim, which(good_values)) + good_indices_for_clim <- dum[!is.na(dum)] + + exp_probs <- .GetProbs(data = exp_data, indices_for_quantiles = good_indices_for_clim, + prob_thresholds = prob_thresholds, weights = weights_data, + cross.val = cross.val) + # exp_probs: [bin, sdate] + obs_probs <- .GetProbs(data = obs_data, indices_for_quantiles = good_indices_for_clim, + prob_thresholds = prob_thresholds, weights = NULL, + cross.val = cross.val) + # obs_probs: [bin, sdate] + + } else { # inputs are probabilities already + if (all(names(dim(exp_data)) == c(time_dim, memb_dim)) || + all(names(dim(exp_data)) == c(time_dim, cat_dim))) { + exp_probs <- t(exp_data) + obs_probs <- t(obs_data) + } + } + + probs_exp_cumsum <- apply(exp_probs, 2, cumsum) + probs_obs_cumsum <- apply(obs_probs, 2, cumsum) + + # rps: [sdate, nexp, nobs] + rps [good_values, i, j] <- colSums((probs_exp_cumsum - probs_obs_cumsum)^2) + if (Fair) { # FairRPS + if (!is.null(memb_dim)) { + if (memb_dim %in% names(dim(exp))) { + ## adjustment <- rowSums(-1 * (1/R - 1/R.new) * ens.cum * (R - ens.cum)/R/(R - 1)) + ## [formula taken from SpecsVerification::EnsRps] + R <- dim(exp)[2] #memb + } + } else { + R <- nmemb + } + warning("Applying fair correction.") + adjustment <- (-1) / (R - 1) * probs_exp_cumsum * (1 - probs_exp_cumsum) + adjustment <- colSums(adjustment) + rps[, i, j] <- rps[, i, j] + adjustment + } + + } else { ## not enough values different from NA + + rps[, i, j] <- NA_real_ + + } + + } + } + + if (is.null(dat_dim)) { + dim(rps) <- dim(exp)[time_dim] + } + + return(rps) +} + diff --git a/modules/Crossval/R/tmp/RPSS.R b/modules/Crossval/R/tmp/RPSS.R new file mode 100644 index 00000000..fc9931ad --- /dev/null +++ b/modules/Crossval/R/tmp/RPSS.R @@ -0,0 +1,638 @@ +#'Compute the Ranked Probability Skill Score +#' +#'The Ranked Probability Skill Score (RPSS; Wilks, 2011) is the skill score +#'based on the Ranked Probability Score (RPS; Wilks, 2011). It can be used to +#'assess whether a forecast presents an improvement or worsening with respect to +#'a reference forecast. The RPSS ranges between minus infinite and 1. If the +#'RPSS is positive, it indicates that the forecast has higher skill than the +#'reference forecast, while a negative value means that it has a lower skill.\cr +#'Examples of reference forecasts are the climatological forecast (same +#'probabilities for all categories for all time steps), persistence, a previous +#'model version, and another model. It is computed as +#'\code{RPSS = 1 - RPS_exp / RPS_ref}. The statistical significance is obtained +#'based on a Random Walk test at the specified confidence level (DelSole and +#'Tippett, 2016).\cr +#'The function accepts either the ensemble members or the probabilities of +#'each data as inputs. 
If there is more than one dataset, RPSS will be
+#'computed for each pair of exp and obs data. The NA ratio of the data will be
+#'examined before the calculation. If the ratio is higher than the threshold
+#'(assigned by parameter \code{na.rm}), NA will be returned directly. NAs are
+#'counted pair-wise: only the time steps at which all the datasets have values
+#'count as non-NA values.
+#'
+#'@param exp A named numerical array of either the forecast with at least time
+#' and member dimensions, or the probabilities with at least time and category
+#' dimensions. The probabilities can be generated by \code{s2dv::GetProbs}.
+#'@param obs A named numerical array of either the observation with at least
+#' time dimension, or the probabilities with at least time and category
+#' dimensions. The probabilities can be generated by \code{s2dv::GetProbs}. The
+#' dimensions must be the same as 'exp' except 'memb_dim' and 'dat_dim'.
+#'@param ref A named numerical array of either the reference forecast with at
+#' least time and member dimensions, or the probabilities with at least time and
+#' category dimensions. The probabilities can be generated by
+#' \code{s2dv::GetProbs}. The dimensions must be the same as 'exp' except
+#' 'memb_dim' and 'dat_dim'. If there is only one reference dataset, it should
+#' not have a dataset dimension. If there is a corresponding reference for each
+#' experiment, the dataset dimension must have the same length as in 'exp'. If
+#' 'ref' is NULL, the climatological forecast is used as reference forecast.
+#' The default value is NULL.
+#'@param time_dim A character string indicating the name of the time dimension.
+#' The default value is 'sdate'.
+#'@param memb_dim A character string indicating the name of the member dimension
+#' to compute the probabilities of the forecast and the reference forecast. The
+#' default value is 'member'. If the data are probabilities, set memb_dim as
+#' NULL.
+#'@param cat_dim A character string indicating the name of the category
+#' dimension that is needed when exp, obs, and ref are probabilities. The
+#' default value is NULL, which means that the data are not probabilities.
+#'@param dat_dim A character string indicating the name of dataset dimension.
+#' The length of this dimension can be different between 'exp' and 'obs'.
+#' The default value is NULL.
+#'@param prob_thresholds A numeric vector of the relative thresholds (from 0 to
+#' 1) between the categories. The default value is c(1/3, 2/3), which
+#' corresponds to tercile equiprobable categories.
+#'@param indices_for_clim A vector of the indices to be taken along 'time_dim'
+#' for computing the thresholds between the probabilistic categories. If NULL,
+#' the whole period is used. The default value is NULL.
+#'@param Fair A logical indicating whether to compute the FairRPSS (the
+#' potential RPSS that the forecast would have with an infinite ensemble size).
+#' The default value is FALSE.
+#'@param weights_exp A named numerical array of the forecast ensemble weights
+#' for probability calculation. The dimension should include 'memb_dim',
+#' 'time_dim' and 'dat_dim' if there are multiple datasets. All dimension
+#' lengths must be equal to 'exp' dimension lengths. The default value is NULL,
+#' which means no weighting is applied. The ensemble should have at least 70
+#' members or span at least 10 time steps and have more than 45 members if
+#' consistency between the weighted and unweighted methodologies is desired. 
+#'@param weights_ref Same as 'weights_exp' but for the reference forecast.
+#'@param cross.val A logical indicating whether to compute the thresholds
+#' between probabilistic categories in cross-validation. The default value is
+#' FALSE.
+#'@param na.rm A logical or numeric value between 0 and 1. If it is numeric, it
+#' means the lower limit for the fraction of the non-NA values. 1 is equal to
+#' FALSE (no NA is acceptable), 0 is equal to TRUE (all NAs are acceptable).
+#' The function returns NA if the fraction of non-NA values in the data is less
+#' than na.rm. Otherwise, RPS will be calculated. The default value is FALSE.
+#'@param sig_method.type A character string indicating the test type of the
+#' significance method. Check \code{RandomWalkTest()} parameter
+#' \code{test.type} for details. The default is 'two.sided.approx', which is
+#' the default of \code{RandomWalkTest()}.
+#'@param alpha A numeric of the significance level to be used in the statistical
+#' significance test. The default value is 0.05.
+#'@param ncores An integer indicating the number of cores to use for parallel
+#' computation. The default value is NULL.
+#'
+#'@return
+#'\item{$rpss}{
+#' A numerical array of RPSS with dimensions c(nexp, nobs, the rest of the
+#' dimensions of 'exp' except 'time_dim' and 'memb_dim'). nexp is the number of
+#' experiments (i.e., dat_dim in exp), and nobs is the number of observations
+#' (i.e., dat_dim in obs). If dat_dim is NULL, nexp and nobs are omitted.
+#'}
+#'\item{$sign}{
+#' A logical array of the statistical significance of the RPSS with the same
+#' dimensions as $rpss.
+#'}
+#'
+#'@references
+#'Wilks, 2011; https://doi.org/10.1016/B978-0-12-385022-5.00008-7
+#'DelSole and Tippett, 2016; https://doi.org/10.1175/MWR-D-15-0218.1
+#'
+#'@examples
+#'set.seed(1)
+#'exp <- array(rnorm(3000), dim = c(lat = 3, lon = 2, member = 10, sdate = 50))
+#'set.seed(2)
+#'obs <- array(rnorm(300), dim = c(lat = 3, lon = 2, sdate = 50))
+#'set.seed(3)
+#'ref <- array(rnorm(3000), dim = c(lat = 3, lon = 2, member = 10, sdate = 50))
+#'weights <- sapply(1:dim(exp)['sdate'], function(i) {
+#'  n <- abs(rnorm(10))
+#'  n/sum(n)
+#'})
+#'dim(weights) <- c(member = 10, sdate = 50)
+#'# Use data as input
+#'res <- RPSS(exp = exp, obs = obs) ## climatology as reference forecast
+#'res <- RPSS(exp = exp, obs = obs, ref = ref) ## ref as reference forecast
+#'res <- RPSS(exp = exp, obs = obs, ref = ref, weights_exp = weights, weights_ref = weights)
+#'res <- RPSS(exp = exp, obs = obs, alpha = 0.01, sig_method.type = 'two.sided')
+#'
+#'# Use probs as input
+#'exp_probs <- GetProbs(exp, memb_dim = 'member')
+#'obs_probs <- GetProbs(obs, memb_dim = NULL)
+#'ref_probs <- GetProbs(ref, memb_dim = 'member')
+#'res <- RPSS(exp = exp_probs, obs = obs_probs, ref = ref_probs, memb_dim = NULL,
+#'            cat_dim = 'bin')
+#'
+#'@import multiApply
+#'@export
+RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member', cat_dim = NULL,
+                 dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL,
+                 Fair = FALSE, nmemb = NULL, nmemb_ref = NULL,
+                 weights_exp = NULL, weights_ref = NULL,
+                 cross.val = FALSE, na.rm = FALSE,
+                 sig_method.type = 'two.sided.approx', alpha = 0.05, ncores = NULL) {
+
+  # Check inputs
+  ## exp, obs, and ref (1)
+  if (!is.array(exp) | !is.numeric(exp)) {
+    stop("Parameter 'exp' must be a numeric array.")
+  }
+  if (!is.array(obs) | !is.numeric(obs)) {
+    stop("Parameter 'obs' must be a numeric array.")
+  }
+  if (any(is.null(names(dim(exp)))) | 
any(nchar(names(dim(exp))) == 0) | + any(is.null(names(dim(obs)))) | any(nchar(names(dim(obs))) == 0)) { + stop("Parameter 'exp' and 'obs' must have dimension names.") + } + if (!is.null(ref)) { + if (!is.array(ref) | !is.numeric(ref)) + stop("Parameter 'ref' must be a numeric array.") + if (any(is.null(names(dim(ref)))) | any(nchar(names(dim(ref))) == 0)) { + stop("Parameter 'ref' must have dimension names.") + } + } + ## time_dim + if (!is.character(time_dim) | length(time_dim) != 1) { + stop("Parameter 'time_dim' must be a character string.") + } + if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) { + stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.") + } + if (!is.null(ref) & !time_dim %in% names(dim(ref))) { + stop("Parameter 'time_dim' is not found in 'ref' dimension.") + } + ## memb_dim & cat_dim + if (is.null(memb_dim) + is.null(cat_dim) != 1) { + stop("Only one of the two parameters 'memb_dim' and 'cat_dim' can have value.") + } + ## memb_dim + if (!is.null(memb_dim)) { + if (!is.character(memb_dim) | length(memb_dim) > 1) { + stop("Parameter 'memb_dim' must be a character string.") + } + if (!memb_dim %in% names(dim(exp))) { + stop("Parameter 'memb_dim' is not found in 'exp' dimension.") + } + if (!is.null(ref) & !memb_dim %in% names(dim(ref))) { + stop("Parameter 'memb_dim' is not found in 'ref' dimension.") + } + } + ## cat_dim + if (!is.null(cat_dim)) { + if (!is.character(cat_dim) | length(cat_dim) > 1) { + stop("Parameter 'cat_dim' must be a character string.") + } + if (!cat_dim %in% names(dim(exp)) | !cat_dim %in% names(dim(obs)) | + (!is.null(ref) & !cat_dim %in% names(dim(ref)))) { + stop("Parameter 'cat_dim' is not found in 'exp', 'obs', or 'ref' dimension.") + } + } + ## dat_dim + if (!is.null(dat_dim)) { + if (!is.character(dat_dim) | length(dat_dim) > 1) { + stop("Parameter 'dat_dim' must be a character string.") + } + if (!dat_dim %in% names(dim(exp)) | !dat_dim %in% names(dim(obs))) { + stop("Parameter 'dat_dim' is not found in 'exp' or 'obs' dimension.", + " Set it as NULL if there is no dataset dimension.") + } + } + ## exp, obs, and ref (2) + name_exp <- sort(names(dim(exp))) + name_obs <- sort(names(dim(obs))) + if (!is.null(memb_dim)) { + name_exp <- name_exp[-which(name_exp == memb_dim)] + if (memb_dim %in% name_obs) { + name_obs <- name_obs[-which(name_obs == memb_dim)] + } + } + if (!is.null(dat_dim)) { + name_exp <- name_exp[-which(name_exp == dat_dim)] + name_obs <- name_obs[-which(name_obs == dat_dim)] + } + if (!identical(length(name_exp), length(name_obs)) | + !identical(dim(exp)[name_exp], dim(obs)[name_obs])) { + stop("Parameter 'exp' and 'obs' must have same length of ", + "all dimensions except 'memb_dim' and 'dat_dim'.") + } + if (!is.null(ref)) { + name_ref <- sort(names(dim(ref))) + if (!is.null(memb_dim)) { + name_ref <- name_ref[-which(name_ref == memb_dim)] + } + if (!is.null(dat_dim)) { + if (dat_dim %in% name_ref) { + if (!identical(dim(exp)[dat_dim], dim(ref)[dat_dim])) { + stop("If parameter 'ref' has dataset dimension, it must be", + " equal to dataset dimension of 'exp'.") + } + name_ref <- name_ref[-which(name_ref == dat_dim)] + } + } + if (!identical(length(name_exp), length(name_ref)) | + !identical(dim(exp)[name_exp], dim(ref)[name_ref])) { + stop("Parameter 'exp' and 'ref' must have the same length of ", + "all dimensions except 'memb_dim' and 'dat_dim' if there is ", + "only one reference dataset.") + } + } + ## prob_thresholds + if (!is.numeric(prob_thresholds) | !is.vector(prob_thresholds) 
|
+      any(prob_thresholds <= 0) | any(prob_thresholds >= 1)) {
+    stop("Parameter 'prob_thresholds' must be a numeric vector between 0 and 1.")
+  }
+  ## indices_for_clim
+  if (is.null(indices_for_clim)) {
+    indices_for_clim <- seq_len(dim(obs)[time_dim])
+  } else {
+    if (!is.numeric(indices_for_clim) | !is.vector(indices_for_clim)) {
+      stop("Parameter 'indices_for_clim' must be NULL or a numeric vector.")
+    } else if (length(indices_for_clim) > dim(obs)[time_dim] |
+               max(indices_for_clim) > dim(obs)[time_dim] |
+               min(indices_for_clim) < 1) {
+      stop("Parameter 'indices_for_clim' should be the indices of 'time_dim'.")
+    }
+  }
+  ## Fair
+  if (!is.logical(Fair) | length(Fair) > 1) {
+    stop("Parameter 'Fair' must be either TRUE or FALSE.")
+  }
+  ## cross.val
+  if (!is.logical(cross.val) | length(cross.val) > 1) {
+    stop("Parameter 'cross.val' must be either TRUE or FALSE.")
+  }
+  ## weights_exp
+  if (!is.null(weights_exp) & is.null(cat_dim)) {
+    if (!is.array(weights_exp) | !is.numeric(weights_exp))
+      stop("Parameter 'weights_exp' must be a named numeric array.")
+
+    if (is.null(dat_dim)) {
+      if (length(dim(weights_exp)) != 2 |
+          !all(names(dim(weights_exp)) %in% c(memb_dim, time_dim))) {
+        stop("Parameter 'weights_exp' must have two dimensions with the names of ",
+             "'memb_dim' and 'time_dim'.")
+      }
+      if (dim(weights_exp)[memb_dim] != dim(exp)[memb_dim] |
+          dim(weights_exp)[time_dim] != dim(exp)[time_dim]) {
+        stop("Parameter 'weights_exp' must have the same dimension lengths as ",
+             "'memb_dim' and 'time_dim' in 'exp'.")
+      }
+      weights_exp <- Reorder(weights_exp, c(time_dim, memb_dim))
+
+    } else {
+      if (length(dim(weights_exp)) != 3 |
+          !all(names(dim(weights_exp)) %in% c(memb_dim, time_dim, dat_dim))) {
+        stop("Parameter 'weights_exp' must have three dimensions with the names of ",
+             "'memb_dim', 'time_dim' and 'dat_dim'.")
+      }
+      if (dim(weights_exp)[memb_dim] != dim(exp)[memb_dim] |
+          dim(weights_exp)[time_dim] != dim(exp)[time_dim] |
+          dim(weights_exp)[dat_dim] != dim(exp)[dat_dim]) {
+        stop("Parameter 'weights_exp' must have the same dimension lengths ",
+             "as 'memb_dim', 'time_dim' and 'dat_dim' in 'exp'.")
+      }
+      weights_exp <- Reorder(weights_exp, c(time_dim, memb_dim, dat_dim))
+    }
+  } else if (!is.null(weights_exp) & !is.null(cat_dim)) {
+    .warning(paste0("Parameter 'exp' already contains probabilities, so parameter ",
+                    "'weights_exp' is not used. 
Change 'weights_exp' to NULL."))
+    weights_exp <- NULL
+  }
+  ## weights_ref
+  if (!is.null(weights_ref) & is.null(cat_dim)) {
+    if (!is.array(weights_ref) | !is.numeric(weights_ref))
+      stop("Parameter 'weights_ref' must be a named numeric array.")
+
+    if (is.null(dat_dim) | ((!is.null(dat_dim)) && (!dat_dim %in% names(dim(ref))))) {
+      if (length(dim(weights_ref)) != 2 |
+          !all(names(dim(weights_ref)) %in% c(memb_dim, time_dim))) {
+        stop("Parameter 'weights_ref' must have two dimensions with the names of ",
+             "'memb_dim' and 'time_dim'.")
+      }
+      if (dim(weights_ref)[memb_dim] != dim(ref)[memb_dim] |
+          dim(weights_ref)[time_dim] != dim(ref)[time_dim]) {
+        stop("Parameter 'weights_ref' must have the same dimension lengths as ",
+             "'memb_dim' and 'time_dim' in 'ref'.")
+      }
+      weights_ref <- Reorder(weights_ref, c(time_dim, memb_dim))
+
+    } else {
+      if (length(dim(weights_ref)) != 3 |
+          !all(names(dim(weights_ref)) %in% c(memb_dim, time_dim, dat_dim))) {
+        stop("Parameter 'weights_ref' must have three dimensions with the names of ",
+             "'memb_dim', 'time_dim' and 'dat_dim'.")
+      }
+      if (dim(weights_ref)[memb_dim] != dim(ref)[memb_dim] |
+          dim(weights_ref)[time_dim] != dim(ref)[time_dim] |
+          dim(weights_ref)[dat_dim] != dim(ref)[dat_dim]) {
+        stop("Parameter 'weights_ref' must have the same dimension lengths ",
+             "as 'memb_dim', 'time_dim' and 'dat_dim' in 'ref'.")
+      }
+      weights_ref <- Reorder(weights_ref, c(time_dim, memb_dim, dat_dim))
+    }
+  } else if (!is.null(weights_ref) & !is.null(cat_dim)) {
+    .warning(paste0("Parameter 'ref' already contains probabilities, so parameter ",
+                    "'weights_ref' is not used. Change 'weights_ref' to NULL."))
+    weights_ref <- NULL
+  }
+  ## na.rm
+  if (!isTRUE(na.rm) & !isFALSE(na.rm) & !(is.numeric(na.rm) & na.rm >= 0 & na.rm <= 1)) {
+    stop('"na.rm" should be TRUE, FALSE or a numeric between 0 and 1')
+  }
+  ## alpha
+  if (any(!is.numeric(alpha) | alpha <= 0 | alpha >= 1 | length(alpha) > 1)) {
+    stop("Parameter 'alpha' must be a number between 0 and 1.")
+  }
+  ## sig_method.type
+  #NOTE: These are the types of RandomWalkTest()
+  if (!sig_method.type %in% c('two.sided.approx', 'two.sided', 'greater', 'less')) {
+    stop("Parameter 'sig_method.type' must be 'two.sided.approx', 'two.sided', ",
+         "'greater', or 'less'.")
+  }
+  if (sig_method.type == 'two.sided.approx' && alpha != 0.05) {
+    .warning("DelSole and Tippett (2016) approximation is valid for alpha ",
+             "= 0.05 only. 
Returning the significance at the 0.05 significance level.") + } + ## ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | + length(ncores) > 1) { + stop("Parameter 'ncores' must be either NULL or a positive integer.") + } + } + + ############################### + + # Compute RPSS + + ## Decide target_dims + if (!is.null(memb_dim)) { + target_dims_exp <- c(time_dim, memb_dim, dat_dim) + if (!memb_dim %in% names(dim(obs))) { + target_dims_obs <- c(time_dim, dat_dim) + } else { + target_dims_obs <- c(time_dim, memb_dim, dat_dim) + } + } else { # cat_dim + target_dims_exp <- target_dims_obs <- c(time_dim, cat_dim, dat_dim) + } + + if (!is.null(ref)) { # use "ref" as reference forecast + if (!is.null(memb_dim)) { + if (!is.null(dat_dim) && (dat_dim %in% names(dim(ref)))) { + target_dims_ref <- c(time_dim, memb_dim, dat_dim) + } else { + target_dims_ref <- c(time_dim, memb_dim) + } + } else { + target_dims_ref <- c(time_dim, cat_dim, dat_dim) + } + data <- list(exp = exp, obs = obs, ref = ref) + target_dims = list(exp = target_dims_exp, + obs = target_dims_obs, + ref = target_dims_ref) + } else { + data <- list(exp = exp, obs = obs) + target_dims = list(exp = target_dims_exp, + obs = target_dims_obs) + } + + output <- Apply(data, + target_dims = target_dims, + fun = .RPSS, + time_dim = time_dim, memb_dim = memb_dim, + cat_dim = cat_dim, dat_dim = dat_dim, + prob_thresholds = prob_thresholds, + indices_for_clim = indices_for_clim, Fair = Fair, + nmemb = nmemb, nmemb_ref = nmemb_ref, + weights_exp = weights_exp, + weights_ref = weights_ref, + cross.val = cross.val, + na.rm = na.rm, sig_method.type = sig_method.type, alpha = alpha, + ncores = ncores) + + return(output) + +} + +.RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member', cat_dim = NULL, + dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, + Fair = FALSE, nmemb = NULL, nmemb_ref = NULL, + weights_exp = NULL, weights_ref = NULL, cross.val = FALSE, + na.rm = FALSE, sig_method.type = 'two.sided.approx', alpha = 0.05) { + #--- if memb_dim: + # exp: [sdate, memb, (dat)] + # obs: [sdate, (memb), (dat)] + # ref: [sdate, memb, (dat)] or NULL + #--- if cat_dim: + # exp: [sdate, bin, (dat)] + # obs: [sdate, bin, (dat)] + # ref: [sdate, bin, (dat)] or NULL + + if (isTRUE(na.rm)) { + f_NAs <- 0 + } else if (isFALSE(na.rm)) { + f_NAs <- 1 + } else { + f_NAs <- na.rm + } + + if (is.null(dat_dim)) { + nexp <- 1 + nobs <- 1 + } else { + nexp <- as.numeric(dim(exp)[dat_dim]) + nobs <- as.numeric(dim(obs)[dat_dim]) + } + + # Calculate RPS + + if (!is.null(ref)) { + + # Adjust dimensions to be [sdate, memb, dat] for both exp, obs, and ref + ## Insert memb_dim in obs + if (!is.null(memb_dim)) { + if (!memb_dim %in% names(dim(obs))) { + obs <- InsertDim(obs, posdim = 2, lendim = 1, name = memb_dim) + } + } + ## Insert dat_dim + if (is.null(dat_dim)) { + dim(obs) <- c(dim(obs), dat = nobs) + dim(exp) <- c(dim(exp), dat = nexp) + if (!is.null(weights_exp)) dim(weights_exp) <- c(dim(weights_exp), dat = nexp) + } + if (is.null(dat_dim) || (!is.null(dat_dim) && !dat_dim %in% names(dim(ref)))) { + nref <- 1 + dim(ref) <- c(dim(ref), dat = nref) + if (!is.null(weights_ref)) dim(weights_ref) <- c(dim(weights_ref), dat = nref) + } else { + nref <- as.numeric(dim(ref)[dat_dim]) # should be the same as nexp + } + + # Find good values then calculate RPS + rps_exp <- array(NA, dim = c(dim(exp)[time_dim], nexp = nexp, nobs = nobs)) + rps_ref <- array(NA, dim = c(dim(exp)[time_dim], nexp 
= nexp, nobs = nobs)) + for (i in 1:nexp) { + for (j in 1:nobs) { + for (k in 1:nref) { + if (nref != 1 & k != i) { # if nref is 1 or equal to nexp, calculate rps + next + } + exp_data <- exp[, , i, drop = F] + obs_data <- obs[, , j, drop = F] + ref_data <- ref[, , k, drop = F] + exp_mean <- rowMeans(exp_data) + obs_mean <- rowMeans(obs_data) + ref_mean <- rowMeans(ref_data) + good_values <- !is.na(exp_mean) & !is.na(obs_mean) & !is.na(ref_mean) + dum <- match(indices_for_clim, which(good_values)) + good_indices_for_clim <- dum[!is.na(dum)] + + if (f_NAs <= sum(good_values) / length(good_values)) { + rps_exp[good_values, i, j] <- .RPS(exp = exp[good_values, , i], + obs = obs[good_values, , j], + time_dim = time_dim, memb_dim = memb_dim, + cat_dim = cat_dim, dat_dim = NULL, + prob_thresholds = prob_thresholds, + indices_for_clim = good_indices_for_clim, + Fair = Fair, nmemb = nmemb, + weights = weights_exp[good_values, , i], + cross.val = cross.val, na.rm = na.rm) + rps_ref[good_values, i, j] <- .RPS(exp = ref[good_values, , k], + obs = obs[good_values, , j], + time_dim = time_dim, memb_dim = memb_dim, + cat_dim = cat_dim, dat_dim = NULL, + prob_thresholds = prob_thresholds, + indices_for_clim = good_indices_for_clim, + Fair = Fair, nmemb = nmemb_ref, + weights = weights_ref[good_values, , k], + na.rm = na.rm, cross.val = cross.val) + } + } + } + } + + } else { # ref is NULL + rps_exp <- .RPS(exp = exp, obs = obs, time_dim = time_dim, memb_dim = memb_dim, + cat_dim = cat_dim, dat_dim = dat_dim, prob_thresholds = prob_thresholds, + indices_for_clim = indices_for_clim, Fair = Fair, + nmemb = nmemb, weights = weights_exp, + cross.val = cross.val, na.rm = na.rm) + + # RPS of the reference forecast + if (!is.null(memb_dim)) { + if (!memb_dim %in% names(dim(obs))) { + obs <- InsertDim(obs, posdim = 2, lendim = 1, name = memb_dim) + } + } + + rps_ref <- array(NA, dim = c(dim(obs)[time_dim], nexp = nexp, nobs = nobs)) + + if (is.null(dat_dim)) { + dim(obs) <- c(dim(obs), nobs = nobs) + dim(exp) <- c(dim(exp), nexp = nexp) + dim(rps_exp) <- dim(rps_ref) + } + + for (i in 1:nexp) { + for (j in 1:nobs) { + # Use good values only + good_values <- !is.na(rps_exp[, i, j]) + if (f_NAs <= sum(good_values) / length(good_values)) { + obs_data <- obs[good_values, , j] + if (is.null(dim(obs_data))) dim(obs_data) <- c(length(obs_data), 1) + + if (is.null(cat_dim)) { # calculate probs + # Subset indices_for_clim + dum <- match(indices_for_clim, which(good_values)) + good_indices_for_clim <- dum[!is.na(dum)] + obs_probs <- .GetProbs(data = obs_data, + indices_for_quantiles = good_indices_for_clim, + prob_thresholds = prob_thresholds, + weights = NULL, cross.val = cross.val) + } else { + obs_probs <- t(obs_data) + } + # obs_probs: [bin, sdate] + + clim_probs <- c(prob_thresholds[1], diff(prob_thresholds), + 1 - prob_thresholds[length(prob_thresholds)]) + clim_probs <- array(clim_probs, dim = dim(obs_probs)) + # clim_probs: [bin, sdate] + + # Calculate RPS for each time step + probs_clim_cumsum <- apply(clim_probs, 2, cumsum) + probs_obs_cumsum <- apply(obs_probs, 2, cumsum) + rps_ref[good_values, i, j] <- colSums((probs_clim_cumsum - probs_obs_cumsum)^2) + } + if (Fair) { # FairRPS + if (!is.null(memb_dim)) { + if (memb_dim %in% names(dim(exp))) { + ## adjustment <- rowSums(-1 * (1/R - 1/R.new) * ens.cum * (R - ens.cum)/R/(R - 1)) + ## [formula taken from SpecsVerification::EnsRps] + R <- dim(obs)[1] #number of years + } + } else { + R <- nmemb_ref + } + adjustment <- (-1) / (R - 1) * probs_clim_cumsum * (1 - 
probs_clim_cumsum) + adjustment <- colSums(adjustment) + rps_ref[, i, j] <- rps_ref[, i, j] + adjustment + } + } + } + } + + if (is.null(dat_dim)) { + dim(rps_ref) <- dim(rps_exp) <- dim(exp)[time_dim] + } + +#---------------------------------------------- + # Calculate RPSS + + if (!is.null(dat_dim)) { + # rps_exp and rps_ref: [sdate, nexp, nobs] + rps_exp_mean <- colMeans(rps_exp, na.rm = TRUE) + rps_ref_mean <- colMeans(rps_ref, na.rm = TRUE) + rpss <- array(dim = c(nexp = nexp, nobs = nobs)) + sign <- array(dim = c(nexp = nexp, nobs = nobs)) + + if (!all(is.na(rps_exp_mean))) { + for (i in 1:nexp) { + for (j in 1:nobs) { + rpss[i, j] <- 1 - rps_exp_mean[i, j] / rps_ref_mean[i, j] + ind_nonNA <- !is.na(rps_exp[, i, j]) + if (!any(ind_nonNA)) { + sign[i, j] <- NA + } else { + sign[i, j] <- .RandomWalkTest(skill_A = rps_exp[ind_nonNA, i, j], + skill_B = rps_ref[ind_nonNA, i, j], + test.type = sig_method.type, alpha = alpha, + sign = T, pval = F)$sign + } + } + } + } + + # Turn NaN into NA + if (any(is.nan(rpss))) rpss[which(is.nan(rpss))] <- NA + + } else { # dat_dim is NULL + + ind_nonNA <- !is.na(rps_exp) + if (!any(ind_nonNA)) { + rpss <- NA + sign <- NA + } else { + # rps_exp and rps_ref: [sdate] + rpss <- 1 - mean(rps_exp, na.rm = TRUE) / mean(rps_ref, na.rm = TRUE) + sign <- .RandomWalkTest(skill_A = rps_exp[ind_nonNA], + skill_B = rps_ref[ind_nonNA], + test.type = sig_method.type, alpha = alpha, + sign = T, pval = F)$sign + } + } + + return(list(rpss = rpss, sign = sign)) +} diff --git a/modules/Crossval/R/tmp/RandomWalkTest.R b/modules/Crossval/R/tmp/RandomWalkTest.R new file mode 100644 index 00000000..16d89f6d --- /dev/null +++ b/modules/Crossval/R/tmp/RandomWalkTest.R @@ -0,0 +1,184 @@ +#'Random Walk test for skill differences +#' +#'Forecast comparison of the skill obtained with 2 forecasts (with respect to a +#'common observational reference) based on Random Walks (DelSole and Tippett, +#'2016). +#' +#'@param skill_A A numerical array of the time series of the scores obtained +#' with the forecaster A. +#'@param skill_B A numerical array of the time series of the scores obtained +#' with the forecaster B. The dimensions should be identical as parameter +#' 'skill_A'. +#'@param time_dim A character string indicating the name of the dimension along +#' which the tests are computed. The default value is 'sdate'. +#'@param test.type A character string indicating the type of significance test. +#' It can be "two.sided.approx" (to assess whether forecaster A and forecaster +#' B are significantly different in terms of skill with a two-sided test using +#' the approximation of DelSole and Tippett, 2016), "two.sided" (to assess +#' whether forecaster A and forecaster B are significantly different in terms +#' of skill with an exact two-sided test), "greater" (to assess whether +#' forecaster A shows significantly better skill than forecaster B with a +#' one-sided test for negatively oriented scores), or "less" (to assess whether +#' forecaster A shows significantly better skill than forecaster B with a +#' one-sided test for positively oriented scores). The default value is +#' "two.sided.approx". +#'@param alpha A numeric of the significance level to be used in the statistical +#' significance test (output "sign"). The default value is 0.05. +#'@param pval A logical value indicating whether to return the p-value of the +#' significance test. The default value is TRUE. 
+#'@param sign A logical value indicating whether to return the statistical +#' significance of the test based on 'alpha'. The default value is FALSE. +#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. +#' +#'@return A list with: +#'\item{$score}{ +#' A numerical array with the same dimensions as the input arrays except +#' 'time_dim'. The number of times that forecaster A has been better than +#' forecaster B minus the number of times that forecaster B has been better +#' than forecaster A (for skill negatively oriented, i.e., the lower the +#' better). If $score is positive, forecaster A has been better more times +#' than forecaster B. If $score is negative, forecaster B has been better more +#' times than forecaster A. +#'} +#'\item{$sign}{ +#' A logical array of the statistical significance with the same dimensions +#' as the input arrays except "time_dim". Returned only if "sign" is TRUE. +#'} +#'\item{$p.val}{ +#' A numeric array of the p-values with the same dimensions as the input arrays +#' except "time_dim". Returned only if "pval" is TRUE. +#'} +#' +#'@details +#' Null and alternative hypothesis for "two-sided" test (regardless of the +#' orientation of the scores):\cr +#' H0: forecaster A and forecaster B are not different in terms of skill\cr +#' H1: forecaster A and forecaster B are different in terms of skill +#' +#' Null and alternative hypothesis for one-sided "greater" (for negatively +#' oriented scores, i.e., the lower the better) and "less" (for positively +#' oriented scores, i.e., the higher the better) tests:\cr +#' H0: forecaster A is not better than forecaster B\cr +#' H1: forecaster A is better than forecaster B +#' +#' Examples of negatively oriented scores are the RPS, RMSE and the Error, while +#' the ROC score is a positively oriented score. +#' +#' DelSole and Tippett (2016) approximation for two-sided test at 95% confidence +#' level: significant if the difference between the number of times that +#' forecaster A has been better than forecaster B and forecaster B has been +#' better than forecaster A is above 2sqrt(N) or below -2sqrt(N). 
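+#' For example, for N = 100 time steps, the difference must be above
+#' 2 * sqrt(100) = 20 or below -20 to be considered significant.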
+#'
+#'@references
+#'DelSole and Tippett (2016): https://doi.org/10.1175/MWR-D-15-0218.1
+#'
+#'@examples
+#' fcst_A <- array(data = 11:50, dim = c(sdate = 10, lat = 2, lon = 2))
+#' fcst_B <- array(data = 21:60, dim = c(sdate = 10, lat = 2, lon = 2))
+#' reference <- array(data = 1:40, dim = c(sdate = 10, lat = 2, lon = 2))
+#' scores_A <- abs(fcst_A - reference)
+#' scores_B <- abs(fcst_B - reference)
+#' res1 <- RandomWalkTest(skill_A = scores_A, skill_B = scores_B, pval = FALSE, sign = TRUE)
+#' res2 <- RandomWalkTest(skill_A = scores_A, skill_B = scores_B, test.type = 'greater')
+#'
+#'@import multiApply
+#'@importFrom ClimProjDiags Subset
+#'@export
+RandomWalkTest <- function(skill_A, skill_B, time_dim = 'sdate',
+                           test.type = 'two.sided.approx', alpha = 0.05, pval = TRUE,
+                           sign = FALSE, ncores = NULL) {
+
+  # Check inputs
+  ## skill_A and skill_B
+  if (is.null(skill_A) | is.null(skill_B)) {
+    stop("Parameters 'skill_A' and 'skill_B' cannot be NULL.")
+  }
+  if (!is.numeric(skill_A) | !is.numeric(skill_B)) {
+    stop("Parameters 'skill_A' and 'skill_B' must be numerical arrays.")
+  }
+  if (!identical(dim(skill_A), dim(skill_B))) {
+    stop("Parameters 'skill_A' and 'skill_B' must have the same dimensions.")
+  }
+  ## time_dim
+  if (!is.character(time_dim) | length(time_dim) != 1) {
+    stop("Parameter 'time_dim' must be a character string.")
+  }
+  if (!time_dim %in% names(dim(skill_A)) | !time_dim %in% names(dim(skill_B))) {
+    stop("Parameter 'time_dim' is not found in 'skill_A' or 'skill_B' dimensions.")
+  }
+  ## alpha
+  if (any(!is.numeric(alpha) | alpha <= 0 | alpha >= 1 | length(alpha) > 1)) {
+    stop("Parameter 'alpha' must be a number between 0 and 1.")
+  }
+  ## test.type
+  if (!test.type %in% c('two.sided.approx', 'two.sided', 'greater', 'less')) {
+    stop("Parameter 'test.type' must be 'two.sided.approx', 'two.sided', 'greater', or 'less'.")
+  }
+  if (test.type == 'two.sided.approx') {
+    if (alpha != 0.05) {
+      .warning("DelSole and Tippett (2016) approximation is valid for alpha ",
+               "= 0.05 only. Returning the significance at the 0.05 significance level.")
+    }
+    if (pval) {
+      .warning("p-value cannot be returned with the DelSole and Tippett (2016) ",
+               "approximation. 
Returning the significance at the 0.05 significance level.")
+    }
+    sign <- TRUE
+  }
+  ## ncores
+  if (!is.null(ncores)) {
+    if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | length(ncores) > 1) {
+      stop("Parameter 'ncores' must be a positive integer.")
+    }
+  }
+
+  ## Compute the Random Walk Test
+  res <- Apply(data = list(skill_A = skill_A,
+                           skill_B = skill_B),
+               target_dims = list(skill_A = time_dim,
+                                  skill_B = time_dim),
+               fun = .RandomWalkTest,
+               test.type = test.type,
+               alpha = alpha, pval = pval, sign = sign,
+               ncores = ncores)
+
+  return(res)
+}
+
+.RandomWalkTest <- function(skill_A, skill_B, test.type = 'two.sided.approx',
+                            alpha = 0.05, pval = TRUE, sign = FALSE) {
+  #skill_A and skill_B: [sdate]
+
+  N.eff <- length(skill_A)
+
+  A_better <- sum(skill_B > skill_A)
+  B_better <- sum(skill_B < skill_A)
+
+  output <- NULL
+  output$score <- A_better - B_better
+
+  if (test.type == 'two.sided.approx') {
+    output$sign <- abs(output$score) > (2 * sqrt(N.eff))
+
+  } else {
+
+    if (!is.na(output$score)) {
+      p.val <- binom.test(x = A_better, n = floor(N.eff), p = 0.5, conf.level = 1 - alpha,
+                          alternative = test.type)$p.value
+
+    } else {
+      p.val <- NA
+    }
+
+    if (pval) {
+      output$p.val <- p.val
+    }
+    if (sign) {
+      output$sign <- !is.na(p.val) & p.val <= alpha
+    }
+
+  }
+
+  return(output)
+}
diff --git a/modules/Crossval/R/tmp/SprErr.R b/modules/Crossval/R/tmp/SprErr.R
new file mode 100644
index 00000000..33642eab
--- /dev/null
+++ b/modules/Crossval/R/tmp/SprErr.R
@@ -0,0 +1,227 @@
+#'Compute the ratio between the ensemble spread and RMSE
+#'
+#'Compute the ratio between the spread of the members around the
+#'ensemble mean in experimental data and the RMSE between the ensemble mean of
+#'experimental and observational data. The p-value and/or the statistical
+#'significance are provided by a one-sided Fisher's test.
+#'
+#'@param exp A named numeric array of experimental data with at least two
+#' dimensions 'memb_dim' and 'time_dim'.
+#'@param obs A named numeric array of observational data with at least two
+#' dimensions 'memb_dim' and 'time_dim'. It should have the same dimensions as
+#' parameter 'exp' except along 'dat_dim' and 'memb_dim'.
+#'@param dat_dim A character string indicating the name of dataset (nobs/nexp)
+#' dimension. The default value is NULL (no dataset).
+#'@param memb_dim A character string indicating the name of the member
+#' dimension. It must be one dimension in 'exp' and 'obs'. The default value
+#' is 'member'.
+#'@param time_dim A character string indicating the name of the dimension along
+#' which the ratio is computed. The default value is 'sdate'.
+#'@param pval A logical value indicating whether to compute the p-value of the
+#' test Ho: SD/RMSE = 1. The default value is TRUE.
+#'@param sign A logical value indicating whether to retrieve the statistical
+#' significance of the test Ho: SD/RMSE = 1 based on 'alpha'. The default value
+#' is FALSE.
+#'@param alpha A numeric indicating the significance level for the statistical
+#' significance test. The default value is 0.05.
+#'@param na.rm A logical value indicating whether to remove NA values. The default
+#' value is FALSE.
+#'@param ncores An integer indicating the number of cores to use for parallel
+#' computation. The default value is NULL.
+#'
+#'@return A list of two arrays with dimensions c(nexp, nobs, the rest of
+#' dimensions of 'exp' and 'obs' except memb_dim and time_dim), where nexp is
+#' the length of dat_dim of 'exp' and nobs is the length of dat_dim of 'obs'. 
+#' If dat_dim is NULL, nexp and nobs are omitted. \cr
+#'\item{$ratio}{
+#' The ratio of the ensemble spread and RMSE.
+#'}
+#'\item{$p.val}{
+#' The p-value of the one-sided Fisher's test with Ho: SD/RMSE = 1. Only present
+#' if \code{pval = TRUE}.
+#'}
+#'
+#'@examples
+#'# Load sample data as in Load() example:
+#'example(Load)
+#'rsdrms <- SprErr(sampleData$mod, sampleData$obs, dat_dim = 'dataset')
+#'# Reorder the data in order to plot it with PlotVsLTime
+#'rsdrms_plot <- array(dim = c(dim(rsdrms$ratio)[1:2], 4, dim(rsdrms$ratio)[3]))
+#'rsdrms_plot[, , 2, ] <- rsdrms$ratio
+#'rsdrms_plot[, , 4, ] <- rsdrms$p.val
+#'\dontrun{
+#'PlotVsLTime(rsdrms_plot, toptitle = "Ratio ensemble spread / RMSE", ytitle = "",
+#'            monini = 11, limits = c(-1, 1.3), listexp = c('CMIP5 IC3'),
+#'            listobs = c('ERSST'), biglab = FALSE, siglev = TRUE)
+#'}
+#'
+#'@import multiApply
+#'@export
+SprErr <- function(exp, obs, dat_dim = NULL, memb_dim = 'member',
+                   time_dim = 'sdate', pval = TRUE, sign = FALSE,
+                   alpha = 0.05, na.rm = FALSE, ncores = NULL) {
+
+  # Check inputs
+  ## exp and obs (1)
+  if (is.null(exp) | is.null(obs)) {
+    stop("Parameter 'exp' and 'obs' cannot be NULL.")
+  }
+  if (!is.numeric(exp) | !is.numeric(obs)) {
+    stop("Parameter 'exp' and 'obs' must be a numeric array.")
+  }
+  if (is.null(dim(exp)) | is.null(dim(obs))) {
+    stop(paste0("Parameter 'exp' and 'obs' must be arrays with at least two ",
+                "dimensions memb_dim and time_dim."))
+  }
+  if (any(is.null(names(dim(exp)))) | any(nchar(names(dim(exp))) == 0) |
+      any(is.null(names(dim(obs)))) | any(nchar(names(dim(obs))) == 0)) {
+    stop("Parameter 'exp' and 'obs' must have dimension names.")
+  }
+  ## dat_dim
+  if (!is.null(dat_dim)) {
+    if (!is.character(dat_dim) | length(dat_dim) > 1) {
+      stop("Parameter 'dat_dim' must be a character string.")
+    }
+    if (!dat_dim %in% names(dim(exp)) | !dat_dim %in% names(dim(obs))) {
+      stop("Parameter 'dat_dim' is not found in 'exp' or 'obs' dimension.")
+    }
+  }
+  ## memb_dim
+  if (!is.character(memb_dim) | length(memb_dim) > 1) {
+    stop("Parameter 'memb_dim' must be a character string.")
+  }
+  if (!memb_dim %in% names(dim(exp)) & !memb_dim %in% names(dim(obs))) {
+    stop("Parameter 'memb_dim' is not found in 'exp' nor 'obs' dimension. 
", + "Set it as NULL if there is no member dimension.") + } + # Add [member = 1] + if (memb_dim %in% names(dim(exp)) & !memb_dim %in% names(dim(obs))) { + dim(obs) <- c(dim(obs), 1) + names(dim(obs))[length(dim(obs))] <- memb_dim + } + if (!memb_dim %in% names(dim(exp)) & memb_dim %in% names(dim(obs))) { + dim(exp) <- c(dim(exp), 1) + names(dim(exp))[length(dim(exp))] <- memb_dim + } + ## time_dim + if (!is.character(time_dim) | length(time_dim) > 1) { + stop("Parameter 'time_dim' must be a character string.") + } + if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) { + stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.") + } + ## exp and obs (2) + name_exp <- sort(names(dim(exp))) + name_obs <- sort(names(dim(obs))) + if (!is.null(dat_dim)) { + name_exp <- name_exp[-which(name_exp == dat_dim)] + name_obs <- name_obs[-which(name_obs == dat_dim)] + } + name_exp <- name_exp[-which(name_exp == memb_dim)] + name_obs <- name_obs[-which(name_obs == memb_dim)] + if (!identical(dim(exp)[name_exp], dim(obs)[name_obs])) { + stop(paste0("Parameter 'exp' and 'obs' must have same length of ", + "all the dimensions except 'dat_dim' and 'memb_dim'.")) + } + ## pval + if (!is.logical(pval) | length(pval) > 1) { + stop("Parameter 'pval' must be one logical value.") + } + ## sign + if (!is.logical(sign) | length(sign) > 1) { + stop("Parameter 'sign' must be one logical value.") + } + # alpha + if (!is.numeric(alpha) | any(alpha < 0) | any(alpha > 1) | length(alpha) > 1) { + stop("Parameter 'alpha' must be a numeric number between 0 and 1.") + } + # na.rm + if (!na.rm %in% c(TRUE, FALSE)) { + stop("Parameter 'na.rm' must be TRUE or FALSE") + } + ## ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | + length(ncores) > 1) { + stop("Parameter 'ncores' must be a positive integer.") + } + } + + + ############################### + # Calculate RatioSDRMS + + # If dat_dim = NULL, insert dat dim + remove_dat_dim <- FALSE + if (is.null(dat_dim)) { + dat_dim <- 'dataset' + exp <- InsertDim(exp, posdim = 1, lendim = 1, name = 'dataset') + obs <- InsertDim(obs, posdim = 1, lendim = 1, name = 'dataset') + remove_dat_dim <- TRUE + } + + res <- Apply(list(exp, obs), + target_dims = list(c(dat_dim, memb_dim, time_dim), + c(dat_dim, memb_dim, time_dim)), + pval = pval, + sign = sign, + na.rm = na.rm, + fun = .SprErr, + ncores = ncores) + + if (remove_dat_dim) { + if (length(dim(res[[1]])) > 2) { + res <- lapply(res, Subset, c('nexp', 'nobs'), list(1, 1), drop = 'selected') + } else { + res <- lapply(res, as.numeric) + } + } + + return(res) +} + +.SprErr <- function(exp, obs, pval = TRUE, sign = FALSE, alpha = 0.05, na.rm = FALSE) { + + # exp: [dat_exp, member, sdate] + # obs: [dat_obs, member, sdate] + nexp <- dim(exp)[1] + nobs <- dim(obs)[1] + + # ensemble mean + ens_exp <- MeanDims(exp, 2, na.rm = na.rm) # [dat, sdate] + ens_obs <- MeanDims(obs, 2, na.rm = na.rm) + + # Create empty arrays + ratio <- array(dim = c(nexp = as.numeric(nexp), nobs = as.numeric(nobs))) # [nexp, nobs] + p.val <- array(dim = c(nexp = as.numeric(nexp), nobs = as.numeric(nobs))) # [nexp, nobs] + + for (jexp in 1:nexp) { + for (jobs in 1:nobs) { + + # spread and error + spread <- sqrt(mean(apply(exp[jexp,,], 2, var, na.rm = na.rm), na.rm = na.rm)) + error <- sqrt(mean((ens_obs - ens_exp[jexp,])^2, na.rm = na.rm)) + ratio[jexp, jobs] <- spread/error + + # effective sample size + enospr <- sum(Eno(apply(exp[jexp,,], 2, var, na.rm = na.rm), names(dim(exp))[3])) + enodif <- 
.Eno((ens_exp[jexp, ] - ens_obs[jobs, ])^2, na.action = na.pass) + if (pval) { + F <- (enospr[jexp] * spread[jexp]^2 / (enospr[jexp] - 1)) / (enodif * error^2 / (enodif - 1)) + if (!is.na(F) & !is.na(enospr[jexp]) & !is.na(enodif) & any(enospr > 2) & enodif > 2) { + p.val[jexp, jobs] <- pf(F, enospr[jexp] - 1, enodif - 1) + p.val[jexp, jobs] <- 2 * min(p.val[jexp, jobs], 1 - p.val[jexp, jobs]) + } else { + ratio[jexp, jobs] <- NA + } + } + } + } + + res <- list(ratio = ratio) + if (pval) {res$p.val <- p.val} + if (sign) {res$sign <- p.val <= alpha} + + return(res) +} + diff --git a/modules/Crossval/R/tmp/Utils.R b/modules/Crossval/R/tmp/Utils.R new file mode 100644 index 00000000..cd7a1e10 --- /dev/null +++ b/modules/Crossval/R/tmp/Utils.R @@ -0,0 +1,1885 @@ +#'@importFrom abind abind +#'@import plyr ncdf4 +#'@importFrom grDevices png jpeg pdf svg bmp tiff +#'@importFrom easyVerification convert2prob + +## Function to tell if a regexpr() match is a complete match to a specified name +.IsFullMatch <- function(x, name) { + x > 0 && attributes(x)$match.length == nchar(name) +} + +.ConfigReplaceVariablesInString <- function(string, replace_values, + allow_undefined_key_vars = FALSE) { + # This function replaces all the occurrences of a variable in a string by + # their corresponding string stored in the replace_values. + if (length(strsplit(string, "\\$")[[1]]) > 1) { + parts <- strsplit(string, "\\$")[[1]] + output <- "" + i <- 0 + for (part in parts) { + if (i %% 2 == 0) { + output <- paste0(output, part) + } else { + if (part %in% names(replace_values)) { + output <- paste0(output, + .ConfigReplaceVariablesInString(replace_values[[part]], + replace_values, + allow_undefined_key_vars)) + } else if (allow_undefined_key_vars) { + output <- paste0(output, "$", part, "$") + } else { + stop('Error: The variable $', part, + '$ was not defined in the configuration file.', sep = '') + } + } + i <- i + 1 + } + output + } else { + string + } +} + +.KnownLonNames <- function() { + known_lon_names <- c('lon', 'longitude', 'x', 'i', 'nav_lon') + return(known_lon_names) +} + +.KnownLatNames <- function() { + known_lat_names <- c('lat', 'latitude', 'y', 'j', 'nav_lat') + return(known_lat_names) +} + +.t2nlatlon <- function(t) { + ## As seen in cdo's griddes.c: ntr2nlat() + nlats <- (t * 3 + 1) / 2 + if ((nlats > 0) && (nlats - trunc(nlats) >= 0.5)) { + nlats <- ceiling(nlats) + } else { + nlats <- round(nlats) + } + if (nlats %% 2 > 0) { + nlats <- nlats + 1 + } + ## As seen in cdo's griddes.c: compNlon(), and as specified in ECMWF + nlons <- 2 * nlats + keep_going <- TRUE + while (keep_going) { + n <- nlons + if (n %% 8 == 0) n <- trunc(n / 8) + while (n %% 6 == 0) n <- trunc(n / 6) + while (n %% 5 == 0) n <- trunc(n / 5) + while (n %% 4 == 0) n <- trunc(n / 4) + while (n %% 3 == 0) n <- trunc(n / 3) + if (n %% 2 == 0) n <- trunc(n / 2) + if (n <= 8) { + keep_going <- FALSE + } else { + nlons <- nlons + 2 + if (nlons > 9999) { + stop("Error: pick another gaussian grid truncation. 
", + "It doesn't fulfill the standards to apply FFT.") + } + } + } + c(nlats, nlons) +} + +.nlat2t <- function(nlats) { + trunc((nlats * 2 - 1) / 3) +} + +.LoadDataFile <- function(work_piece, explore_dims = FALSE, silent = FALSE) { + # The purpose, working modes, inputs and outputs of this function are + # explained in ?LoadDataFile + #suppressPackageStartupMessages({library(ncdf4)}) + #suppressPackageStartupMessages({library(bigmemory)}) + #suppressPackageStartupMessages({library(plyr)}) + # Auxiliar function to convert array indices to lineal indices + arrayIndex2VectorIndex <- function(indices, dims) { + if (length(indices) > length(dims)) { + stop("Error: indices do not match dimensions in arrayIndex2VectorIndex.") + } + position <- 1 + dims <- rev(dims) + indices <- rev(indices) + for (i in seq_along(indices)) { + position <- position + (indices[i] - 1) * prod(dims[-(1:i)]) + } + position + } + + found_file <- NULL + dims <- NULL + grid_name <- units <- var_long_name <- NULL + is_2d_var <- array_across_gw <- NULL + data_across_gw <- NULL + + filename <- work_piece[['filename']] + namevar <- work_piece[['namevar']] + output <- work_piece[['output']] + # The names of all data files in the directory of the repository that match + # the pattern are obtained. + if (any(grep("^http", filename))) { + is_url <- TRUE + files <- filename + ## TODO: Check that the user is not using shell globbing exps. + } else { + is_url <- FALSE + files <- Sys.glob(filename) + } + + # If we don't find any, we leave the flag 'found_file' with a NULL value. + if (length(files) > 0) { + # The first file that matches the pattern is chosen and read. + filename <- head(files, 1) + filein <- filename + found_file <- filename + mask <- work_piece[['mask']] + + if (!silent && explore_dims) { + .message(paste("Exploring dimensions...", filename)) + ##} else { + ## cat(paste("* Reading & processing data...", filename, '\n')) + ##} + } + + # We will fill in 'expected_dims' with the names of the expected dimensions of + # the data array we'll retrieve from the file. + expected_dims <- NULL + remap_needed <- FALSE + # But first we open the file and work out whether the requested variable is 2d + fnc <- nc_open(filein) + if (!(namevar %in% names(fnc$var))) { + stop("Error: The variable", namevar, "is not defined in the file", filename) + } + var_long_name <- fnc$var[[namevar]]$longname + units <- fnc$var[[namevar]]$units + file_dimnames <- unlist(lapply(fnc$var[[namevar]][['dim']], '[[', 'name')) + # The following two 'ifs' are to allow for 'lon'/'lat' by default, instead of + # 'longitude'/'latitude'. 
+ if (!(work_piece[['dimnames']][['lon']] %in% file_dimnames) && + (work_piece[['dimnames']][['lon']] == 'longitude') && + ('lon' %in% file_dimnames)) { + work_piece[['dimnames']][['lon']] <- 'lon' + } + if (!(work_piece[['dimnames']][['lat']] %in% file_dimnames) && + (work_piece[['dimnames']][['lat']] == 'latitude') && + ('lat' %in% file_dimnames)) { + work_piece[['dimnames']][['lat']] <- 'lat' + } + if (is.null(work_piece[['is_2d_var']])) { + is_2d_var <- all(c(work_piece[['dimnames']][['lon']], + work_piece[['dimnames']][['lat']]) %in% + unlist(lapply(fnc$var[[namevar]][['dim']], + '[[', 'name'))) + } else { + is_2d_var <- work_piece[['is_2d_var']] + } + if ((is_2d_var || work_piece[['is_file_per_dataset']])) { + if (Sys.which("cdo")[[1]] == "") { + stop("Error: CDO libraries not available") + } + + cdo_version <- + strsplit(suppressWarnings( + system2("cdo", args = '-V', stderr = TRUE))[[1]], ' ')[[1]][5] + + cdo_version <- + as.numeric_version(unlist(strsplit(cdo_version, "[A-Za-z]", fixed = FALSE))[[1]]) + + } + # If the variable to load is 2-d, we need to determine whether: + # - interpolation is needed + # - subsetting is requested + if (is_2d_var) { + ## We read the longitudes and latitudes from the file. + lon <- ncvar_get(fnc, work_piece[['dimnames']][['lon']]) + lat <- ncvar_get(fnc, work_piece[['dimnames']][['lat']]) + first_lon_in_original_file <- lon[1] + # If a common grid is requested or we are exploring the file dimensions + # we need to read the grid type and size of the file to finally work out the + # CDO grid name. + if (!is.null(work_piece[['grid']]) || explore_dims) { + # Here we read the grid type and its number of longitudes and latitudes + file_info <- system(paste('cdo -s griddes', filein, '2> /dev/null'), intern = TRUE) + grids_positions <- grep('# gridID', file_info) + if (length(grids_positions) < 1) { + stop("The grid should be defined in the files.") + } + grids_first_lines <- grids_positions + 2 + grids_last_lines <- c((grids_positions - 2)[-1], length(file_info)) + grids_info <- as.list(seq_along(grids_positions)) + grids_info <- lapply(grids_info, + function (x) file_info[grids_first_lines[x]:grids_last_lines[x]]) + grids_info <- lapply(grids_info, function (x) gsub(" *", " ", x)) + grids_info <- lapply(grids_info, function (x) gsub("^ | $", "", x)) + grids_info <- lapply(grids_info, function (x) unlist(strsplit(x, " | = "))) + grids_types <- unlist(lapply(grids_info, function (x) x[grep('gridtype', x) + 1])) + grids_matches <- unlist(lapply(grids_info, function (x) { + nlons <- if (any(grep('xsize', x))) { + as.numeric(x[grep('xsize', x) + 1]) + } else { + NA + } + nlats <- if (any(grep('ysize', x))) { + as.numeric(x[grep('ysize', x) + 1]) + } else { + NA + } + result <- FALSE + if (!anyNA(c(nlons, nlats))) { + if ((nlons == length(lon)) && + (nlats == length(lat))) { + result <- TRUE + } + } + result + })) + grids_matches <- grids_matches[which(grids_types %in% c('gaussian', 'lonlat'))] + grids_info <- grids_info[which(grids_types %in% c('gaussian', 'lonlat'))] + grids_types <- grids_types[which(grids_types %in% c('gaussian', 'lonlat'))] + if (length(grids_matches) == 0) { + stop("Error: Only 'gaussian' and 'lonlat' grids supported. 
See e.g: cdo sinfo ", filename) + } + if (sum(grids_matches) > 1) { + if ((all(grids_types[which(grids_matches)] == 'gaussian') || + all(grids_types[which(grids_matches)] == 'lonlat')) && + all(unlist(lapply(grids_info[which(grids_matches)], identical, + grids_info[which(grids_matches)][[1]])))) { + grid_type <- grids_types[which(grids_matches)][1] + } else { + stop("Error: Load() can't disambiguate: ", + "More than one lonlat/gaussian grids with the same size as ", + "the requested variable defined in ", filename) + } + } else if (sum(grids_matches) == 1) { + grid_type <- grids_types[which(grids_matches)] + } else { + stop("Unexpected error.") + } + grid_lons <- length(lon) + grid_lats <- length(lat) + # Convert to CDO grid name as seen in cdo's griddes.c: nlat2ntr() + if (grid_type == 'lonlat') { + grid_name <- paste0('r', grid_lons, 'x', grid_lats) + } else { + grid_name <- paste0('t', .nlat2t(grid_lats), 'grid') + } + if (is.null(work_piece[['grid']])) { + .warning(paste0("Detect the grid type to be '", grid_name, "'. ", + "If it is not expected, assign parameter 'grid' to avoid wrong result.")) + } + } + # If a common grid is requested, we will also calculate its size which we will use + # later on. + if (!is.null(work_piece[['grid']])) { + # Now we calculate the common grid type and its lons and lats + if (any(grep('^t\\d{1,+}grid$', work_piece[['grid']]))) { + common_grid_type <- 'gaussian' + common_grid_res <- as.numeric(strsplit(work_piece[['grid']], '[^0-9]{1,+}')[[1]][2]) + nlonlat <- .t2nlatlon(common_grid_res) + common_grid_lats <- nlonlat[1] + common_grid_lons <- nlonlat[2] + } else if (any(grep('^r\\d{1,+}x\\d{1,+}$', work_piece[['grid']]))) { + common_grid_type <- 'lonlat' + common_grid_lons <- as.numeric(strsplit(work_piece[['grid']], '[^0-9]{1,+}')[[1]][2]) + common_grid_lats <- as.numeric(strsplit(work_piece[['grid']], '[^0-9]{1,+}')[[1]][3]) + } else { + stop("Error: Only supported grid types in parameter 'grid' are tgrid and rx") + } + } else { + ## If no 'grid' is specified, there is no common grid. + ## But these variables are filled in for consistency in the code. 
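+        ## In that case the file's own grid size is taken, so the longitude
+        ## spacing and subsetting checks below still have values to work with.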
+ common_grid_lons <- length(lon) + common_grid_lats <- length(lat) + } + first_common_grid_lon <- 0 + last_common_grid_lon <- 360 - 360 / common_grid_lons + ## This is not true for gaussian grids or for some regular grids, but + ## is a safe estimation + first_common_grid_lat <- -90 + last_common_grid_lat <- 90 + # And finally determine whether interpolation is needed or not + remove_shift <- FALSE + if (!is.null(work_piece[['grid']])) { + if ((grid_lons != common_grid_lons) || + (grid_lats != common_grid_lats) || + (grid_type != common_grid_type) || + (lon[1] != first_common_grid_lon)) { + if (grid_lons == common_grid_lons && grid_lats == common_grid_lats && + grid_type == common_grid_type && lon[1] != first_common_grid_lon) { + remove_shift <- TRUE + } + remap_needed <- TRUE + common_grid_name <- work_piece[['grid']] + } + } else if ((lon[1] != first_common_grid_lon) && explore_dims && + !work_piece[['single_dataset']]) { + remap_needed <- TRUE + common_grid_name <- grid_name + remove_shift <- TRUE + } + if (remap_needed && (work_piece[['remap']] == 'con') && + (cdo_version >= as.numeric_version('1.7.0'))) { + work_piece[['remap']] <- 'ycon' + } + if (remove_shift && !explore_dims) { + if (!is.null(work_piece[['progress_amount']])) { + cat("\n") + } + .warning(paste0("The dataset with index ", + tail(work_piece[['indices']], 1), " in '", + work_piece[['dataset_type']], + "' doesn't start at longitude 0 and will be re-interpolated in order ", + "to align its longitudes with the standard CDO grids definable with ", + "the names 'tgrid' or 'rx', which are by definition ", + "starting at the longitude 0.\n")) + if (!is.null(mask)) { + .warning(paste0("A mask was provided for the dataset with index ", + tail(work_piece[['indices']], 1), " in '", + work_piece[['dataset_type']], + "'. This dataset has been re-interpolated to align its longitudes to ", + "start at 0. You must re-interpolate the corresponding mask to align ", + "its longitudes to start at 0 as well, if you haven't done so yet. ", + "Running cdo remapcon,", common_grid_name, + " original_mask_file.nc new_mask_file.nc will fix it.\n")) + } + } + if (remap_needed && (grid_lons < common_grid_lons || grid_lats < common_grid_lats)) { + if (!is.null(work_piece[['progress_amount']])) { + cat("\n") + } + if (!explore_dims) { + .warning(paste0("The dataset with index ", tail(work_piece[['indices']], 1), + " in '", work_piece[['dataset_type']], "' is originally on ", + "a grid coarser than the common grid and it has been ", + "extrapolated. Check the results carefully. 
It is ", + "recommended to specify as common grid the coarsest grid ", + "among all requested datasets via the parameter 'grid'.\n")) + } + } + # Now calculate if the user requests for a lonlat subset or for the + # entire field + lonmin <- work_piece[['lon_limits']][1] + lonmax <- work_piece[['lon_limits']][2] + latmin <- work_piece[['lat_limits']][1] + latmax <- work_piece[['lat_limits']][2] + lon_subsetting_requested <- FALSE + lonlat_subsetting_requested <- FALSE + if (lonmin <= lonmax) { + if ((lonmin > first_common_grid_lon) || (lonmax < last_common_grid_lon)) { + lon_subsetting_requested <- TRUE + } + } else { + if ((lonmin - lonmax) > 360 / common_grid_lons) { + lon_subsetting_requested <- TRUE + } else { + gap_width <- floor(lonmin / (360 / common_grid_lons)) - + floor(lonmax / (360 / common_grid_lons)) + if (gap_width > 0) { + if (!(gap_width == 1 && (lonmin %% (360 / common_grid_lons) == 0) && + (lonmax %% (360 / common_grid_lons) == 0))) { + lon_subsetting_requested <- TRUE + } + } + } + } + if ((latmin > first_common_grid_lat) || (latmax < last_common_grid_lat) + || (lon_subsetting_requested)) { + lonlat_subsetting_requested <- TRUE + } + # Now that we know if subsetting was requested, we can say if final data + # will go across greenwich + if (lonmax < lonmin) { + data_across_gw <- TRUE + } else { + data_across_gw <- !lon_subsetting_requested + } + + # When remap is needed but no subsetting, the file is copied locally + # so that cdo works faster, and then interpolated. + # Otherwise the file is kept as is and the subset will have to be + # interpolated still. + if (!lonlat_subsetting_requested && remap_needed) { + nc_close(fnc) + filecopy <- tempfile(pattern = "load", fileext = ".nc") + file.copy(filein, filecopy) + filein <- tempfile(pattern = "loadRegridded", fileext = ".nc") + # "-L" is to serialize I/O accesses. It prevents potential segmentation fault in the + # underlying hdf5 library. + system(paste0("cdo -L -s remap", work_piece[['remap']], ",", + common_grid_name, + " -selname,", namevar, " ", filecopy, " ", filein, + " 2>/dev/null")) + file.remove(filecopy) + work_piece[['dimnames']][['lon']] <- 'lon' + work_piece[['dimnames']][['lat']] <- 'lat' + fnc <- nc_open(filein) + lon <- ncvar_get(fnc, work_piece[['dimnames']][['lon']]) + lat <- ncvar_get(fnc, work_piece[['dimnames']][['lat']]) + } + + # Read and check also the mask + if (!is.null(mask)) { + ###mask_file <- tempfile(pattern = 'loadMask', fileext = '.nc') + if (is.list(mask)) { + if (!file.exists(mask[['path']])) { + stop("Error: Couldn't find the mask file", mask[['path']]) + } + mask_file <- mask[['path']] + ###file.copy(work_piece[['mask']][['path']], mask_file) + fnc_mask <- nc_open(mask_file) + vars_in_mask <- sapply(fnc_mask$var, '[[', 'name') + if ('nc_var_name' %in% names(mask)) { + if (!(mask[['nc_var_name']] %in% + vars_in_mask)) { + stop("Error: couldn't find variable", mask[['nc_var_name']], + "in the mask file", mask[['path']]) + } + } else { + if (length(vars_in_mask) != 1) { + stop("Error: one and only one non-coordinate variable should be ", + "defined in the mask file", + mask[['path']], + "if the component 'nc_var_name' is not specified. 
", + "Currently found: ", + toString(vars_in_mask), ".") + } else { + mask[['nc_var_name']] <- vars_in_mask + } + } + if (sum(fnc_mask$var[[mask[['nc_var_name']]]]$size > 1) != 2) { + stop("Error: the variable '", + mask[['nc_var_name']], + "' must be defined only over the dimensions '", + work_piece[['dimnames']][['lon']], "' and '", + work_piece[['dimnames']][['lat']], + "' in the mask file ", + mask[['path']]) + } + mask <- ncvar_get(fnc_mask, mask[['nc_var_name']], collapse_degen = TRUE) + nc_close(fnc_mask) + ### mask_lon <- ncvar_get(fnc_mask, work_piece[['dimnames']][['lon']]) + ### mask_lat <- ncvar_get(fnc_mask, work_piece[['dimnames']][['lat']]) + ###} else { + ### dim_longitudes <- ncdim_def(work_piece[['dimnames']][['lon']], "degrees_east", lon) + ### dim_latitudes <- ncdim_def(work_piece[['dimnames']][['lat']], "degrees_north", lat) + ### ncdf_var <- ncvar_def('LSM', "", list(dim_longitudes, dim_latitudes), NA, 'double') + ### fnc_mask <- nc_create(mask_file, list(ncdf_var)) + ### ncvar_put(fnc_mask, ncdf_var, work_piece[['mask']]) + ### nc_close(fnc_mask) + ### fnc_mask <- nc_open(mask_file) + ### work_piece[['mask']] <- list(path = mask_file, nc_var_name = 'LSM') + ### mask_lon <- lon + ### mask_lat <- lat + ###} + ###} + ### Now ready to check that the mask is right + ##if (!(lonlat_subsetting_requested && remap_needed)) { + ### if ((dim(mask)[2] != length(lon)) || (dim(mask)[1] != length(lat))) { + ### stop(paste("Error: the mask of the dataset with index ", + ### tail(work_piece[['indices']], 1), " in '", + ### work_piece[['dataset_type']], "' is wrong. ", + ### "It must be on the common grid if the selected output type is 'lonlat', ", + ### "'lon' or 'lat', or 'areave' and 'grid' has been specified. It must be on ", + ### "the grid of the corresponding dataset if the selected output type is ", + ### "'areave' and no 'grid' has been specified. For more information ", + ### "check ?Load and see help on parameters 'grid', 'maskmod' and ", + ### "'maskobs'.", sep = "")) + ### } + ###if (!(identical(mask_lon, lon) && identical(mask_lat, lat))) { + ### stop(paste0("Error: the longitudes and latitudes in the masks must be ", + ### "identical to the ones in the corresponding data files if output = 'areave' ", + ### " or, if the selected output is 'lon', 'lat' or 'lonlat', the longitudes in ", + ### "the mask file must start by 0 and the latitudes must be ordered from ", + ### "highest to lowest. See\n ", + ### work_piece[['mask']][['path']], " and ", filein)) + ###} + } + } + + lon_indices <- seq_along(lon) + if (!(lonlat_subsetting_requested && remap_needed)) { + lon[which(lon < 0)] <- lon[which(lon < 0)] + 360 + } + if (lonmax >= lonmin) { + lon_indices <- lon_indices[which(((lon %% 360) >= lonmin) & ((lon %% 360) <= lonmax))] + } else if (!remap_needed) { + lon_indices <- lon_indices[which(((lon %% 360) <= lonmax) | ((lon %% 360) >= lonmin))] + } + lat_indices <- which(lat >= latmin & lat <= latmax) + ## In most of the cases the latitudes are ordered from -90 to 90. + ## We will reorder them to be in the order from 90 to -90, so mostly + ## always the latitudes are reordered. + ## TODO: This could be avoided in future. 
+ if (lat[1] < lat[length(lat)]) { + lat_indices <- lat_indices[rev(seq_along(lat_indices))] + } + if (!is.null(mask) && !(lonlat_subsetting_requested && remap_needed)) { + if ((dim(mask)[1] != length(lon)) || (dim(mask)[2] != length(lat))) { + stop("Error: the mask of the dataset with index ", tail(work_piece[['indices']], 1), + " in '", work_piece[['dataset_type']], "' is wrong. It must be on the ", + "common grid if the selected output type is 'lonlat', 'lon' or 'lat', ", + "or 'areave' and 'grid' has been specified. It must be on the grid of ", + "the corresponding dataset if the selected output type is 'areave' and ", + "no 'grid' has been specified. For more information check ?Load and see ", + "help on parameters 'grid', 'maskmod' and 'maskobs'.") + } + mask <- mask[lon_indices, lat_indices] + } + ## If the user requests subsetting, we must extend the lon and lat limits if possible + ## so that the interpolation after is done properly + maximum_extra_points <- work_piece[['remapcells']] + if (lonlat_subsetting_requested && remap_needed) { + if ((maximum_extra_points > (head(lon_indices, 1) - 1)) || + (maximum_extra_points > (length(lon) - tail(lon_indices, 1)))) { + ## if the requested number of points goes beyond the left or right + ## sides of the map, we need to take the entire map so that the + ## interpolation works properly + lon_indices <- seq_along(lon) + } else { + extra_points <- min(maximum_extra_points, head(lon_indices, 1) - 1) + if (extra_points > 0) { + lon_indices <- + c((head(lon_indices, 1) - extra_points):(head(lon_indices, 1) - 1), lon_indices) + } + extra_points <- min(maximum_extra_points, length(lon) - tail(lon_indices, 1)) + if (extra_points > 0) { + lon_indices <- c(lon_indices, + (tail(lon_indices, 1) + 1):(tail(lon_indices, 1) + extra_points)) + } + } + min_lat_ind <- min(lat_indices) + max_lat_ind <- max(lat_indices) + extra_points <- min(maximum_extra_points, min_lat_ind - 1) + if (extra_points > 0) { + if (lat[1] < tail(lat, 1)) { + lat_indices <- c(lat_indices, (min_lat_ind - 1):(min_lat_ind - extra_points)) + } else { + lat_indices <- c((min_lat_ind - extra_points):(min_lat_ind - 1), lat_indices) + } + } + extra_points <- min(maximum_extra_points, length(lat) - max_lat_ind) + if (extra_points > 0) { + if (lat[1] < tail(lat, 1)) { + lat_indices <- c((max_lat_ind + extra_points):(max_lat_ind + 1), lat_indices) + } else { + lat_indices <- c(lat_indices, (max_lat_ind + 1):(max_lat_ind + extra_points)) + } + } + } + lon <- lon[lon_indices] + lat <- lat[lat_indices] + expected_dims <- c(work_piece[['dimnames']][['lon']], + work_piece[['dimnames']][['lat']]) + } else { + lon <- 0 + lat <- 0 + } + # We keep on filling the expected dimensions + var_dimnames <- unlist(lapply(fnc$var[[namevar]][['dim']], '[[', 'name')) + nmemb <- nltime <- NULL + ## Sometimes CDO renames 'members' dimension to 'lev' + old_members_dimname <- NULL + if (('lev' %in% var_dimnames) && !(work_piece[['dimnames']][['member']] %in% var_dimnames)) { + old_members_dimname <- work_piece[['dimnames']][['member']] + work_piece[['dimnames']][['member']] <- 'lev' + } + if (work_piece[['dimnames']][['member']] %in% var_dimnames) { + nmemb <- fnc$var[[namevar]][['dim']][[match(work_piece[['dimnames']][['member']], + var_dimnames)]]$len + expected_dims <- c(expected_dims, work_piece[['dimnames']][['member']]) + } else { + nmemb <- 1 + } + if (length(expected_dims) > 0) { + dim_matches <- match(expected_dims, var_dimnames) + if (anyNA(dim_matches)) { + if (!is.null(old_members_dimname)) { + 
expected_dims[which(expected_dims == 'lev')] <- old_members_dimname + } + stop("Error: the expected dimension(s)", + toString(expected_dims[which(is.na(dim_matches))]), + "were not found in", filename) + } + time_dimname <- var_dimnames[-dim_matches] + } else { + time_dimname <- var_dimnames + } + if (length(time_dimname) > 0) { + if (length(time_dimname) == 1) { + nltime <- fnc$var[[namevar]][['dim']][[match(time_dimname, var_dimnames)]]$len + expected_dims <- c(expected_dims, time_dimname) + dim_matches <- match(expected_dims, var_dimnames) + } else { + if (!is.null(old_members_dimname)) { + expected_dims[which(expected_dims == 'lev')] <- old_members_dimname + } + stop("Error: the variable ", namevar, + " is defined over more dimensions than the expected (", + toString(c(expected_dims, 'time')), + "). It could also be that the members, longitude or latitude ", + "dimensions are named incorrectly. In that case, either rename ", + "the dimensions in the file or adjust Load() to recognize the actual ", + "name with the parameter 'dimnames'. See file ", filename) + } + } else { + nltime <- 1 + } + + # Now we must retrieve the data from the file, but only the asked indices. + # So we build up the indices to retrieve. + # Longitudes or latitudes have been retrieved already. + if (explore_dims) { + # If we're exploring the file we only want one time step from one member, + # to regrid it and work out the number of longitudes and latitudes. + # We don't need more. + members <- 1 + ltimes_list <- list(1) + } else { + # The data is arranged in the array 'tmp' with the dimensions in a + # common order: + # 1) Longitudes + # 2) Latitudes + # 3) Members (even if is not a file per member experiment) + # 4) Lead-times + if (work_piece[['is_file_per_dataset']]) { + time_indices <- 1:nltime + mons <- strsplit(system(paste('cdo showmon ', filein, + ' 2>/dev/null'), intern = TRUE), split = ' ') + years <- strsplit(system(paste('cdo showyear ', filein, + ' 2>/dev/null'), intern = TRUE), split = ' ') + mons <- as.numeric(mons[[1]][which(mons[[1]] != "")]) + years <- as.numeric(years[[1]][which(years[[1]] != "")]) + time_indices <- ts(time_indices, start = c(years[1], mons[1]), + end = c(years[length(years)], mons[length(mons)]), + frequency = 12) + ltimes_list <- list() + for (sdate in work_piece[['startdates']]) { + selected_time_indices <- window(time_indices, start = c(as.numeric( + substr(sdate, 1, 4)), as.numeric(substr(sdate, 5, 6))), + end = c(3000, 12), frequency = 12, extend = TRUE) + selected_time_indices <- selected_time_indices[work_piece[['leadtimes']]] + ltimes_list <- c(ltimes_list, list(selected_time_indices)) + } + } else { + ltimes <- work_piece[['leadtimes']] + #if (work_piece[['dataset_type']] == 'exp') { + ltimes_list <- list(ltimes[which(ltimes <= nltime)]) + #} + } + ## TODO: Put, when reading matrices, this kind of warnings + # if (nmember < nmemb) { + # cat("Warning: + members <- 1:work_piece[['nmember']] + members <- members[which(members <= nmemb)] + } + + # Now, for each list of leadtimes to load (usually only one list with all leadtimes), + # we'll join the indices and retrieve data + found_disordered_dims <- FALSE + for (ltimes in ltimes_list) { + if (is_2d_var) { + start <- c(min(lon_indices), min(lat_indices)) + end <- c(max(lon_indices), max(lat_indices)) + if (lonlat_subsetting_requested && remap_needed) { + subset_indices <- list(min(lon_indices):max(lon_indices) - min(lon_indices) + 1, + lat_indices - min(lat_indices) + 1) + dim_longitudes <- 
ncdim_def(work_piece[['dimnames']][['lon']], "degrees_east", lon) + dim_latitudes <- ncdim_def(work_piece[['dimnames']][['lat']], "degrees_north", lat) + ncdf_dims <- list(dim_longitudes, dim_latitudes) + } else { + subset_indices <- list(lon_indices - min(lon_indices) + 1, + lat_indices - min(lat_indices) + 1) + ncdf_dims <- list() + } + final_dims <- c(length(subset_indices[[1]]), length(subset_indices[[2]]), 1, 1) + } else { + start <- end <- NULL + subset_indices <- list() + ncdf_dims <- list() + final_dims <- c(1, 1, 1, 1) + } + + if (work_piece[['dimnames']][['member']] %in% expected_dims) { + start <- c(start, head(members, 1)) + end <- c(end, tail(members, 1)) + subset_indices <- c(subset_indices, list(members - head(members, 1) + 1)) + dim_members <- ncdim_def(work_piece[['dimnames']][['member']], "", members) + ncdf_dims <- c(ncdf_dims, list(dim_members)) + final_dims[3] <- length(members) + } + if (time_dimname %in% expected_dims) { + if (!all(is.na(ltimes))) { + start <- c(start, head(ltimes[which(!is.na(ltimes))], 1)) + end <- c(end, tail(ltimes[which(!is.na(ltimes))], 1)) + subset_indices <- c(subset_indices, + list(ltimes - head(ltimes[which(!is.na(ltimes))], 1) + 1)) + } else { + start <- c(start, NA) + end <- c(end, NA) + subset_indices <- c(subset_indices, list(ltimes)) + } + dim_time <- ncdim_def(time_dimname, "", seq_along(ltimes), unlim = TRUE) + ncdf_dims <- c(ncdf_dims, list(dim_time)) + final_dims[4] <- length(ltimes) + } + count <- end - start + 1 + start <- start[dim_matches] + count <- count[dim_matches] + subset_indices <- subset_indices[dim_matches] + # Now that we have the indices to retrieve, we retrieve the data + if (prod(final_dims) > 0) { + tmp <- take(ncvar_get(fnc, namevar, start, count, + collapse_degen = FALSE), + seq_along(subset_indices), subset_indices) + # The data is regridded if it corresponds to an atmospheric variable. When + # the chosen output type is 'areave' the data is not regridded to not + # waste computing time unless the user specified a common grid. 
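+      # A note on the packing arithmetic applied below: following the
+      # NetCDF/CF convention, ncvar_get() returns unpacked values,
+      #   unpacked = packed * scale_factor + add_offset
+      # so before writing the temporary file the inverse transform,
+      #   packed = (unpacked - add_offset) / scale_factor
+      # is applied, and the attributes are re-attached to the new file.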
+ if (is_2d_var) { + ###if (!is.null(work_piece[['mask']]) && !(lonlat_subsetting_requested && remap_needed)) { + ### mask <- take(ncvar_get(fnc_mask, work_piece[['mask']][['nc_var_name']], + ### start[dim_matches[1:2]], count[dim_matches[1:2]], + ### collapse_degen = FALSE), 1:2, subset_indices[dim_matches[1:2]]) + ###} + if (lonlat_subsetting_requested && remap_needed) { + filein <- tempfile(pattern = "loadRegridded", fileext = ".nc") + filein2 <- tempfile(pattern = "loadRegridded2", fileext = ".nc") + ncdf_var <- ncvar_def(namevar, "", ncdf_dims[dim_matches], + fnc$var[[namevar]]$missval, + prec = if (fnc$var[[namevar]]$prec == 'int') { + 'integer' + } else { + fnc$var[[namevar]]$prec + }) + scale_factor <- ifelse(fnc$var[[namevar]]$hasScaleFact, fnc$var[[namevar]]$scaleFact, 1) + add_offset <- ifelse(fnc$var[[namevar]]$hasAddOffset, fnc$var[[namevar]]$addOffset, 0) + if (fnc$var[[namevar]]$hasScaleFact || fnc$var[[namevar]]$hasAddOffset) { + tmp <- (tmp - add_offset) / scale_factor + } + #nc_close(fnc) + fnc2 <- nc_create(filein2, list(ncdf_var)) + ncvar_put(fnc2, ncdf_var, tmp) + if (add_offset != 0) { + ncatt_put(fnc2, ncdf_var, 'add_offset', add_offset) + } + if (scale_factor != 1) { + ncatt_put(fnc2, ncdf_var, 'scale_factor', scale_factor) + } + nc_close(fnc2) + system(paste0("cdo -L -s -sellonlatbox,", if (lonmin > lonmax) { + "0,360," + } else { + paste0(lonmin, ",", lonmax, ",") + }, latmin, ",", latmax, + " -remap", work_piece[['remap']], ",", common_grid_name, + " ", filein2, " ", filein, " 2>/dev/null")) + file.remove(filein2) + fnc2 <- nc_open(filein) + sub_lon <- ncvar_get(fnc2, 'lon') + sub_lat <- ncvar_get(fnc2, 'lat') + ## We read the longitudes and latitudes from the file. + ## In principle cdo should put in order the longitudes + ## and slice them properly unless data is across greenwich + sub_lon[which(sub_lon < 0)] <- sub_lon[which(sub_lon < 0)] + 360 + sub_lon_indices <- seq_along(sub_lon) + if (lonmax < lonmin) { + sub_lon_indices <- sub_lon_indices[which((sub_lon <= lonmax) | (sub_lon >= lonmin))] + } + sub_lat_indices <- seq_along(sub_lat) + ## In principle cdo should put in order the latitudes + if (sub_lat[1] < sub_lat[length(sub_lat)]) { + sub_lat_indices <- rev(seq_along(sub_lat)) + } + final_dims[c(1, 2)] <- c(length(sub_lon_indices), length(sub_lat_indices)) + subset_indices[[dim_matches[1]]] <- sub_lon_indices + subset_indices[[dim_matches[2]]] <- sub_lat_indices + + tmp <- take(ncvar_get(fnc2, namevar, collapse_degen = FALSE), + seq_along(subset_indices), subset_indices) + + if (!is.null(mask)) { + ## We create a very simple 2d netcdf file that is then interpolated to the common + ## grid to know what are the lons and lats of our slice of data + mask_file <- tempfile(pattern = 'loadMask', fileext = '.nc') + mask_file_remap <- tempfile(pattern = 'loadMask', fileext = '.nc') + dim_longitudes <- ncdim_def(work_piece[['dimnames']][['lon']], + "degrees_east", c(0, 360)) + dim_latitudes <- ncdim_def(work_piece[['dimnames']][['lat']], + "degrees_north", c(-90, 90)) + ncdf_var <- ncvar_def('LSM', "", list(dim_longitudes, dim_latitudes), NA, 'double') + fnc_mask <- nc_create(mask_file, list(ncdf_var)) + ncvar_put(fnc_mask, ncdf_var, array(rep(0, 4), dim = c(2, 2))) + nc_close(fnc_mask) + system(paste0("cdo -L -s remap", work_piece[['remap']], ",", + common_grid_name, + " ", mask_file, " ", mask_file_remap, " 2>/dev/null")) + fnc_mask <- nc_open(mask_file_remap) + mask_lons <- ncvar_get(fnc_mask, 'lon') + mask_lats <- ncvar_get(fnc_mask, 'lat') + nc_close(fnc_mask) 
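+            ## The mask index selections below must handle regions that
+            ## cross Greenwich (lonmax < lonmin). A toy sketch of that case:
+            ##   lons <- c(350, 355, 0, 5, 10); lonmin <- 350; lonmax <- 10
+            ##   which((lons >= lonmin) | (lons <= lonmax))   # 1 2 3 4 5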
+ file.remove(mask_file, mask_file_remap) + if ((dim(mask)[1] != common_grid_lons) || (dim(mask)[2] != common_grid_lats)) { + stop("Error: the mask of the dataset with index ", + tail(work_piece[['indices']], 1), " in '", + work_piece[['dataset_type']], + "' is wrong. It must be on the common grid if the ", + "selected output type is 'lonlat', 'lon' or 'lat', ", + "or 'areave' and 'grid' has been specified. It must ", + "be on the grid of the corresponding dataset if the ", + "selected output type is 'areave' and no 'grid' has been ", + "specified. For more information check ?Load and see help ", + "on parameters 'grid', 'maskmod' and 'maskobs'.") + } + mask_lons[which(mask_lons < 0)] <- mask_lons[which(mask_lons < 0)] + 360 + if (lonmax >= lonmin) { + mask_lon_indices <- which((mask_lons >= lonmin) & (mask_lons <= lonmax)) + } else { + mask_lon_indices <- which((mask_lons >= lonmin) | (mask_lons <= lonmax)) + } + mask_lat_indices <- which((mask_lats >= latmin) & (mask_lats <= latmax)) + if (sub_lat[1] < sub_lat[length(sub_lat)]) { + mask_lat_indices <- mask_lat_indices[rev(seq_along(mask_lat_indices))] + } + mask <- mask[mask_lon_indices, mask_lat_indices] + } + sub_lon <- sub_lon[sub_lon_indices] + sub_lat <- sub_lat[sub_lat_indices] + ### nc_close(fnc_mask) + ### system(paste0("cdo -s -sellonlatbox,", if (lonmin > lonmax) { + ### "0,360," + ### } else { + ### paste0(lonmin, ",", lonmax, ",") + ### }, latmin, ",", latmax, + ### " -remap", work_piece[['remap']], ",", common_grid_name, + ###This is wrong: same files + ### " ", mask_file, " ", mask_file, " 2>/dev/null", sep = "")) + ### fnc_mask <- nc_open(mask_file) + ### mask <- take(ncvar_get(fnc_mask, work_piece[['mask']][['nc_var_name']], + ### collapse_degen = FALSE), 1:2, subset_indices[dim_matches[1:2]]) + ###} + } + } + if (is.unsorted(dim_matches)) { + if (!found_disordered_dims && + rev(work_piece[['indices']])[2] == 1 && + rev(work_piece[['indices']])[3] == 1) { + found_disordered_dims <- TRUE + .warning(paste0("The dimensions for the variable ", namevar, + " in the files of the experiment with index ", + tail(work_piece[['indices']], 1), + " are not in the optimal order for loading with Load(). ", + "The optimal order would be '", + toString(expected_dims), + "'. One of the files of the dataset is stored in ", filename)) + } + tmp <- aperm(tmp, dim_matches) + } + dim(tmp) <- final_dims + # If we are exploring the file we don't need to process and arrange + # the retrieved data. We only need to keep the dimension sizes. + if (is_2d_var && lonlat_subsetting_requested && remap_needed) { + final_lons <- sub_lon + final_lats <- sub_lat + } else { + final_lons <- lon + final_lats <- lat + } + if (explore_dims) { + if (work_piece[['is_file_per_member']]) { + ## TODO: When the exp_full_path contains asterisks and is file_per_member + ## members from different datasets may be accounted. + ## Also if one file member is missing the accounting will be wrong. + ## Should parse the file name and extract number of members. + if (is_url) { + nmemb <- NULL + } else { + nmemb <- length(files) + } + } + dims <- list(member = nmemb, ftime = nltime, lon = final_lons, lat = final_lats) + } else { + # If we are not exploring, then we have to process the retrieved data + if (is_2d_var) { + tmp <- apply(tmp, c(3, 4), function(x) { + # Disable of large values. 
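+          # More precisely, this per-field function masks values outside
+          # 'var_limits' and points where the land-sea mask is below 0.5,
+          # then reduces the lon-lat field according to 'output': an
+          # area-weighted mean ('areave'), latitude- or longitude-averaged
+          # profiles ('lon'/'lat'), or the full field rounded to 5
+          # significant digits ('lonlat').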
+ if (!is.na(work_piece[['var_limits']][2])) { + x[which(x > work_piece[['var_limits']][2])] <- NA + } + if (!is.na(work_piece[['var_limits']][1])) { + x[which(x < work_piece[['var_limits']][1])] <- NA + } + if (!is.null(mask)) { + x[which(mask < 0.5)] <- NA + } + + if (output == 'areave' || output == 'lon') { + weights <- InsertDim(cos(final_lats * pi / 180), 1, + length(final_lons), name = 'lon') + weights[which(is.na(x))] <- NA + if (output == 'areave') { + weights <- weights / mean(weights, na.rm = TRUE) + mean(x * weights, na.rm = TRUE) + } else { + weights <- weights / InsertDim(MeanDims(weights, 2, na.rm = TRUE), 2, + length(final_lats), name = 'lat') + MeanDims(x * weights, 2, na.rm = TRUE) + } + } else if (output == 'lat') { + MeanDims(x, 1, na.rm = TRUE) + } else if (output == 'lonlat') { + signif(x, 5) + } + }) + if (output == 'areave') { + dim(tmp) <- c(1, 1, final_dims[3:4]) + } else if (output == 'lon') { + dim(tmp) <- c(final_dims[1], 1, final_dims[3:4]) + } else if (output == 'lat') { + dim(tmp) <- c(1, final_dims[c(2, 3, 4)]) + } else if (output == 'lonlat') { + dim(tmp) <- final_dims + } + } + var_data <- attach.big.matrix(work_piece[['out_pointer']]) + if (work_piece[['dims']][['member']] > 1 && nmemb > 1 && + work_piece[['dims']][['ftime']] > 1 && + nltime < work_piece[['dims']][['ftime']]) { + work_piece[['indices']][2] <- work_piece[['indices']][2] - 1 + for (jmemb in members) { + work_piece[['indices']][2] <- work_piece[['indices']][2] + 1 + out_position <- arrayIndex2VectorIndex(work_piece[['indices']], work_piece[['dims']]) + out_indices <- out_position:(out_position + length(tmp[, , jmemb, ]) - 1) + var_data[out_indices] <- as.vector(tmp[, , jmemb, ]) + } + work_piece[['indices']][2] <- work_piece[['indices']][2] - tail(members, 1) + 1 + } else { + out_position <- arrayIndex2VectorIndex(work_piece[['indices']], work_piece[['dims']]) + out_indices <- out_position:(out_position + length(tmp) - 1) + a <- aperm(tmp, c(1, 2, 4, 3)) + as.vector(a) + var_data[out_indices] <- as.vector(aperm(tmp, c(1, 2, 4, 3))) + } + work_piece[['indices']][3] <- work_piece[['indices']][3] + 1 + } + } + } + nc_close(fnc) + if (is_2d_var) { + if (remap_needed) { + array_across_gw <- FALSE + file.remove(filein) + ###if (!is.null(mask) && lonlat_subsetting_requested) { + ### file.remove(mask_file) + ###} + } else { + if (first_lon_in_original_file < 0) { + array_across_gw <- data_across_gw + } else { + array_across_gw <- FALSE + } + } + } + } + if (explore_dims) { + list(dims = dims, is_2d_var = is_2d_var, grid = grid_name, + units = units, var_long_name = var_long_name, + data_across_gw = data_across_gw, array_across_gw = array_across_gw) + } else { + ###if (!silent && !is.null(progress_connection) && !is.null(work_piece[['progress_amount']])) { + ### foobar <- writeBin(work_piece[['progress_amount']], progress_connection) + ###} + if (!silent && !is.null(work_piece[['progress_amount']])) { + message(work_piece[['progress_amount']], appendLF = FALSE) + } + found_file + } +} + +.LoadSampleData <- function(var, exp = NULL, obs = NULL, sdates, + nmember = NULL, nmemberobs = NULL, + nleadtime = NULL, leadtimemin = 1, + leadtimemax = NULL, storefreq = 'monthly', + sampleperiod = 1, lonmin = 0, lonmax = 360, + latmin = -90, latmax = 90, output = 'areave', + method = 'conservative', grid = NULL, + maskmod = vector("list", 15), + maskobs = vector("list", 15), + configfile = NULL, suffixexp = NULL, + suffixobs = NULL, varmin = NULL, varmax = NULL, + silent = FALSE, nprocs = NULL) { + ## This 
function loads and selects sample data stored in sampleMap and + ## sampleTimeSeries and is used in the examples instead of Load() so as + ## to avoid nco and cdo system calls and computation time in the stage + ## of running examples in the CHECK process on CRAN. + selected_start_dates <- match(sdates, c('19851101', '19901101', '19951101', + '20001101', '20051101')) + start_dates_position <- 3 + lead_times_position <- 4 + + if (output == 'lonlat') { + sampleData <- s2dv::sampleMap + if (is.null(leadtimemax)) { + leadtimemax <- dim(sampleData$mod)[lead_times_position] + } + selected_lead_times <- leadtimemin:leadtimemax + + dataOut <- sampleData + dataOut$mod <- sampleData$mod[, , selected_start_dates, selected_lead_times, , ] + dataOut$obs <- sampleData$obs[, , selected_start_dates, selected_lead_times, , ] + } else if (output == 'areave') { + sampleData <- s2dv::sampleTimeSeries + if (is.null(leadtimemax)) { + leadtimemax <- dim(sampleData$mod)[lead_times_position] + } + selected_lead_times <- leadtimemin:leadtimemax + + dataOut <- sampleData + dataOut$mod <- sampleData$mod[, , selected_start_dates, selected_lead_times] + dataOut$obs <- sampleData$obs[, , selected_start_dates, selected_lead_times] + } + + dims_out <- dim(sampleData$mod) + dims_out[start_dates_position] <- length(selected_start_dates) + dims_out[lead_times_position] <- length(selected_lead_times) + dim(dataOut$mod) <- dims_out + + dims_out <- dim(sampleData$obs) + dims_out[start_dates_position] <- length(selected_start_dates) + dims_out[lead_times_position] <- length(selected_lead_times) + dim(dataOut$obs) <- dims_out + + invisible(list(mod = dataOut$mod, obs = dataOut$obs, + lat = dataOut$lat, lon = dataOut$lon)) +} + +.ConfigGetDatasetInfo <- function(matching_entries, table_name) { + # This function obtains the information of a dataset and variable pair, + # applying all the entries that match in the configuration file. + if (table_name == 'experiments') { + id <- 'EXP' + } else { + id <- 'OBS' + } + defaults <- c(paste0('$DEFAULT_', id, '_MAIN_PATH$'), + paste0('$DEFAULT_', id, '_FILE_PATH$'), + '$DEFAULT_NC_VAR_NAME$', '$DEFAULT_SUFFIX$', + '$DEFAULT_VAR_MIN$', '$DEFAULT_VAR_MAX$') + info <- NULL + + for (entry in matching_entries) { + if (is.null(info)) { + info <- entry[-1:-2] + info[which(info == '*')] <- defaults[which(info == '*')] + } else { + info[which(entry[-1:-2] != '*')] <- entry[-1:-2][which(entry[-1:-2] != '*')] + } + } + + info <- as.list(info) + names(info) <- c('main_path', 'file_path', 'nc_var_name', 'suffix', 'var_min', 'var_max') + info +} + +.ReplaceGlobExpressions <- function(path_with_globs, actual_path, + replace_values, tags_to_keep, + dataset_name, permissive) { + # The goal of this function is to replace the shell globbing expressions in + # a path pattern (that may contain shell globbing expressions and Load() + # tags) by the corresponding part of the real existing path. + # What is done actually is to replace all the values of the tags in the + # actual path by the corresponding $TAG$ + # + # It takes mainly two inputs. 
The path with expressions and tags, e.g.:
+  #   /data/experiments/*/$EXP_NAME$/$VAR_NAME$/$VAR_NAME$_*$START_DATE$*.nc
+  # and a complete known path to one of the matching files, e.g.:
+  #   /data/experiments/ecearth/i00k/tos/tos_fc0-1_19901101_199011-199110.nc
+  # and it returns the path pattern but without shell globbing expressions:
+  #   /data/experiments/ecearth/$EXP_NAME$/$VAR_NAME$/$VAR_NAME$_fc0-1_$START_DATE$_199011-199110.nc
+  #
+  # To do that, it also needs as inputs the list of replace values (the
+  # association of each tag to its value).
+  #
+  # All the tags not present in the parameter tags_to_keep will be replaced.
+  #
+  # Not all cases can be resolved with the implemented algorithm. In an
+  # unsolvable case a warning is given and one possible guess is returned.
+  #
+  # In some cases it is interesting to replace only the expressions in the
+  # path to the file, but not the ones in the file name itself. To keep the
+  # expressions in the file name, the parameter permissive can be set to
+  # TRUE. To replace all the expressions it can be set to FALSE.
+  clean <- function(x) {
+    if (nchar(x) > 0) {
+      x <- gsub('\\\\', '', x)
+      x <- gsub('\\^', '', x)
+      x <- gsub('\\$', '', x)
+      x <- unname(sapply(strsplit(x, '[', fixed = TRUE)[[1]], function(y) gsub('.*]', '.', y)))
+      do.call(paste0, as.list(x))
+    } else {
+      x
+    }
+  }
+
+  strReverse <- function(x) sapply(lapply(strsplit(x, NULL), rev), paste, collapse = "")
+
+  if (permissive) {
+    actual_path_chunks <- strsplit(actual_path, '/')[[1]]
+    actual_path <- paste(actual_path_chunks[-length(actual_path_chunks)], collapse = '/')
+    file_name <- tail(actual_path_chunks, 1)
+    if (length(actual_path_chunks) > 1) {
+      file_name <- paste0('/', file_name)
+    }
+    path_with_globs_chunks <- strsplit(path_with_globs, '/')[[1]]
+    path_with_globs <- paste(path_with_globs_chunks[-length(path_with_globs_chunks)],
+                             collapse = '/')
+    path_with_globs <- .ConfigReplaceVariablesInString(path_with_globs, replace_values)
+    file_name_with_globs <- tail(path_with_globs_chunks, 1)
+    if (length(path_with_globs_chunks) > 1) {
+      file_name_with_globs <- paste0('/', file_name_with_globs)
+    }
+    right_known <- head(strsplit(file_name_with_globs, '*', fixed = TRUE)[[1]], 1)
+    right_known_no_tags <- .ConfigReplaceVariablesInString(right_known, replace_values)
+    path_with_globs_rx <- utils::glob2rx(paste0(path_with_globs, right_known_no_tags))
+    match <- regexpr(gsub('$', '', path_with_globs_rx, fixed = TRUE),
+                     paste0(actual_path, file_name))
+    if (match != 1) {
+      stop("Incorrect parameters to replace glob expressions. 
", + "The path with expressions does not match the actual path.") + } + if (attr(match, 'match.length') - nchar(right_known_no_tags) < nchar(actual_path)) { + path_with_globs <- paste0(path_with_globs, right_known_no_tags, '*') + file_name_with_globs <- sub(right_known, '/*', file_name_with_globs) + } + } + path_with_globs_rx <- utils::glob2rx(path_with_globs) + values_to_replace <- NULL + tags_to_replace_starts <- NULL + tags_to_replace_ends <- NULL + give_warning <- FALSE + for (tag in tags_to_keep) { + matches <- gregexpr(paste0('$', tag, '$'), path_with_globs_rx, fixed = TRUE)[[1]] + lengths <- attr(matches, 'match.length') + if (!(length(matches) == 1 && matches[1] == -1)) { + for (i in seq_along(matches)) { + left <- NULL + if (matches[i] > 1) { + left <- + .ConfigReplaceVariablesInString(substr(path_with_globs_rx, 1, + matches[i] - 1), replace_values) + left_known <- + strReverse(head(strsplit(strReverse(left), + strReverse('.*'), fixed = TRUE)[[1]], 1)) + } + right <- NULL + if ((matches[i] + lengths[i] - 1) < nchar(path_with_globs_rx)) { + right <- + .ConfigReplaceVariablesInString(substr(path_with_globs_rx, + matches[i] + lengths[i], + nchar(path_with_globs_rx)), + replace_values) + right_known <- head(strsplit(right, '.*', fixed = TRUE)[[1]], 1) + } + match_limits <- NULL + if (!is.null(left)) { + left_match <- regexpr(paste0(left, replace_values[[tag]], right_known), actual_path) + match_len <- attr(left_match, 'match.length') + left_match_limits <- + c(left_match + match_len - 1 - nchar(clean(right_known)) - + nchar(replace_values[[tag]]) + 1, + left_match + match_len - 1 - nchar(clean(right_known))) + if (!(left_match < 1)) { + match_limits <- left_match_limits + } + } + right_match <- NULL + if (!is.null(right)) { + right_match <- regexpr(paste0(left_known, replace_values[[tag]], right), actual_path) + match_len <- attr(right_match, 'match.length') + right_match_limits <- + c(right_match + nchar(clean(left_known)), + right_match + nchar(clean(left_known)) + + nchar(replace_values[[tag]]) - 1) + if (is.null(match_limits) && !(right_match < 1)) { + match_limits <- right_match_limits + } + } + if (!is.null(right_match) && !is.null(left_match)) { + if (!identical(right_match_limits, left_match_limits)) { + give_warning <- TRUE + } + } + if (is.null(match_limits)) { + stop("Too complex path pattern specified for ", dataset_name, + ". 
Specify a simpler path pattern for this dataset.") + } + values_to_replace <- c(values_to_replace, tag) + tags_to_replace_starts <- c(tags_to_replace_starts, match_limits[1]) + tags_to_replace_ends <- c(tags_to_replace_ends, match_limits[2]) + } + } + } + + if (length(tags_to_replace_starts) > 0) { + reorder <- sort(tags_to_replace_starts, index.return = TRUE) + tags_to_replace_starts <- reorder$x + values_to_replace <- values_to_replace[reorder$ix] + tags_to_replace_ends <- tags_to_replace_ends[reorder$ix] + while (length(values_to_replace) > 0) { + actual_path <- paste0(substr(actual_path, 1, head(tags_to_replace_starts, 1) - 1), + '$', head(values_to_replace, 1), '$', + substr(actual_path, head(tags_to_replace_ends, 1) + 1, + nchar(actual_path))) + extra_chars <- nchar(head(values_to_replace, 1)) + 2 - + (head(tags_to_replace_ends, 1) - + head(tags_to_replace_starts, 1) + 1) + values_to_replace <- values_to_replace[-1] + tags_to_replace_starts <- tags_to_replace_starts[-1] + tags_to_replace_ends <- tags_to_replace_ends[-1] + tags_to_replace_starts <- tags_to_replace_starts + extra_chars + tags_to_replace_ends <- tags_to_replace_ends + extra_chars + } + } + + if (give_warning) { + .warning(paste0("Too complex path pattern specified for ", dataset_name, + ". Double check carefully the '$Files' fetched for this dataset ", + "or specify a simpler path pattern.")) + } + + if (permissive) { + paste0(actual_path, file_name_with_globs) + } else { + actual_path + } +} + +.FindTagValue <- function(path_with_globs_and_tag, actual_path, tag) { + tag <- paste0('\\$', tag, '\\$') + path_with_globs_and_tag <- paste0('^', path_with_globs_and_tag, '$') + parts <- strsplit(path_with_globs_and_tag, '*', fixed = TRUE)[[1]] + parts <- as.list(grep(tag, parts, value = TRUE)) + longest_couples <- NULL + pos_longest_couples <- NULL + found_value <- NULL + for (i in seq_along(parts)) { + parts[[i]] <- strsplit(parts[[i]], tag)[[1]] + if (length(parts[[i]]) == 1) { + parts[[i]] <- c(parts[[i]], '') + } + len_parts <- sapply(parts[[i]], nchar) + len_couples <- len_parts[-length(len_parts)] + len_parts[2:length(len_parts)] + pos_longest_couples <- c(pos_longest_couples, which.max(len_couples)) + longest_couples <- c(longest_couples, max(len_couples)) + } + chosen_part <- which.max(longest_couples) + parts[[chosen_part]] <- + parts[[chosen_part]][pos_longest_couples[chosen_part]:(pos_longest_couples[chosen_part] + 1)] + if (nchar(parts[[chosen_part]][1]) >= nchar(parts[[chosen_part]][2])) { + if (nchar(parts[[chosen_part]][1]) > 0) { + matches <- gregexpr(parts[[chosen_part]][1], actual_path)[[1]] + if (length(matches) == 1) { + match_left <- matches + actual_path <- + substr(actual_path, match_left + attr(match_left, 'match.length'), nchar(actual_path)) + } + } + if (nchar(parts[[chosen_part]][2]) > 0) { + matches <- gregexpr(parts[[chosen_part]][2], actual_path)[[1]] + if (length(matches) == 1) { + match_right <- matches + found_value <- substr(actual_path, 0, match_right - 1) + } + } + } else { + if (nchar(parts[[chosen_part]][2]) > 0) { + matches <- gregexpr(parts[[chosen_part]][2], actual_path)[[1]] + if (length(matches) == 1) { + match_right <- matches + actual_path <- substr(actual_path, 0, match_right - 1) + } + } + if (nchar(parts[[chosen_part]][1]) > 0) { + matches <- gregexpr(parts[[chosen_part]][1], actual_path)[[1]] + if (length(matches) == 1) { + match_left <- matches + found_value <- + substr(actual_path, match_left + attr(match_left, 'match.length'), + nchar(actual_path)) + } + } + } + found_value 
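+  # Implicit return: the substring of 'actual_path' matched by the $TAG$
+  # placeholder, or NULL if no unambiguous match was found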
+} + +.FilterUserGraphicArgs <- function(excludedArgs, ...) { + # This function filter the extra graphical parameters passed by the user in + # a plot function, excluding the ones that the plot function uses by default. + # Each plot function has a different set of arguments that are not allowed to + # be modified. + args <- list(...) + userArgs <- list() + for (name in names(args)) { + if ((name != "") & !is.element(name, excludedArgs)) { + # If the argument has a name and it is not in the list of excluded + # arguments, then it is added to the list that will be used + userArgs[[name]] <- args[[name]] + } else { + .warning(paste0("the argument '", name, "' can not be + modified and the new value will be ignored")) + } + } + userArgs +} + +.SelectDevice <- function(fileout, width, height, units, res) { + # This function is used in the plot functions to check the extension of the + # files where the graphics will be stored and select the right R device to + # save them. + # If the vector of filenames ('fileout') has files with different + # extensions, then it will only accept the first one, changing all the rest + # of the filenames to use that extension. + + # We extract the extension of the filenames: '.png', '.pdf', ... + ext <- regmatches(fileout, regexpr("\\.[a-zA-Z0-9]*$", fileout)) + + if (length(ext) != 0) { + # If there is an extension specified, select the correct device + ## units of width and height set to accept inches + if (ext[1] == ".png") { + saveToFile <- function(fileout) { + png(filename = fileout, width = width, height = height, res = res, units = units) + } + } else if (ext[1] == ".jpeg") { + saveToFile <- function(fileout) { + jpeg(filename = fileout, width = width, height = height, res = res, units = units) + } + } else if (ext[1] %in% c(".eps", ".ps")) { + saveToFile <- function(fileout) { + postscript(file = fileout, width = width, height = height) + } + } else if (ext[1] == ".pdf") { + saveToFile <- function(fileout) { + pdf(file = fileout, width = width, height = height) + } + } else if (ext[1] == ".svg") { + saveToFile <- function(fileout) { + svg(filename = fileout, width = width, height = height) + } + } else if (ext[1] == ".bmp") { + saveToFile <- function(fileout) { + bmp(filename = fileout, width = width, height = height, res = res, units = units) + } + } else if (ext[1] == ".tiff") { + saveToFile <- function(fileout) { + tiff(filename = fileout, width = width, height = height, res = res, units = units) + } + } else { + .warning("file extension not supported, it will be used '.eps' by default.") + ## In case there is only one filename + fileout[1] <- sub("\\.[a-zA-Z0-9]*$", ".eps", fileout[1]) + ext[1] <- ".eps" + saveToFile <- function(fileout) { + postscript(file = fileout, width = width, height = height) + } + } + # Change filenames when necessary + if (any(ext != ext[1])) { + .warning(paste0("some extensions of the filenames provided in 'fileout' ", + "are not ", ext[1], + ". The extensions are being converted to ", ext[1], ".")) + fileout <- sub("\\.[a-zA-Z0-9]*$", ext[1], fileout) + } + } else { + # Default filenames when there is no specification + .warning("there are no extensions specified in the filenames, default to '.eps'") + fileout <- paste0(fileout, ".eps") + saveToFile <- postscript + } + + # return the correct function with the graphical device, and the correct + # filenames + list(fun = saveToFile, files = fileout) +} + +.message <- function(...) 
{ + # Function to use the 'message' R function with our custom settings + # Default: new line at end of message, indent to 0, exdent to 3, + # collapse to \n* + args <- list(...) + + ## In case we need to specify message arguments + if (!is.null(args[["appendLF"]])) { + appendLF <- args[["appendLF"]] + } else { + ## Default value in message function + appendLF <- TRUE + } + if (!is.null(args[["domain"]])) { + domain <- args[["domain"]] + } else { + ## Default value in message function + domain <- NULL + } + args[["appendLF"]] <- NULL + args[["domain"]] <- NULL + + ## To modify strwrap indent and exdent arguments + if (!is.null(args[["indent"]])) { + indent <- args[["indent"]] + } else { + indent <- 0 + } + if (!is.null(args[["exdent"]])) { + exdent <- args[["exdent"]] + } else { + exdent <- 3 + } + args[["indent"]] <- NULL + args[["exdent"]] <- NULL + + ## To modify paste collapse argument + if (!is.null(args[["collapse"]])) { + collapse <- args[["collapse"]] + } else { + collapse <- "\n*" + } + args[["collapse"]] <- NULL + + ## Message tag + if (!is.null(args[["tag"]])) { + tag <- args[["tag"]] + } else { + tag <- "* " + } + args[["tag"]] <- NULL + + tmp <- paste0(tag, + paste(strwrap(args, indent = indent, exdent = exdent), collapse = collapse)) + message(tmp, appendLF = appendLF, domain = domain) +} + +.warning <- function(...) { + # Function to use the 'warning' R function with our custom settings + # Default: no call information, indent to 0, exdent to 3, + # collapse to \n + args <- list(...) + + ## In case we need to specify warning arguments + if (!is.null(args[["call."]])) { + call <- args[["call."]] + } else { + ## Default: don't show info about the call where the warning came up + call <- FALSE + } + if (!is.null(args[["immediate."]])) { + immediate <- args[["immediate."]] + } else { + ## Default value in warning function + immediate <- FALSE + } + if (!is.null(args[["noBreaks."]])) { + noBreaks <- args[["noBreaks."]] + } else { + ## Default value warning function + noBreaks <- FALSE + } + if (!is.null(args[["domain"]])) { + domain <- args[["domain"]] + } else { + ## Default value warning function + domain <- NULL + } + args[["call."]] <- NULL + args[["immediate."]] <- NULL + args[["noBreaks."]] <- NULL + args[["domain"]] <- NULL + + ## To modify strwrap indent and exdent arguments + if (!is.null(args[["indent"]])) { + indent <- args[["indent"]] + } else { + indent <- 0 + } + if (!is.null(args[["exdent"]])) { + exdent <- args[["exdent"]] + } else { + exdent <- 3 + } + args[["indent"]] <- NULL + args[["exdent"]] <- NULL + + ## To modify paste collapse argument + if (!is.null(args[["collapse"]])) { + collapse <- args[["collapse"]] + } else { + collapse <- "\n!" + } + args[["collapse"]] <- NULL + + ## Warning tag + if (!is.null(args[["tag"]])) { + tag <- args[["tag"]] + } else { + tag <- "! Warning: " + } + args[["tag"]] <- NULL + + tmp <- paste0(tag, + paste(strwrap(args, indent = indent, exdent = exdent), collapse = collapse)) + warning(tmp, call. = call, immediate. = immediate, + noBreaks. = noBreaks, domain = domain) +} + +.IsColor <- function(x) { + res <- try(col2rgb(x), silent = TRUE) + return(!is(res, "try-error")) +} + +# This function switches to a specified figure at position (row, col) in a layout. +# This overcomes the bug in par(mfg = ...). However the mode par(new = TRUE) is +# activated, i.e., all drawn elements will be superimposed. 
Additionally, after +# using this function, the automatical pointing to the next figure in the layout +# will be spoiled: once the last figure in the layout is drawn, the pointer won't +# move to the first figure in the layout. +# Only figures with numbers other than 0 (when creating the layout) will be +# accessible. +# Inputs: either row and col, or n and mat +.SwitchToFigure <- function(row = NULL, col = NULL, n = NULL, mat = NULL) { + if (!is.null(n) && !is.null(mat)) { + if (!is.numeric(n) || length(n) != 1) { + stop("Parameter 'n' must be a single numeric value.") + } + n <- round(n) + if (!is.array(mat)) { + stop("Parameter 'mat' must be an array.") + } + target <- which(mat == n, arr.ind = TRUE)[1, ] + row <- target[1] + col <- target[2] + } else if (!is.null(row) && !is.null(col)) { + if (!is.numeric(row) || length(row) != 1) { + stop("Parameter 'row' must be a single numeric value.") + } + row <- round(row) + if (!is.numeric(col) || length(col) != 1) { + stop("Parameter 'col' must be a single numeric value.") + } + col <- round(col) + } else { + stop("Either 'row' and 'col' or 'n' and 'mat' must be provided.") + } + next_attempt <- c(row, col) + par(mfg = next_attempt) + i <- 1 + layout_size <- par('mfrow') + layout_cells <- matrix(1:prod(layout_size), layout_size[1], layout_size[2], + byrow = TRUE) + while (any((par('mfg')[1:2] != c(row, col)))) { + next_attempt <- which(layout_cells == i, arr.ind = TRUE)[1, ] + par(mfg = next_attempt) + i <- i + 1 + if (i > prod(layout_size)) { + stop("Figure not accessible.") + } + } + plot(0, type = 'n', axes = FALSE, ann = FALSE) + par(mfg = next_attempt) +} + +# Function to permute arrays of non-atomic elements (e.g. POSIXct) +.aperm2 <- function(x, new_order) { + old_dims <- dim(x) + attr_bk <- attributes(x) + if ('dim' %in% names(attr_bk)) { + attr_bk[['dim']] <- NULL + } + if (is.numeric(x)) { + x <- aperm(x, new_order) + } else { + y <- array(seq_along(x), dim = dim(x)) + y <- aperm(y, new_order) + x <- x[as.vector(y)] + } + dim(x) <- old_dims[new_order] + attributes(x) <- c(attributes(x), attr_bk) + x +} + +# This function is a helper for the function .MergeArrays. +# It expects as inputs two named numeric vectors, and it extends them +# with dimensions of length 1 until an ordered common dimension +# format is reached. +# The first output is dims1 extended with 1s. +# The second output is dims2 extended with 1s. +# The third output is a merged dimension vector. If dimensions with +# the same name are found in the two inputs, and they have a different +# length, the maximum is taken. +.MergeArrayDims <- function(dims1, dims2) { + new_dims1 <- NULL + new_dims2 <- NULL + while (length(dims1) > 0) { + if (names(dims1)[1] %in% names(dims2)) { + pos <- which(names(dims2) == names(dims1)[1]) + dims_to_add <- rep(1, pos - 1) + if (length(dims_to_add) > 0) { + names(dims_to_add) <- names(dims2[1:(pos - 1)]) + } + new_dims1 <- c(new_dims1, dims_to_add, dims1[1]) + new_dims2 <- c(new_dims2, dims2[1:pos]) + dims1 <- dims1[-1] + dims2 <- dims2[-(1:pos)] + } else { + new_dims1 <- c(new_dims1, dims1[1]) + new_dims2 <- c(new_dims2, 1) + names(new_dims2)[length(new_dims2)] <- names(dims1)[1] + dims1 <- dims1[-1] + } + } + if (length(dims2) > 0) { + dims_to_add <- rep(1, length(dims2)) + names(dims_to_add) <- names(dims2) + new_dims1 <- c(new_dims1, dims_to_add) + new_dims2 <- c(new_dims2, dims2) + } + list(new_dims1, new_dims2, pmax(new_dims1, new_dims2)) +} + +# This function takes two named arrays and merges them, filling with +# NA where needed. 
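+# For example, merging along 'b' concatenates that dimension (1 + 3 = 4
+# below) and pads all other non-shared dimensions with NA: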
+# dim(array1) +# 'b' 'c' 'e' 'f' +# 1 3 7 9 +# dim(array2) +# 'a' 'b' 'd' 'f' 'g' +# 2 3 5 9 11 +# dim(.MergeArrays(array1, array2, 'b')) +# 'a' 'b' 'c' 'e' 'd' 'f' 'g' +# 2 4 3 7 5 9 11 +.MergeArrays <- function(array1, array2, along) { + if (!(is.null(array1) || is.null(array2))) { + if (!(identical(names(dim(array1)), names(dim(array2))) && + identical(dim(array1)[-which(names(dim(array1)) == along)], + dim(array2)[-which(names(dim(array2)) == along)]))) { + new_dims <- .MergeArrayDims(dim(array1), dim(array2)) + dim(array1) <- new_dims[[1]] + dim(array2) <- new_dims[[2]] + for (j in seq_along(dim(array1))) { + if (names(dim(array1))[j] != along) { + if (dim(array1)[j] != dim(array2)[j]) { + if (which.max(c(dim(array1)[j], dim(array2)[j])) == 1) { + na_array_dims <- dim(array2) + na_array_dims[j] <- dim(array1)[j] - dim(array2)[j] + na_array <- array(dim = na_array_dims) + array2 <- abind(array2, na_array, along = j) + names(dim(array2)) <- names(na_array_dims) + } else { + na_array_dims <- dim(array1) + na_array_dims[j] <- dim(array2)[j] - dim(array1)[j] + na_array <- array(dim = na_array_dims) + array1 <- abind(array1, na_array, along = j) + names(dim(array1)) <- names(na_array_dims) + } + } + } + } + } + if (!(along %in% names(dim(array2)))) { + stop("The dimension specified in 'along' is not present in the ", + "provided arrays.") + } + array1 <- abind(array1, array2, along = which(names(dim(array1)) == along)) + names(dim(array1)) <- names(dim(array2)) + } else if (is.null(array1)) { + array1 <- array2 + } + array1 +} + +# only can be used in Trend(). Needs generalization or be replaced by other function. +.reorder <- function(output, time_dim, dim_names) { + # Add dim name back + if (is.null(dim(output))) { + dim(output) <- c(stats = length(output)) + } else { #is an array + if (length(dim(output)) == 1) { + if (!is.null(names(dim(output)))) { + dim(output) <- c(1, dim(output)) + names(dim(output))[1] <- time_dim + } else { + names(dim(output)) <- time_dim + } + } else { # more than one dim + if (names(dim(output))[1] != "") { + dim(output) <- c(1, dim(output)) + names(dim(output))[1] <- time_dim + } else { #regular case + names(dim(output))[1] <- time_dim + } + } + } + # reorder + pos <- match(dim_names, names(dim(output))) + output <- aperm(output, pos) + names(dim(output)) <- dim_names + names(dim(output))[names(dim(output)) == time_dim] <- 'stats' + return(output) +} + +# to be used in AMV.R, TPI.R, SPOD.R, GSAT.R and GMST.R +.Indices <- function(data, type, monini, indices_for_clim, + fmonth_dim, sdate_dim, year_dim, month_dim, na.rm) { + + if (type == 'dcpp') { + + fyear_dim <- 'fyear' + data <- Season(data = data, time_dim = fmonth_dim, + monini = monini, moninf = 1, monsup = 12, + method = mean, na.rm = na.rm) + names(dim(data))[which(names(dim(data)) == fmonth_dim)] <- fyear_dim + + if (identical(indices_for_clim, FALSE)) { ## data is already anomalies + + anom <- data + + } else { ## Different indices_for_clim for each forecast year (to use the same calendar years) + + n_fyears <- as.numeric(dim(data)[fyear_dim]) + n_sdates <- as.numeric(dim(data)[sdate_dim]) + + if (is.null(indices_for_clim)) { ## climatology over the whole (common) period + first_years_for_clim <- n_fyears : 1 + last_years_for_clim <- n_sdates : (n_sdates - n_fyears + 1) + } else { ## indices_for_clim specified as a numeric vector + first_years_for_clim <- seq(from = indices_for_clim[1], by = -1, length.out = n_fyears) + last_years_for_clim <- + seq(from = indices_for_clim[length(indices_for_clim)], + 
by = -1, length.out = n_fyears) + } + + data <- s2dv::Reorder(data = data, order = c(fyear_dim, sdate_dim)) + anom <- array(data = NA, dim = dim(data)) + for (i in 1:n_fyears) { + clim <- mean(data[i, first_years_for_clim[i]:last_years_for_clim[i]], na.rm = na.rm) + anom[i, ] <- data[i, ] - clim + } + } + + } else if (type %in% c('obs', 'hist')) { + + data <- multiApply::Apply(data = data, target_dims = month_dim, + fun = mean, na.rm = na.rm)$output1 + + if (identical(indices_for_clim, FALSE)) { ## data is already anomalies + clim <- 0 + } else if (is.null(indices_for_clim)) { + ## climatology over the whole period + clim <- multiApply::Apply(data = data, target_dims = year_dim, fun = mean, + na.rm = na.rm)$output1 + } else { + ## indices_for_clim specified as a numeric vector + clim <- multiApply::Apply(data = ClimProjDiags::Subset(x = data, along = year_dim, + indices = indices_for_clim), + target_dims = year_dim, fun = mean, na.rm = na.rm)$output1 + } + + anom <- data - clim + + } else { + stop('type must be dcpp, hist or obs') + } + + return(anom) +} + +#TODO: Remove from s2dv when PlotLayout can get colorbar info from plotting function directly. +# The function is temporarily here because PlotLayout() needs to draw the colorbars of +# PlotMostLikelyQuantileMap(). +#Draws Color Bars for Categories +#A wrapper of s2dv::ColorBar to generate multiple color bars for different +#categories, and each category has different color set. +GradientCatsColorBar <- function(nmap, brks = NULL, cols = NULL, vertical = TRUE, subsampleg = NULL, + bar_limits, var_limits = NULL, + triangle_ends = NULL, plot = TRUE, + draw_separators = FALSE, + bar_titles = NULL, title_scale = 1, + label_scale = 1, extra_margin = rep(0, 4), + ...) { + # bar_limits + if (!is.numeric(bar_limits) || length(bar_limits) != 2) { + stop("Parameter 'bar_limits' must be a numeric vector of length 2.") + } + + # Check brks + if (is.null(brks) || (is.numeric(brks) && length(brks) == 1)) { + num_brks <- 5 + if (is.numeric(brks)) { + num_brks <- brks + } + brks <- seq(from = bar_limits[1], to = bar_limits[2], length.out = num_brks) + } + if (!is.numeric(brks)) { + stop("Parameter 'brks' must be a numeric vector.") + } + # Check cols + col_sets <- list(c("#A1D99B", "#74C476", "#41AB5D", "#238B45"), + c("#6BAED6FF", "#4292C6FF", "#2171B5FF", "#08519CFF"), + c("#FFEDA0FF", "#FED976FF", "#FEB24CFF", "#FD8D3CFF"), + c("#FC4E2AFF", "#E31A1CFF", "#BD0026FF", "#800026FF"), + c("#FCC5C0", "#FA9FB5", "#F768A1", "#DD3497")) + if (is.null(cols)) { + if (length(col_sets) >= nmap) { + chosen_sets <- 1:nmap + chosen_sets <- chosen_sets + floor((length(col_sets) - length(chosen_sets)) / 2) + } else { + chosen_sets <- array(seq_along(col_sets), nmap) + } + cols <- col_sets[chosen_sets] + } else { + if (!is.list(cols)) { + stop("Parameter 'cols' must be a list of character vectors.") + } + if (!all(sapply(cols, is.character))) { + stop("Parameter 'cols' must be a list of character vectors.") + } + if (length(cols) != nmap) { + stop("Parameter 'cols' must be a list of the same length as the number of ", + "maps in 'maps'.") + } + } + for (i in seq_along(cols)) { + if (length(cols[[i]]) != (length(brks) - 1)) { + cols[[i]] <- grDevices::colorRampPalette(cols[[i]])(length(brks) - 1) + } + } + + # Check bar_titles + if (is.null(bar_titles)) { + if (nmap == 3) { + bar_titles <- c("Below normal (%)", "Normal (%)", "Above normal (%)") + } else if (nmap == 5) { + bar_titles <- c("Low (%)", "Below normal (%)", + "Normal (%)", "Above normal (%)", "High (%)") + } 
else { + bar_titles <- paste0("Cat. ", 1:nmap, " (%)") + } + } + + if (plot) { + for (k in 1:nmap) { + s2dv::ColorBar(brks = brks, cols = cols[[k]], vertical = FALSE, subsampleg = subsampleg, +# bar_limits = bar_limits, var_limits = var_limits, + triangle_ends = triangle_ends, plot = TRUE, + draw_separators = draw_separators, + title = bar_titles[[k]], title_scale = title_scale, + label_scale = label_scale, extra_margin = extra_margin) + } + } else { + #TODO: col_inf and col_sup + return(list(brks = brks, cols = cols)) + } + +} + + -- GitLab From 87d6347e5f2069d5647f9e5cdaa48a568c3e907f Mon Sep 17 00:00:00 2001 From: vagudets Date: Fri, 16 May 2025 11:26:36 +0200 Subject: [PATCH 14/22] Always request one node with Autosubmit + Update use case --- autosubmit/conf_esarchive/autosubmit.yml | 6 +++--- autosubmit/conf_esarchive/expdef.yml | 2 +- autosubmit/conf_esarchive/jobs.yml | 12 +++++------- autosubmit/conf_gpfs/autosubmit.yml | 6 +++--- autosubmit/conf_gpfs/jobs.yml | 6 ++---- tools/write_autosubmit_conf.R | 2 +- .../ex1_2_autosubmit_scorecards/ex1_2-handson.md | 6 +++--- .../ex1_2_autosubmit_scorecards/ex1_2-recipe.yml | 4 ++-- 8 files changed, 20 insertions(+), 24 deletions(-) diff --git a/autosubmit/conf_esarchive/autosubmit.yml b/autosubmit/conf_esarchive/autosubmit.yml index 0fd5d5c6..04fb9210 100644 --- a/autosubmit/conf_esarchive/autosubmit.yml +++ b/autosubmit/conf_esarchive/autosubmit.yml @@ -1,10 +1,10 @@ config: EXPID: - AUTOSUBMIT_VERSION: 4.0.0b0 - MAXWAITINGJOBS: 16 + AUTOSUBMIT_VERSION: 4.0.98 + MAXWAITINGJOBS: 100 # Default maximum number of jobs to be running at the same time at any platform # Default: 6 - TOTALJOBS: 16 + TOTALJOBS: 100 SAFETYSLEEPTIME: 10 RETRIALS: 0 mail: diff --git a/autosubmit/conf_esarchive/expdef.yml b/autosubmit/conf_esarchive/expdef.yml index 8dc29b27..bd579342 100644 --- a/autosubmit/conf_esarchive/expdef.yml +++ b/autosubmit/conf_esarchive/expdef.yml @@ -1,6 +1,6 @@ DEFAULT: EXPID: - HPCARCH: nord3v2 + HPCARCH: nord4 experiment: DATELIST: MEMBERS: fc0 diff --git a/autosubmit/conf_esarchive/jobs.yml b/autosubmit/conf_esarchive/jobs.yml index 8196dcc1..f856b441 100644 --- a/autosubmit/conf_esarchive/jobs.yml +++ b/autosubmit/conf_esarchive/jobs.yml @@ -4,20 +4,18 @@ JOBS: RUNNING: chunk WALLCLOCK: NOTIFY_ON: - PLATFORM: nord3v2 + PLATFORM: nord4 PROCESSORS: - ## TODO: Uncomment (see #162) - # NODES: 1 + NODES: 1 # SPLITS: # n_atomic_recipes, number of atomic recipes multimodel: FILE: autosubmit/auto-multimodel.sh RUNNING: once WALLCLOCK: NOTIFY_ON: - PLATFORM: nord3v2 + PLATFORM: nord4 PROCESSORS: - ## TODO: Uncomment - # NODES: 1 + NODES: 1 DEPENDENCIES: verification: SPLITS_FROM: @@ -25,7 +23,7 @@ JOBS: scorecards: FILE: autosubmit/auto-scorecards.sh WALLCLOCK: 00:10 - PLATFORM: nord3v2 + PLATFORM: nord4 NOTIFY_ON: PROCESSORS: 1 DEPENDENCIES: diff --git a/autosubmit/conf_gpfs/autosubmit.yml b/autosubmit/conf_gpfs/autosubmit.yml index 0fd5d5c6..04fb9210 100644 --- a/autosubmit/conf_gpfs/autosubmit.yml +++ b/autosubmit/conf_gpfs/autosubmit.yml @@ -1,10 +1,10 @@ config: EXPID: - AUTOSUBMIT_VERSION: 4.0.0b0 - MAXWAITINGJOBS: 16 + AUTOSUBMIT_VERSION: 4.0.98 + MAXWAITINGJOBS: 100 # Default maximum number of jobs to be running at the same time at any platform # Default: 6 - TOTALJOBS: 16 + TOTALJOBS: 100 SAFETYSLEEPTIME: 10 RETRIALS: 0 mail: diff --git a/autosubmit/conf_gpfs/jobs.yml b/autosubmit/conf_gpfs/jobs.yml index e6806ca7..36591feb 100644 --- a/autosubmit/conf_gpfs/jobs.yml +++ b/autosubmit/conf_gpfs/jobs.yml @@ -13,8 +13,7 @@ JOBS: NOTIFY_ON: 
PLATFORM: PROCESSORS: - ## TODO: Uncomment - # NODES: 1 + NODES: 1 # SPLITS: # n_atomic_recipes, number of atomic recipes multimodel: FILE: autosubmit/auto-multimodel.sh @@ -24,8 +23,7 @@ JOBS: PLATFORM: PROCESSORS: DEPENDENCIES: - ## TODO: Uncomment - # NODES: 1 + NODES: 1 verification: SPLITS_FROM: SPLITS: # n_atomic_recipes/n_models = n_multimodels diff --git a/tools/write_autosubmit_conf.R b/tools/write_autosubmit_conf.R index dd943f15..1b490d1c 100644 --- a/tools/write_autosubmit_conf.R +++ b/tools/write_autosubmit_conf.R @@ -61,7 +61,7 @@ write_autosubmit_conf <- function(recipe, nchunks, # Section 3: jobs ## wallclock, notify_on, platform?, processors, # Create bash file to associate chunk number to recipe name - chunk_file <- paste0(proj_dir, "chunk_to_recipe") + chunk_file <- paste0(proj_dir, "/", "chunk_to_recipe") ## TODO: Specify dir in chunk_to_recipe? .create_bash_file(fileout = chunk_file, dictionary = chunk_to_recipe, diff --git a/use_cases/ex1_2_autosubmit_scorecards/ex1_2-handson.md b/use_cases/ex1_2_autosubmit_scorecards/ex1_2-handson.md index 9f3b05f9..59a8dcf6 100644 --- a/use_cases/ex1_2_autosubmit_scorecards/ex1_2-handson.md +++ b/use_cases/ex1_2_autosubmit_scorecards/ex1_2-handson.md @@ -34,7 +34,7 @@ Since we're going to use Autosubmit to dispatch jobs, we need to have an Autosub On the workstation or the Autosubmit machine, you can create an experiment by the following commands. **AUTOSUBMIT EXPERIMENTS ARE REUSABLE!** You do not need to create a new one every time. ```shell -module load autosubmit/4.0.98b-foss-2015a-Python-3.7.3 +module load autosubmit/4.0.98-foss-2015a-Python-3.7.3 autosubmit expid -H nord4 -d "SUNSET use case 1_2" ``` You will see the messages like below: @@ -141,7 +141,7 @@ INFO [2023-11-29 00:37:41] Check output directory /esarchive/scratch/aho/auto-s INFO [2023-11-29 00:37:41] ##### AUTOSUBMIT CONFIGURATION WRITTEN FOR a6pc ##### INFO [2023-11-29 00:37:41] You can check your experiment configuration at: /esarchive/autosubmit/a6pc/conf/ INFO [2023-11-29 00:37:41] Please SSH into bscesautosubmit01 or bscesautosubmit02 and run the following commands: -INFO [2023-11-29 00:37:41] module load autosubmit/4.0.0b-foss-2015a-Python-3.7.3 +INFO [2023-11-29 00:37:41] module load autosubmit/4.0.98-foss-2015a-Python-3.7.3 INFO [2023-11-29 00:37:41] autosubmit create a6pc INFO [2023-11-29 00:37:41] autosubmit refresh a6pc INFO [2023-11-29 00:37:41] nohup autosubmit run a6pc & disown @@ -151,7 +151,7 @@ You can see some useful information, like the the path to atomic recipes, the Au ```shell ssh bscesautosubmit01.bsc.es (enter Autosubmit machine) -module load autosubmit/4.0.0b-foss-2015a-Python-3.7.3 +module load autosubmit/4.0.98-foss-2015a-Python-3.7.3 autosubmit create a6pc autosubmit refresh a6pc nohup autosubmit run a6pc & disown diff --git a/use_cases/ex1_2_autosubmit_scorecards/ex1_2-recipe.yml b/use_cases/ex1_2_autosubmit_scorecards/ex1_2-recipe.yml index a4e4b367..adf53ffd 100644 --- a/use_cases/ex1_2_autosubmit_scorecards/ex1_2-recipe.yml +++ b/use_cases/ex1_2_autosubmit_scorecards/ex1_2-recipe.yml @@ -69,7 +69,7 @@ Analysis: col1_width: NULL col2_width: NULL calculate_diff: FALSE - ncores: 8 + ncores: 16 remove_NAs: no # bool, don't split Output_format: Scorecards # string, don't split @@ -91,7 +91,7 @@ Run: wallclock: 03:00 # hh:mm processors_per_job: 16 platform: nord4 - custom_directives: ['#SBATCH --exclusive'] + custom_directives: # ['#SBATCH --exclusive'] email_notifications: yes # enable/disable email notifications. 
Change it if you want to.
   email_address: victoria.agudetse@bsc.es # replace with your email address
   notify_completed: yes # notify me by email when a job finishes
--
GitLab


From dd157882a5a9f90cd74103403bc7caa890ea8350 Mon Sep 17 00:00:00 2001
From: vagudets
Date: Fri, 16 May 2025 12:03:11 +0200
Subject: [PATCH 15/22] Compute RPS/RPSS for multiple sets of probabilities

---
 modules/Crossval/Crossval_metrics.R | 300 ++++++++++++++++++++++++++++
 1 file changed, 300 insertions(+)
 create mode 100644 modules/Crossval/Crossval_metrics.R

diff --git a/modules/Crossval/Crossval_metrics.R b/modules/Crossval/Crossval_metrics.R
new file mode 100644
index 00000000..63c82e08
--- /dev/null
+++ b/modules/Crossval/Crossval_metrics.R
@@ -0,0 +1,300 @@
+
+source("modules/Saving/Saving.R")
+source("modules/Crossval/R/tmp/RPS.R")
+source("modules/Crossval/R/RPS_clim.R")
+source("modules/Crossval/R/CRPS_clim.R")
+source("modules/Crossval/R/tmp/RPSS.R")
+source("modules/Crossval/R/tmp/RandomWalkTest.R")
+source("modules/Crossval/R/tmp/Corr.R")
+source("modules/Crossval/R/tmp/Bias.R")
+source("modules/Crossval/R/tmp/SprErr.R")
+source("modules/Crossval/R/tmp/Eno.R")
+
+## data_crossval is the result of the function full_crossval_anomalies or similar.
+## It is a list with the required elements:
+  ## probs is a list with
+  ## probs$hcst_ev and probs$obs_ev
+  ## probs$hcst_ev has one element per entry in $Probabilities$percentiles;
+  ## each element is an array with a 'cat' dimension
+  ## the same applies to probs$obs_ev
+  ## hcst is an s2dv_cube with the post-processed hindcast for the evaluation indices
+  ## in this case cross-validated anomalies
+  ## obs is an s2dv_cube with the post-processed obs
+  ## in this case cross-validated anomalies
+  ## fcst is an s2dv_cube with the post-processed fcst
+  ## in this case anomalies computed with the full hindcast period
+  ## this object is not required for skill assessment
+  ## hcst.full_val and obs.full_val are the original data to compute the mean bias
+  ## cat_lims are the limits used to compute the probabilities
+  ## this object is not required for skill assessment
+  ## ref_obs_tr is an array with the cross-validated observed anomalies
+  ## to be used as reference forecast in the CRPSS and CRPS_clim
+  ## it is computed from the training indices
+## the recipe could be used to read the Percentiles
+## if fair is TRUE, the nmemb used to compute the probabilities is needed
+  ## nmemb_ref is the number of years - 1 when the climatological forecast is the reference
+Crossval_metrics <- function(recipe, data_crossval,
+                             fair = FALSE, nmemb = NULL, nmemb_ref = NULL) {
+
+  ncores <- recipe$Analysis$ncores
+  alpha <- recipe$Analysis$Skill$alpha
+  na.rm <- recipe$Analysis$remove_NAs
+  categories <- recipe$Analysis$Workflow$Probabilities$percentiles
+  categories <- lapply(categories, function(x) {
+    sapply(x, function(y) {
+      eval(parse(text = y))}, USE.NAMES = FALSE)})
+
+  # TODO: distinguish between rpss and bss
+  # if 1 percentile -> bss
+  # if more than 1 -> rpss
+  # TODO: for subseasonal, check if the dimension sday is 1
+  ## exe_rps <- unlist(lapply(categories, function(x) {
+  ##                     if (length(x) > 1) {
+  ##                       x <- x[1] *100
+  ##                     }
+  ##                     return(x)}))
+  if (is.null(alpha)) {
+    alpha <- 0.05
+  }
+  ## START SKILL ASSESSMENT:
+  skill_metrics <- list()
+  requested_metrics <- tolower(strsplit(recipe$Analysis$Workflow$Skill$metric,
+                                        ", | |,")[[1]])
+
+  # The recipe allows requesting more than just terciles:
+  for (ps in 1:length(categories)) {
+    if (tolower(recipe$Analysis$Horizon) == 'subseasonal') {
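+      # For subseasonal runs, fold the 'sweek' dimension into 'syear' so
+      # that every start week contributes a sample to the scores below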
+Crossval_metrics <- function(recipe, data_crossval,
+                             fair = FALSE, nmemb = NULL, nmemb_ref = NULL) {
+
+  ncores <- recipe$Analysis$ncores
+  alpha <- recipe$Analysis$Skill$alpha
+  na.rm <- recipe$Analysis$remove_NAs
+  categories <- recipe$Analysis$Workflow$Probabilities$percentiles
+  categories <- lapply(categories, function (x) {
+                  sapply(x, function(x) {
+                    eval(parse(text = x))}, USE.NAMES = FALSE)})
+
+  # TODO: distinguish between rpss and bss
+  # if 1 percentile -> bss
+  # if more than 1 -> rpss
+  # TODO: for subseasonal check if the dimension sday is 1
+  ## exe_rps <- unlist(lapply(categories, function(x) {
+  ##                     if (length(x) > 1) {
+  ##                       x <- x[1] *100
+  ##                     }
+  ##                     return(x)}))
+  if (is.null(alpha)) {
+    alpha <- 0.05
+  }
+  ## START SKILL ASSESSMENT:
+  skill_metrics <- list()
+  requested_metrics <- tolower(strsplit(recipe$Analysis$Workflow$Skill$metric,
+                                        ", | |,")[[1]])
+
+  # The recipe allows requesting more than only terciles:
+  for (ps in 1:length(categories)) {
+    if (tolower(recipe$Analysis$Horizon) == 'subseasonal') {
+      data_crossval$probs$hcst_ev[[ps]] <- MergeDims(data_crossval$probs$hcst_ev[[ps]],
+                                                     merge_dims = c('sweek', 'syear'),
+                                                     rename_dim = 'syear', na.rm = FALSE)
+      data_crossval$probs$obs_ev[[ps]] <- MergeDims(data_crossval$probs$obs_ev[[ps]],
+                                                    merge_dims = c('sweek', 'syear'),
+                                                    rename_dim = 'syear', na.rm = FALSE)
+    }
+
+    if ('rps' %in% requested_metrics) {
+      rps <- RPS(exp = data_crossval$probs$hcst_ev[[ps]],
+                 obs = data_crossval$probs$obs_ev[[ps]], memb_dim = NULL,
+                 cat_dim = 'bin', cross.val = FALSE, time_dim = 'syear',
+                 Fair = fair, nmemb = nmemb,
+                 ncores = ncores)
+      rps_clim <- Apply(list(data_crossval$probs$obs_ev[[ps]]),
+                        target_dims = c('bin', 'syear'),
+                        RPS_clim, bin_dim_abs = 'bin', Fair = fair,
+                        cross.val = FALSE, ncores = ncores)$output1
+      if (is.null(names(categories)[ps])) {
+        skill_metrics$rps <- rps
+        skill_metrics$rps_clim <- rps_clim
+      } else {
+        # names based on the categories:
+        # to use when visualization works for more than one rps
+        skill_metrics[[paste0('rps-', names(categories)[ps])]] <- rps
+        skill_metrics[[paste0('rps_clim-',
+                              names(categories)[ps])]] <- rps_clim
+      }
+    }
+    if ('rpss' %in% requested_metrics) {
+      rpss <- RPSS(exp = data_crossval$probs$hcst_ev[[ps]],
+                   obs = data_crossval$probs$obs_ev[[ps]],
+                   ref = NULL, # ref is 1/3 by default if terciles
+                   time_dim = 'syear', memb_dim = NULL,
+                   cat_dim = 'bin', nmemb = nmemb,
+                   dat_dim = NULL,
+                   prob_thresholds = categories[[ps]],
+                   indices_for_clim = NULL,
+                   Fair = fair, weights_exp = NULL, weights_ref = NULL,
+                   cross.val = FALSE, na.rm = na.rm,
+                   sig_method.type = 'two.sided.approx', alpha = alpha,
+                   ncores = ncores)
+      if (is.null(names(categories)[ps])) {
+        skill_metrics[[paste0('rpss')]] <- rpss$rpss
+        skill_metrics[[paste0('rpss_significance')]] <- rpss$sign
+      } else {
+        # to use when visualization works for more than one rpss
+        skill_metrics[[paste0('rpss-', names(categories)[ps])]] <- rpss$rpss
+        ## TODO: Change this name?
+        skill_metrics[[paste0('rpss-',
+                              names(categories)[ps],
+                              "_significance")]] <- rpss$sign
+    }
+  }
+  if (tolower(recipe$Analysis$Horizon) == 'subseasonal') {
+    # The evaluation of all metrics is done with the extra sample
+    data_crossval$hcst <- CST_MergeDims(data_crossval$hcst,
+                                        merge_dims = c('sweek', 'syear'),
+                                        rename_dim = 'syear', na.rm = FALSE)
+    data_crossval$obs <- CST_MergeDims(data_crossval$obs,
+                                       merge_dims = c('sweek', 'syear'),
+                                       rename_dim = 'syear', na.rm = FALSE)
+    data_crossval$ref_obs_tr <- MergeDims(data_crossval$ref_obs_tr,
+                                          merge_dims = c('sweek', 'syear'),
+                                          rename_dim = 'syear', na.rm = FALSE)
+    data_crossval$hcst.full_val <- CST_MergeDims(data_crossval$hcst.full_val,
+                                                 merge_dims = c('sweek', 'syear'),
+                                                 rename_dim = 'syear', na.rm = FALSE)
+    data_crossval$obs.full_val <- CST_MergeDims(data_crossval$obs.full_val,
+                                                merge_dims = c('sweek', 'syear'),
+                                                rename_dim = 'syear', na.rm = FALSE)
+  }
+
+  if ('crps' %in% requested_metrics) {
+    crps <- CRPS(exp = data_crossval$hcst$data,
+                 obs = data_crossval$obs$data,
+                 time_dim = 'syear', memb_dim = 'ensemble',
+                 Fair = fair,
+                 ncores = ncores)
+    skill_metrics$crps <- crps
+    crps_clim <- CRPS(exp = data_crossval$ref_obs_tr,
+                      obs = data_crossval$obs$data,
+                      time_dim = 'syear', memb_dim = 'ensemble',
+                      Fair = fair, ncores = ncores)
+    skill_metrics$crps_clim <- crps_clim
+  }
+  if ('crpss' %in% requested_metrics) {
+    crpss <- CRPSS(exp = data_crossval$hcst$data,
+                   obs = data_crossval$obs$data,
+                   ref = data_crossval$ref_obs_tr,
+                   memb_dim = 'ensemble', Fair = fair,
+                   time_dim = 'syear', clim.cross.val = FALSE,
+                   ncores = ncores)
+    skill_metrics$crpss <- crpss$crpss
+    skill_metrics$crpss_significance <- crpss$sign
+  }
+
+  if ('enscorr' %in% requested_metrics) {
+    enscorr <- Corr(exp = data_crossval$hcst$data,
+                    obs = data_crossval$obs$data,
+                    dat_dim = NULL,
+                    time_dim = 'syear',
+                    method = 'pearson',
+                    memb_dim = 'ensemble',
+                    memb = F,
+                    conf = F,
+                    pval = F,
+                    sign = T,
+                    alpha = alpha,
+                    ncores = ncores)
+    skill_metrics$enscorr <- enscorr$corr
+    skill_metrics$enscorr_significance <- enscorr$sign
+  }
+  if ('mean_bias' %in% requested_metrics) {
+    if (!is.null(data_crossval$hcst.full_val$data)) {
+      mean_bias <- Bias(exp = data_crossval$hcst.full_val$data,
+                        obs = data_crossval$obs.full_val$data,
+                        time_dim = 'syear',
+                        memb_dim = 'ensemble',
+                        alpha = alpha,
+                        na.rm = na.rm,
+                        ncores = ncores)
+      skill_metrics$mean_bias <- mean_bias$bias
+      skill_metrics$mean_bias_significance <- mean_bias$sig
+    } else {
+      info(recipe$Run$logger,
+           "Full values not available")
+    }
+  }
+  if ('enssprerr' %in% requested_metrics) {
+    enssprerr <- SprErr(exp = data_crossval$hcst$data,
+                        obs = data_crossval$obs$data,
+                        memb_dim = 'ensemble', dat_dim = NULL,
+                        time_dim = 'syear', pval = TRUE,
+                        na.rm = na.rm,
+                        ncores = ncores)
+    skill_metrics$SprErr <- enssprerr$ratio
+    skill_metrics$SprErr_significance <- enssprerr$p.val <= alpha
+  }
+  if ('rms' %in% requested_metrics) {
+    rms <- RMS(exp = data_crossval$hcst$data,
+               obs = data_crossval$obs$data,
+               memb_dim = 'ensemble', dat_dim = NULL,
+               time_dim = 'syear', alpha = alpha,
+               ncores = ncores)
+    skill_metrics$rms <- rms$rms
+  }
+  if ('rmss' %in% requested_metrics) {
+    rmss <- RMSSS(exp = data_crossval$hcst$data,
+                  obs = data_crossval$obs$data,
+                  ref = data_crossval$ref_obs_tr,
+                  memb_dim = 'ensemble', dat_dim = NULL,
+                  time_dim = 'syear', alpha = alpha, sign = TRUE,
+                  ncores = ncores)
+    skill_metrics$rmss <- rmss$rmss
+    skill_metrics$rmss_significance <- rmss$sign
+  }
+  if (is.null(data_crossval$hcst_EM)) {
+    data_crossval$hcst_EM <- MeanDims(data_crossval$hcst$data,
+                                      dims = 'ensemble',
+                                      drop = TRUE)
+  }
+  if (any(c('std', 'standard_deviation') %in% requested_metrics)) {
+    std_hcst <- Apply(data = data_crossval$hcst_EM,
+                      target_dims = 'syear',
+                      fun = 'sd')$output1
+
+    std_obs <- Apply(data = data_crossval$obs$data,
+                     target_dims = 'syear',
+                     fun = 'sd')$output1
+
+    skill_metrics[['std_hcst']] <- std_hcst
+    skill_metrics[['std_obs']] <- std_obs
+  }
+  if (any(c('var', 'variance') %in% requested_metrics)) {
+    ## Calculate variance
+    var_hcst <- Apply(data = data_crossval$hcst_EM,
+                      target_dims = 'syear',
+                      fun = 'sd')$output1 ^ 2
+
+    var_obs <- Apply(data = data_crossval$obs$data,
+                     target_dims = 'syear',
+                     fun = 'sd')$output1 ^ 2
+
+    skill_metrics[['var_hcst']] <- var_hcst
+    skill_metrics[['var_obs']] <- var_obs
+  } ## close if on variance
+  if ('n_eff' %in% requested_metrics) {
+    ## Calculate degrees of freedom
+    n_eff <- s2dv::Eno(data = data_crossval$obs$data,
+                       time_dim = 'syear',
+                       na.action = na.pass,
+                       ncores = ncores)
+    skill_metrics[['n_eff']] <- n_eff
+  } ## close on n_eff
+
+  if (any(c('cov', 'covariance') %in% requested_metrics)) {
+    covariance <- Apply(data = list(x = data_crossval$obs$data,
+                                    y = data_crossval$hcst_EM),
+                        target_dims = 'syear',
+                        fun = function(x, y) {
+                          cov(as.vector(x), as.vector(y),
+                              use = "everything",
+                              method = "pearson")})$output1
+    skill_metrics$covariance <- covariance
+  }
+  original <- recipe$Run$output_dir
+  recipe$Run$output_dir <- paste0(original, "/outputs/Skill/")
+
+  skill_metrics <- lapply(skill_metrics, function(x) {
+                     if (is.logical(x)) {
+                       dims <- dim(x)
+                       res <- as.numeric(x)
+                       dim(res) <- dims
+                     } else {
+                       res <- x
+                     }
+                     return(res)
+                   })
+  # Save metrics
+  # reduce dimension to work with Visualization module:
+  skill_metrics <- lapply(skill_metrics, function(x) {.drop_dims(x)})
+
+  if (tolower(recipe$Analysis$Workflow$Skill$save) == 'all') {
+    save_metrics(recipe = recipe,
+                 metrics = skill_metrics,
+                 data_cube = data_crossval$hcst, agg = 'global',
+                 outdir = recipe$Run$output_dir)
+  }
+  recipe$Run$output_dir <- original
+  return(skill_metrics)
+}
+
+
-- 
GitLab
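One detail worth noting in the function above: `as.numeric()` strips the dimension attribute, so the logical significance masks are converted with an explicit save-and-restore of `dim()`. A self-contained illustration of the same idiom:

sig <- array(c(TRUE, FALSE, NA, TRUE), dim = c(syear = 2, region = 2))
dims <- dim(sig)         # keep the named dimensions
num <- as.numeric(sig)   # the conversion drops them
dim(num) <- dims         # restore, as done in Crossval_metrics()
str(num)                 # num [1:2, 1:2] 1 0 NA 1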
From e01458ab1b60647938d60eb3e141486d612eaded Mon Sep 17 00:00:00 2001
From: vagudets
Date: Fri, 16 May 2025 14:11:58 +0200
Subject: [PATCH 16/22] Fix typos

---
 modules/Crossval/Crossval_metrics.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/Crossval/Crossval_metrics.R b/modules/Crossval/Crossval_metrics.R
index 63c82e08..169ccb1e 100644
--- a/modules/Crossval/Crossval_metrics.R
+++ b/modules/Crossval/Crossval_metrics.R
@@ -116,6 +116,7 @@ Crossval_metrics <- function(recipe, data_crossval,
         skill_metrics[[paste0('rpss-',
                               names(categories)[ps],
                               "_significance")]] <- rpss$sign
+      }
     }
   }
   if (tolower(recipe$Analysis$Horizon) == 'subseasonal') {
@@ -271,7 +272,7 @@ Crossval_metrics <- function(recipe, data_crossval,
     skill_metrics$covariance <- covariance
   }
   original <- recipe$Run$output_dir
-  recipe$Run$output_dir <- paste0(original, "/outputs/Skill/")
+  recipe$Run$output_dir <- paste0(original, "/", "outputs/Skill/")
 
   skill_metrics <- lapply(skill_metrics, function(x) {
     if (is.logical(x)) {
-- 
GitLab


From 8a5a838f4510cb5aca2a1c92389bf09079664fa4 Mon Sep 17 00:00:00 2001
From: vagudets
Date: Wed, 21 May 2025 12:30:44 +0200
Subject: [PATCH 17/22] Recipe checks

---
 tools/check_recipe.R | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/tools/check_recipe.R b/tools/check_recipe.R
index 7b20ee45..3a9a148a 100644
--- a/tools/check_recipe.R
+++ b/tools/check_recipe.R
@@ -674,9 +674,20 @@ check_recipe <- function(recipe) {
       error(recipe$Run$logger,
             "Parameter 'percentiles' must be defined under 'Probabilities'.")
       error_status <- TRUE
-    } else if (!is.list(recipe$Analysis$Workflow$Probabilities$percentiles)) {
-      recipe$Analysis$Workflow$Probabilities$percentiles <-
-        as.list(recipe$Analysis$Workflow$Probabilities$percentiles)
+    } else {
+      if (!is.list(recipe$Analysis$Workflow$Probabilities$percentiles)) {
+        recipe$Analysis$Workflow$Probabilities$percentiles <-
+          as.list(recipe$Analysis$Workflow$Probabilities$percentiles)
+      }
+      if (length(recipe$Analysis$Workflow$Probabilities$percentiles) > 1 &&
+          is.null(names(recipe$Analysis$Workflow$Probabilities$percentiles))) {
+        for (i in 1:length(recipe$Analysis$Workflow$Probabilities$percentiles)) {
+          names(recipe$Analysis$Workflow$Probabilities$percentiles)[[i]] <- paste0("set", i)
+        }
+        warn(recipe$Run$logger,
+             paste0("No names given in recipe Probabilities:percentiles. Names ",
+                    "'set1, set2, ...' assigned automatically."))
+      }
     }
     # Saving checks
     SAVING_OPTIONS_PROBS <- c("all", "none", "bins_only", "percentiles_only")
-- 
GitLab
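The new check above guarantees that multiple percentile sets always carry names, since the metric keys built later (`rps-set1`, `rpss-set1`, ...) depend on them. The same idea in isolation, with illustrative sets:

percentiles <- list(c("1/3", "2/3"), c("1/10", "9/10"))  # two unnamed sets
if (length(percentiles) > 1 && is.null(names(percentiles))) {
  names(percentiles) <- paste0("set", seq_along(percentiles))
}
names(percentiles)  # "set1" "set2"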
From 52147fb68e62de1b773d900938523ec0ce5df257 Mon Sep 17 00:00:00 2001
From: vagudets
Date: Wed, 21 May 2025 12:30:54 +0200
Subject: [PATCH 18/22] Update RPS version

---
 modules/Skill/R/tmp/RPS.R | 42 +++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/modules/Skill/R/tmp/RPS.R b/modules/Skill/R/tmp/RPS.R
index 59b2d01a..0ed599ac 100644
--- a/modules/Skill/R/tmp/RPS.R
+++ b/modules/Skill/R/tmp/RPS.R
@@ -43,6 +43,7 @@
 #'@param Fair A logical indicating whether to compute the FairRPS (the
 #'  potential RPS that the forecast would have with an infinite ensemble size).
 #'  The default value is FALSE.
+#'@param nmemb A numeric value indicating the number of members used to compute the probabilities. This parameter is necessary when calculating FairRPS from probabilities. Default is NULL.
 #'@param weights A named numerical array of the weights for 'exp' probability
 #'  calculation. If 'dat_dim' is NULL, the dimensions should include 'memb_dim'
 #'  and 'time_dim'. Else, the dimension should also include 'dat_dim'. The
@@ -88,7 +89,8 @@
 #'@export
 RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NULL,
                 dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL,
-                Fair = FALSE, weights = NULL, cross.val = FALSE, return_mean = TRUE,
+                Fair = FALSE, nmemb = NULL, weights = NULL,
+                cross.val = FALSE, return_mean = TRUE,
                 na.rm = FALSE, ncores = NULL) {
 
   # Check inputs
@@ -178,6 +180,16 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NUL
   if (!is.logical(Fair) | length(Fair) > 1) {
     stop("Parameter 'Fair' must be either TRUE or FALSE.")
   }
+  if (Fair) {
+    if (!is.null(cat_dim)) {
+      if (cat_dim %in% names(dim(exp))) {
+        if (is.null(nmemb)) {
+          stop("Parameter 'nmemb' is necessary to compute the Fair ",
+               "score from probabilities.")
+        }
+      }
+    }
+  }
   ## return_mean
   if (!is.logical(return_mean) | length(return_mean) > 1) {
     stop("Parameter 'return_mean' must be either TRUE or FALSE.")
@@ -253,7 +265,7 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NUL
                 fun = .RPS,
                 dat_dim = dat_dim, time_dim = time_dim,
                 memb_dim = memb_dim, cat_dim = cat_dim,
-                prob_thresholds = prob_thresholds,
+                prob_thresholds = prob_thresholds, nmemb = nmemb,
                 indices_for_clim = indices_for_clim, Fair = Fair,
                 weights = weights, cross.val = cross.val,
                 na.rm = na.rm, ncores = ncores)$output1
@@ -270,7 +282,8 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NUL
 
 .RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NULL,
                  dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL,
-                 Fair = FALSE, weights = NULL, cross.val = FALSE, na.rm = FALSE) {
+                 Fair = FALSE, nmemb = NULL, weights = NULL,
+                 cross.val = FALSE, na.rm = FALSE) {
   #--- if memb_dim:
   # exp: [sdate, memb, (dat)]
   # obs: [sdate, (memb), (dat)]
@@ -349,8 +362,11 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NUL
         # obs_probs: [bin, sdate]
       } else {
         # inputs are probabilities already
-        exp_probs <- t(exp_data)
-        obs_probs <- t(obs_data)
+        if (all(names(dim(exp_data)) == c(time_dim, memb_dim)) ||
+            all(names(dim(exp_data)) == c(time_dim, cat_dim))) {
+          exp_probs <- t(exp_data)
+          obs_probs <- t(obs_data)
+        }
       }
 
       probs_exp_cumsum <- apply(exp_probs, 2, cumsum)
@@ -358,11 +374,17 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NUL
       # rps: [sdate, nexp, nobs]
       rps [good_values, i, j] <- colSums((probs_exp_cumsum - probs_obs_cumsum)^2)
       if (Fair) { # FairRPS
-        ## adjustment <- rowSums(-1 * (1/R - 1/R.new) * ens.cum * (R - ens.cum)/R/(R - 1))
-        ## [formula taken from SpecsVerification::EnsRps]
-        R <- dim(exp)[2] #memb
+        if (!is.null(memb_dim)) {
+          if (memb_dim %in% names(dim(exp))) {
+            ## adjustment <- rowSums(-1 * (1/R - 1/R.new) * ens.cum * (R - ens.cum)/R/(R - 1))
+            ## [formula taken from SpecsVerification::EnsRps]
+            R <- dim(exp)[2] #memb
+          }
+        } else {
+          R <- nmemb
+        }
+        warning("Applying fair correction.")
         adjustment <- (-1) / (R - 1) * probs_exp_cumsum * (1 - probs_exp_cumsum)
         adjustment <- colSums(adjustment)
         rps[, i, j] <- rps[, i, j] + adjustment
@@ -384,5 +406,3 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NUL
 
   return(rps)
 }
-
-
-- 
GitLab
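The Fair correction added above subtracts an ensemble-size penalty built from the cumulative forecast probabilities; when the inputs are pre-computed probabilities, the member count has to come from `nmemb`. A single-time-step sketch of the same arithmetic, with illustrative values:

fair_rps_step <- function(p_exp, p_obs, R) {
  cum_exp <- cumsum(p_exp)            # cumulative forecast probabilities
  cum_obs <- cumsum(p_obs)            # cumulative observed outcome (0/1)
  rps <- sum((cum_exp - cum_obs)^2)   # standard RPS for one time step
  rps - sum(cum_exp * (1 - cum_exp)) / (R - 1)  # FairRPS adjustment
}
fair_rps_step(p_exp = c(0.2, 0.5, 0.3), p_obs = c(0, 1, 0), R = 25)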
From c36ebc09d3abdf0516ea5df4f6260aab47ff6a5f Mon Sep 17 00:00:00 2001
From: vagudets
Date: Wed, 21 May 2025 12:31:28 +0200
Subject: [PATCH 19/22] Adapt Visualization functions to named probabilities/named RPS(S)

---
 modules/Visualization/R/plot_extreme_probs.R | 26 +++++++---
 modules/Visualization/R/plot_metrics.R       | 32 ++++++++++--
 .../R/plot_most_likely_terciles_map.R        | 50 +++++++++++++------
 modules/Visualization/Visualization.R        | 40 +++++++--------
 4 files changed, 101 insertions(+), 47 deletions(-)

diff --git a/modules/Visualization/R/plot_extreme_probs.R b/modules/Visualization/R/plot_extreme_probs.R
index d0d38e5d..2797cff1 100644
--- a/modules/Visualization/R/plot_extreme_probs.R
+++ b/modules/Visualization/R/plot_extreme_probs.R
@@ -14,7 +14,6 @@ plot_extreme_probs <- function(recipe, fcst,
                                outdir, output_conf,
                                brks_min = 5, brks_max = 95,
                                logo = NULL) {
-  ## TODO: Add 'anomaly' to plot title
   # Abort if frequency is daily
   if (recipe$Analysis$Variables$freq %in% c("daily", "daily_mean")) {
     stop("Visualization functions not yet implemented for daily data.")
@@ -67,6 +66,8 @@ plot_extreme_probs <- function(recipe, fcst,
   } else {
     projection <- "cylindrical_equidistant"
   }
+  ## TODO: improve handling of mask
+  mask_list <- mask
   # Define indices and filter binary probabilities
   probs_index <- which(vapply(recipe$Analysis$Workflow$Probabilities$percentiles,
                               length, integer(1)) == 1)
@@ -176,26 +177,37 @@ plot_extreme_probs <- function(recipe, fcst,
                         "-", i_syear)
       # Mask
       if (!is.null(mask)) {
-        outfile <- paste0(outfile, "_enscormask")
-        var_mask <- ClimProjDiags::Subset(mask,
+        if (is.list(mask)) {
+          mask_p <- mask[[paste0("rpss-", names(probs_index)[p])]]
+        } else {
+          mask_p <- mask
+        }
+        ## TODO: Change name?
+        outfile <- paste0(outfile, "_rpssmask")
+        var_mask <- ClimProjDiags::Subset(mask_p,
                                           along = c("var"),
                                           indices = list(var),
                                           drop = 'selected')
         dim_mask <- dim(var_mask)
-        var_mask <- as.numeric(var_mask <= 0)
+        var_mask <- as.numeric(var_mask >= 0)
         dim(var_mask) <- dim_mask
       } else {
         var_mask <- NULL
       }
       # Dots
       if (!is.null(dots)) {
-        outfile <- paste0(outfile, "_enscordots")
-        var_dots <- ClimProjDiags::Subset(dots,
+        if (is.list(dots)) {
+          dots_p <- dots[[paste0("rpss-", names(probs_index)[p])]]
+        } else {
+          dots_p <- dots
+        }
+        outfile <- paste0(outfile, "_rpssdots")
+        var_dots <- ClimProjDiags::Subset(dots_p,
                                           along = c("var"),
                                           indices = var,
                                           drop = 'selected')
         dim_dots <- dim(var_dots)
-        var_dots <- as.numeric(var_dots <= 0)
+        var_dots <- as.numeric(var_dots >= 0)
         dim(var_dots) <- dim_dots
       } else {
         var_dots <- NULL
       }
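The `is.list()` dispatch repeated above for `mask` and `dots` accepts either a single array or a named list of per-set arrays. A sketch of the pattern as a standalone helper; the `rpss-set1` key mirrors the naming convention used in this branch:

select_layer <- function(x, key) {
  if (is.null(x)) return(NULL)      # nothing to mask with
  if (is.list(x)) x[[key]] else x   # per-set list or plain array
}
masks <- list("rpss-set1" = array(runif(4), dim = c(var = 1, region = 4)))
str(select_layer(masks, "rpss-set1"))  # picks the per-set array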
diff --git a/modules/Visualization/R/plot_metrics.R b/modules/Visualization/R/plot_metrics.R
index e35d8999..77c477ea 100644
--- a/modules/Visualization/R/plot_metrics.R
+++ b/modules/Visualization/R/plot_metrics.R
@@ -85,6 +85,11 @@ plot_metrics <- function(recipe, data_cube, metrics,
     sequential_palette <- "Reds"
   }
   # Group different metrics by type
+  metrics_not_to_plot <- names(metrics)[grep("_significance|metadata|_individual_members",
+                                             names(metrics))]
+  metrics_to_plot <- names(metrics)[!names(metrics) %in% metrics_not_to_plot]
+  probability_scores <- c("rps-")
+  probability_skill_scores <- c("rpss-")
   skill_scores <- c("rpss", "bss90", "bss10", "frpss", "crpss", "mean_bias_ss",
                     "enscorr", "rpss_specs", "bss90_specs", "bss10_specs",
                     "enscorr_specs", "rmsss", "msss", "fbss10", "fbss90")
@@ -98,8 +103,10 @@ plot_metrics <- function(recipe, data_cube, metrics,
                          ClimProjDiags::Subset(x, along = 'var',
                                                indices = var,
                                                drop = 'selected')})
-  for (name in c(skill_scores, scores, statistics, "mean_bias", "enssprerr")) {
-    if (name %in% names(metrics)) {
+  ## TODO: Remove layer?
+  for (name in metrics_to_plot) {
+    # for (name in c(skill_scores, scores, statistics, "mean_bias", "enssprerr")) {
+    #   if (name %in% names(metrics)) {
       units <- NULL
       # Define plot characteristics and metric name to display in plot
       if (name %in% c("rpss", "bss90", "bss10", "frpss", "crpss",
@@ -196,8 +203,25 @@ plot_metrics <- function(recipe, data_cube, metrics,
         cols <- colorbar[2:(length(colorbar) - 1)]
         col_inf <- NULL
         col_sup <- NULL
+      } else if (grepl(paste0(probability_scores, collapse = "|"), name)) {
+        metric <- var_metric[[name]]
+        display_name <- paste0(toupper(strsplit(name, "-")[[1]][1]),
+                               " (", strsplit(name, "-")[[1]][2], ")")
+        brks <- seq(0, 1, by = 0.1)
+        colorbar <- grDevices::hcl.colors(length(brks), sequential_palette)
+        cols <- colorbar[1:(length(colorbar) - 1)]
+        col_inf <- NULL
+        col_sup <- colorbar[length(colorbar)]
+      } else if (grepl(paste0(probability_skill_scores, collapse = "|"), name)) {
+        display_name <- paste0(toupper(strsplit(name, "-")[[1]][1]),
+                               " (", strsplit(name, "-")[[1]][2], ")")
+        metric <- var_metric[[name]]
+        brks <- seq(-1, 1, by = 0.2)
+        colorbar <- clim.colors(length(brks) + 1, diverging_palette)
+        cols <- colorbar[2:(length(colorbar) - 1)]
+        col_inf <- colorbar[1]
+        col_sup <- NULL
       }
-
       # Reorder dimensions
       metric <- Reorder(metric, c("time", "longitude", "latitude"))
       # If the significance has been requested and the variable has it,
@@ -495,7 +519,7 @@ plot_metrics <- function(recipe, data_cube, metrics,
           }
         }
       }
-    }
+    # }
   }
 }
 if (return_elements) {
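For a named score such as `rps-set1`, the new branches above split the key on `-` to build the plot title and pick fixed breaks. The same logic in isolation; the score name is illustrative and `hcl.colors()` is base R:

name <- "rps-set1"
parts <- strsplit(name, "-")[[1]]
display_name <- paste0(toupper(parts[1]), " (", parts[2], ")")  # "RPS (set1)"
brks <- seq(0, 1, by = 0.1)
colorbar <- grDevices::hcl.colors(length(brks), "Reds")
cols <- colorbar[1:(length(colorbar) - 1)]   # colours for the in-range bins
col_sup <- colorbar[length(colorbar)]        # colour for values above the last break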
diff --git a/modules/Visualization/R/plot_most_likely_terciles_map.R b/modules/Visualization/R/plot_most_likely_terciles_map.R
index 54d5a735..300de65a 100644
--- a/modules/Visualization/R/plot_most_likely_terciles_map.R
+++ b/modules/Visualization/R/plot_most_likely_terciles_map.R
@@ -59,15 +59,22 @@ plot_most_likely_terciles <- function(recipe,
                                       recipe$Analysis$Time$sdate)
     init_date <- 1
   }
+  ## Mask adjustments; remove when no longer needed
+  if (!is.null(mask) && ("rpss" %in% names(mask))) {
+    mask <- mask$rpss
+  }
+  if (!is.null(dots) && ("rpss" %in% names(dots))) {
+    dots <- dots$rpss
+  }
   # Retrieve and rearrange probability bins for the forecast
   if ("bin" %in% names(dim(probabilities[[1]]))) {
     ## TODO: Adjust array dimensions if needed. What if there is more than one
     ## set of categories?
-    terciles_index <- which(lapply(recipe$Analysis$Workflow$Probabilities$percentiles,
-                                   function(x) {
-                                     all(match(x, c("1/3", "2/3")))
-                                   }) == TRUE)
-    probabilities <- probabilities[[terciles_index]]
+    probs_index <- which(lapply(recipe$Analysis$Workflow$Probabilities$percentiles,
+                                function(x) {
+                                  all(match(x, c("1/3", "2/3")))
+                                }) == TRUE)
+    probabilities <- probabilities[[probs_index]]
     probs_fcst <- .drop_dims(probabilities)
   } else {
     if (is.null(probabilities$prob_b33) ||
@@ -122,27 +129,38 @@ plot_most_likely_terciles <- function(recipe,
                         paste0("forecast_most_likely_tercile-", i_syear))
       # Mask
       if (!is.null(mask)) {
+        ## TODO: Remove when no longer needed
+        if (is.list(mask)) {
+          mask_p <- mask[[paste0("rpss-", names(probs_index))]]
+        } else {
+          mask_p <- mask
+        }
         outfile <- paste0(outfile, "_rpssmask")
-        var_mask <- ClimProjDiags::Subset(mask,
+        var_mask <- ClimProjDiags::Subset(mask_p,
                                           along = c("var"),
-                                          indices = var,
+                                          indices = list(var),
                                           drop = 'selected')
         dim_mask <- dim(var_mask)
         var_mask <- as.numeric(var_mask <= 0)
-        dim(var_mask) <- dim_mask
+        dim(var_mask) <- dim_mask
       } else {
        var_mask <- NULL
      }
      # Dots
      if (!is.null(dots)) {
-       outfile <- paste0(outfile, "_rpssdots")
-       var_dots <- ClimProjDiags::Subset(dots,
-                                         along = c("var"),
-                                         indices = var,
-                                         drop = 'selected')
-       dim_dots <- dim(var_dots)
-       var_dots <- as.numeric(var_dots <= 0)
-       dim(var_dots) <- dim_dots
+       if (is.list(dots)) {
+         dots_p <- dots[[paste0("rpss-", names(probs_index)[p])]]
+       } else {
+         dots_p <- dots
+       }
+       outfile <- paste0(outfile, "_rpssdots")
+       var_dots <- ClimProjDiags::Subset(dots_p,
+                                         along = c("var"),
+                                         indices = var,
+                                         drop = 'selected')
+       dim_dots <- dim(var_dots)
+       var_dots <- as.numeric(var_dots <= 0)
+       dim(var_dots) <- dim_dots
      } else {
        var_dots <- NULL
      }
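The `which(lapply(...))` idiom above locates the terciles entry among the requested percentile sets. An equivalent, slightly stricter formulation with `identical()`; the set names are illustrative:

percentiles <- list(set1 = c("1/3", "2/3"), set2 = c("1/10", "9/10"))
probs_index <- which(vapply(percentiles,
                            function(x) identical(x, c("1/3", "2/3")),
                            logical(1)))
names(percentiles)[probs_index]  # "set1"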
diff --git a/modules/Visualization/Visualization.R b/modules/Visualization/Visualization.R
index 8c8ee629..38e917a3 100644
--- a/modules/Visualization/Visualization.R
+++ b/modules/Visualization/Visualization.R
@@ -248,14 +248,14 @@ Visualization <- function(recipe,
           error(recipe$Run$logger,
                 paste0("For the most likely terciles plot, skill_metrics ",
                        "needs to be provided to be masked."))
-        } else if (!('rpss' %in% names(skill_metrics))) {
-          error(recipe$Run$logger,
-                paste0("For the most likely terciles plot, rpss metric ",
-                       "needs to be provided to be masked."))
+        # } else if (!('rpss' %in% names(skill_metrics))) {
+        #   error(recipe$Run$logger,
+        #         paste0("For the most likely terciles plot, rpss metric ",
+        #                "needs to be provided to be masked."))
         } else {
           plot_most_likely_terciles(recipe, data$fcst,
                                     probabilities$fcst,
-                                    mask = skill_metrics$rpss,
+                                    mask = skill_metrics,
                                     dots = NULL,
                                     outdir, output_conf = output_conf,
                                     logo = logo)
@@ -267,15 +267,15 @@ Visualization <- function(recipe,
           error(recipe$Run$logger,
                 paste0("For the most likely terciles plot, skill_metrics ",
                        "needs to be provided for the dots."))
-        } else if (!('rpss' %in% names(skill_metrics))) {
-          error(recipe$Run$logger,
-                paste0("For the most likely terciles plot, rpss metric ",
-                       "needs to be provided for the dots."))
+        # } else if (!('rpss' %in% names(skill_metrics))) {
+        #   error(recipe$Run$logger,
+        #         paste0("For the most likely terciles plot, rpss metric ",
+        #                "needs to be provided for the dots."))
         } else {
           plot_most_likely_terciles(recipe, data$fcst,
                                     probabilities$fcst,
                                     mask = NULL,
-                                    dots = skill_metrics$rpss,
+                                    dots = skill_metrics,
                                     outdir, output_conf = output_conf,
                                     logo = logo)
         }
@@ -324,14 +324,14 @@ Visualization <- function(recipe,
           error(recipe$Run$logger,
                 paste0("For the extreme probabilities plot, skill_metrics ",
                        "needs to be provided to be masked."))
-        } else if (!('rpss' %in% names(skill_metrics))) {
-          error(recipe$Run$logger,
-                paste0("For the extreme probabilities plot, rpss metric ",
-                       "need to be provided to be masked."))
+        # } else if (!('rpss' %in% names(skill_metrics))) {
+        #   error(recipe$Run$logger,
+        #         paste0("For the extreme probabilities plot, rpss metric ",
+        #                "need to be provided to be masked."))
         } else {
           plot_extreme_probs(recipe = recipe, fcst = data$fcst,
                              probabilities = probabilities$fcst,
-                             mask = skill_metrics$rpss,
+                             mask = skill_metrics,
                              dots = NULL,
                              outdir, output_conf = output_conf,
                              brks_min = recipe$Analysis$Workflow$Visualization$brks_min,
@@ -345,15 +345,15 @@ Visualization <- function(recipe,
           error(recipe$Run$logger,
                 paste0("For the extreme probabilities plot, skill_metrics ",
                        "needs to be provided for the dots."))
-        } else if (!('rpss' %in% names(skill_metrics))) {
-          error(recipe$Run$logger,
-                paste0("For the extreme probabilities plot, rpss metric ",
-                       "needs to be provided for the dots."))
+        # } else if (!('rpss' %in% names(skill_metrics))) {
+        #   error(recipe$Run$logger,
+        #         paste0("For the extreme probabilities plot, rpss metric ",
+        #                "needs to be provided for the dots."))
         } else {
           plot_extreme_probs(recipe = recipe, fcst = data$fcst,
                              probabilities = probabilities$fcst,
                              mask = NULL,
                              dots = skill_metrics,
                              outdir, output_conf = output_conf,
                              brks_min = recipe$Analysis$Workflow$Visualization$brks_min,
                              brks_max = recipe$Analysis$Workflow$Visualization$brks_max,
-- 
GitLab
From e0647c6b65e64f7221637dad852824ff96abc794 Mon Sep 17 00:00:00 2001
From: vagudets
Date: Wed, 21 May 2025 14:24:20 +0200
Subject: [PATCH 20/22] Fix pipeline

---
 tests/testthat/test-decadal_monthly_2.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/testthat/test-decadal_monthly_2.R b/tests/testthat/test-decadal_monthly_2.R
index 44de32b8..e58c1e65 100644
--- a/tests/testthat/test-decadal_monthly_2.R
+++ b/tests/testthat/test-decadal_monthly_2.R
@@ -262,13 +262,13 @@ test_that("5. Visualization", {
 plots <- paste0(recipe$Run$output_dir, "/plots/")
 expect_equal(
 all(basename(list.files(plots, recursive = T)) %in%
-c("bss10_specs.pdf", "enscorr_specs.pdf", "forecast_ensemble_median-2020.pdf", "forecast_ensemble_median-2021.pdf", "forecast_most_likely_tercile-2020.pdf", "forecast_most_likely_tercile-2021.pdf", "frps_specs.pdf", "frps.pdf", "rpss_specs.pdf")
+c("bss10_specs.pdf", "enscorr_specs.pdf", "forecast_ensemble_median-2020.pdf", "forecast_ensemble_median-2021.pdf", "forecast_most_likely_tercile-2020.pdf", "forecast_most_likely_tercile-2021.pdf", "frps_specs.pdf", "frps.pdf", "rpss_specs.pdf", "frpss_specs.pdf")
 ),
 TRUE
 )
 expect_equal(
 length(list.files(plots, recursive = T)),
-9
+10
 )
 
 })
-- 
GitLab


From 5ae36ffe01711f25b7d00c63c76f30b0c9f0ab6c Mon Sep 17 00:00:00 2001
From: vagudets
Date: Wed, 21 May 2025 15:02:07 +0200
Subject: [PATCH 21/22] Avoid mixing POSIXt and PCICt types when subtracting dates

---
 modules/Saving/R/get_times.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/Saving/R/get_times.R b/modules/Saving/R/get_times.R
index 58ab1b60..52d773c5 100644
--- a/modules/Saving/R/get_times.R
+++ b/modules/Saving/R/get_times.R
@@ -22,7 +22,7 @@
   }
   dates <- as.PCICt(ClimProjDiags::Subset(data_cube$attrs$Dates, 'syear', 1),
                     cal = calendar)
-  leadtimes <- as.numeric(dates - init_date)/3600
+  leadtimes <- as.numeric(dates - as.PCICt(init_date, cal = calendar))/3600
 
   switch(fcst.horizon,
          "seasonal" = {time <- leadtimes; ref <- 'hours since ';
@@ -48,7 +48,7 @@
                                            indices = 1,
                                            drop = 'non-selected'),
                      cal = calendar)
-      y <- as.numeric(y - init_date)/3600
+      y <- as.numeric(y - as.PCICt(init_date, cal = calendar))/3600
       return(y)
     })
   # Generates time_bnds dimensions and the corresponding metadata.
-- 
GitLab
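The motivation for the change above: subtracting a POSIXct from a PCICt mixes calendar semantics, which matters for non-Gregorian model calendars. Converting the POSIXct operand first keeps the difference on a single calendar. A sketch with the PCICt package; the calendar and dates are illustrative:

library(PCICt)
calendar <- "365_day"                              # e.g. a no-leap model calendar
init_date <- as.POSIXct("1993-11-01", tz = "UTC")  # POSIXct, as passed by the caller
dates <- as.PCICt("1993-12-01", cal = calendar)    # PCICt on the model calendar
# as in get_times.R, the PCICt difference is divided by 3600 to obtain hours:
leadtimes <- as.numeric(dates - as.PCICt(format(init_date), cal = calendar)) / 3600
leadtimes                                          # lead time in hours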
From 46edb16b5da369e76d7f0c0bea8bf70f1c7b86df Mon Sep 17 00:00:00 2001
From: vagudets
Date: Wed, 21 May 2025 16:33:07 +0200
Subject: [PATCH 22/22] Fix decadal output file names when there is more than one fcst year

---
 modules/Saving/R/save_forecast.R      | 26 ++++++++++++++-----------
 modules/Saving/R/save_probabilities.R | 28 ++++++++++++++------------
 2 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/modules/Saving/R/save_forecast.R b/modules/Saving/R/save_forecast.R
index 6064bea7..95cee297 100644
--- a/modules/Saving/R/save_forecast.R
+++ b/modules/Saving/R/save_forecast.R
@@ -29,21 +29,21 @@ save_forecast <- function(recipe,
   # Compute initial date
   if (fcst.horizon == 'decadal') {
     if (type == 'hcst') {
-      init_date <- as.PCICt(paste0(as.numeric(recipe$Analysis$Time$hcst_start) + 1,
-                                   '-01-01'), cal = calendar)
+      init_dates <- as.PCICt(paste0(as.numeric(recipe$Analysis$Time$hcst_start) + 1,
+                                    '-01-01'), cal = calendar)
     } else if (type == 'fcst') {
-      init_date <- as.PCICt(paste0(as.numeric(recipe$Analysis$Time$fcst_year) + 1,
-                                   '-01-01'), cal = calendar)
+      init_dates <- as.PCICt(paste0(as.numeric(recipe$Analysis$Time$fcst_year) + 1,
+                                    '-01-01'), cal = calendar)
     }
   } else {
     if (type == 'hcst') {
-      init_date <- as.PCICt(paste0(recipe$Analysis$Time$hcst_start,
-                                   recipe$Analysis$Time$sdate),
-                            format = '%Y%m%d', cal = calendar)
+      init_dates <- as.PCICt(paste0(recipe$Analysis$Time$hcst_start,
+                                    recipe$Analysis$Time$sdate),
+                             format = '%Y%m%d', cal = calendar)
     } else if (type == 'fcst') {
-      init_date <- as.PCICt(paste0(recipe$Analysis$Time$fcst_year,
-                                   recipe$Analysis$Time$sdate),
-                            format = '%Y%m%d', cal = calendar)
+      init_dates <- as.PCICt(paste0(recipe$Analysis$Time$fcst_year,
+                                    recipe$Analysis$Time$sdate),
+                             format = '%Y%m%d', cal = calendar)
     }
   }
 
@@ -102,13 +102,17 @@ save_forecast <- function(recipe,
       attr(fcst[[1]], 'global_attrs') <- global_attributes
 
       # Select start date
+      if (type == "fcst") {
+        init_date <- as.POSIXct(init_dates[i])
+      } else if (type == "hcst") {
+        init_date <- as.POSIXct(init_dates)
+      }
       if (fcst.horizon == 'decadal') {
         ## NOTE: Not good to use data_cube$load_parameters$dat1 because decadal
         ## data has been reshaped
         # fcst.sdate <- format(as.Date(data_cube$Dates$start[i]), '%Y%m%d')
         # init_date is like "1990-11-01"
-        init_date <- as.POSIXct(init_date)
         fcst.sdate <- init_date +
           lubridate::years(syears_val[i] - lubridate::year(init_date))
         fcst.sdate <- format(fcst.sdate, '%Y%m%d')
diff --git a/modules/Saving/R/save_probabilities.R b/modules/Saving/R/save_probabilities.R
index 5fd29618..5e211934 100644
--- a/modules/Saving/R/save_probabilities.R
+++ b/modules/Saving/R/save_probabilities.R
@@ -40,20 +40,17 @@ save_probabilities <- function(recipe,
   }
 
   if (fcst.horizon == 'decadal') {
-    # if (global_attributes$system == 'Multimodel') {
-    #   init_month <- 11 #TODO: put as if init_month is January
-    # } else {
-    #   init_month <- archive$System[[recipe$Analysis$Datasets$System$name]]$initial_month
-    # }
-    # init_date <- as.PCICt(paste0(recipe$Analysis$Time$hcst_start, '-',
-    #                              sprintf('%02d', init_month), '-01'),
-    #                       cal = calendar)
-    init_date <- as.PCICt(paste0(as.numeric(recipe$Analysis$Time$hcst_start)+1,
-                                 '-01-01'), cal = calendar)
+    if (type == 'hcst') {
+      init_dates <- as.PCICt(paste0(as.numeric(recipe$Analysis$Time$hcst_start) + 1,
+                                    '-01-01'), cal = calendar)
+    } else if (type == 'fcst') {
+      init_dates <- as.PCICt(paste0(as.numeric(recipe$Analysis$Time$fcst_year) + 1,
+                                    '-01-01'), cal = calendar)
+    }
   } else {
-    init_date <- as.PCICt(paste0(recipe$Analysis$Time$hcst_start,
-                                 recipe$Analysis$Time$sdate),
-                          format = '%Y%m%d', cal = calendar)
+    init_dates <- as.PCICt(paste0(recipe$Analysis$Time$hcst_start,
+                                  recipe$Analysis$Time$sdate),
+                           format = '%Y%m%d', cal = calendar)
   }
 
   syears <- seq(1:dim(data_cube$data)['syear'][[1]])
@@ -102,6 +99,11 @@ save_probabilities <- function(recipe,
       attr(probs_syear[[1]], 'global_attrs') <- global_attributes
 
       # Select start date
+      if (type == "fcst") {
+        init_date <- as.POSIXct(init_dates[i])
+      } else if (type == "hcst") {
+        init_date <- as.POSIXct(init_dates)
+      }
       if (fcst.horizon == 'decadal') {
         # init_date is like "1990-11-01"
         init_date <- as.POSIXct(init_date)
-- 
GitLab
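The essence of the final fix: with several forecast years, `init_dates` becomes a vector and the file-naming loop must take the i-th element for forecasts, while the hindcast keeps a single start date. Reduced to its core, with illustrative years:

fcst_years <- c(2020, 2021)
init_dates <- as.POSIXct(paste0(fcst_years + 1, "-01-01"), tz = "UTC")
for (i in seq_along(fcst_years)) {
  init_date <- init_dates[i]            # per-syear start date
  print(format(init_date, "%Y%m%d"))    # goes into the output file name
}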