From 0394dd79cf87f970020b0ae31d1eefb612806164 Mon Sep 17 00:00:00 2001 From: nperez Date: Fri, 3 Nov 2023 17:45:53 +0100 Subject: [PATCH 1/6] check recipe scorecards --- tools/check_recipe.R | 30 +++++++++++++++++++++++++++--- tools/prepare_outputs.R | 2 +- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/tools/check_recipe.R b/tools/check_recipe.R index e4c59a55..339c8111 100644 --- a/tools/check_recipe.R +++ b/tools/check_recipe.R @@ -442,10 +442,11 @@ check_recipe <- function(recipe) { } # Skill - AVAILABLE_METRICS <- c("enscorr", "corr_individual_members", "rps", "rpss", - "frps", "frpss", "crps", "crpss", "bss10", "bss90", + AVAILABLE_METRICS <- c("enscorr", "corr_individual_members", "rps", "rps_syear", + "rpss", "frps", "frpss", "crps", "crps_syear", + "crpss", "bss10", "bss90", "mean_bias", "mean_bias_ss", "enssprerr", "rps_clim", - "rpss_clim", "enscorr_specs", "frps_specs", "rpss_specs", + "crps_clim", "enscorr_specs", "frps_specs", "rpss_specs", "frpss_specs", "bss10_specs", "bss90_specs") if ("Skill" %in% names(recipe$Analysis$Workflow)) { if (is.null(recipe$Analysis$Workflow$Skill$metric)) { @@ -571,6 +572,28 @@ check_recipe <- function(recipe) { } else { sc_metrics <- strsplit(recipe$Analysis$Workflow$Scorecards$metric, ", | |,")[[1]] + if (recipe$Analysis$Workflow$Scorecards$metric_aggregation == 'score') { + if ('rpss' %in% tolower(sc_metrics)) { + if (!('rps_clim_syear' %in% requested_metrics)) { + requested_metrics <- c(requested_metrics, 'rps_clim_syear') + } + if (!('rps_syear' %in% requested_metrics)) { + requested_metrics <- c(requested_metrics, 'rps_syear') + } + } + if ('crpss' %in% tolower(sc_metrics)) { + if (!('crps_clim_syear' %in% requested_metrics)) { + requested_metrics <- c(requested_metrics, 'crps_clim_syear') + } + if (!('crps_syear' %in% requested_metrics)) { + requested_metrics <- c(requested_metrics, 'crps_syear') + } + } + if ('enscorr' %in% tolower(sc_metrics)) { + recipe$Analysis$Workflow$Statistics <- c('variance', 'covariance') + } + recipe$Analysis$Workflow$Skill$metric <- requested_metrics + } if (!all(tolower(sc_metrics) %in% tolower(requested_metrics))) { error(recipe$Run$logger, paste0("All of the metrics requested under 'Scorecards' must ", @@ -718,4 +741,5 @@ check_recipe <- function(recipe) { } else { info(recipe$Run$logger, "##### RECIPE CHECK SUCCESSFULL #####") } + return(recipe) } diff --git a/tools/prepare_outputs.R b/tools/prepare_outputs.R index fa4d23ed..c7f3e3e6 100644 --- a/tools/prepare_outputs.R +++ b/tools/prepare_outputs.R @@ -94,7 +94,7 @@ prepare_outputs <- function(recipe_file, warn(recipe$Run$logger, "Recipe checks disabled. The recipe will not be checked for errors.") } else { - check_recipe(recipe) + recipe <- check_recipe(recipe) } # Restructure the recipe to make the atomic recipe more readable if (restructure) { -- GitLab From bee3bdbe4e5d9b0261e722ef5b287a7fa2f558ce Mon Sep 17 00:00:00 2001 From: Nadia Milders Date: Tue, 7 Nov 2023 16:25:24 +0100 Subject: [PATCH 2/6] included cross-validation in CRPS_clim and RPS_clim --- modules/Skill/R/CRPS_clim.R | 30 ++++++++++++++----- modules/Skill/R/RPS_clim.R | 12 +++++--- modules/Skill/Skill.R | 6 ++-- .../recipe_scorecards_s2s-suite.yml | 15 ++++++---- 4 files changed, 44 insertions(+), 19 deletions(-) diff --git a/modules/Skill/R/CRPS_clim.R b/modules/Skill/R/CRPS_clim.R index 36db4e94..b66cab78 100644 --- a/modules/Skill/R/CRPS_clim.R +++ b/modules/Skill/R/CRPS_clim.R @@ -1,13 +1,27 @@ -CRPS_clim <- function(obs, memb_dim ='ensemble'){ +# CRPS version for climatology +CRPS_clim <- function(obs, memb_dim ='ensemble', return_mean = TRUE, clim.cross.val= TRUE){ time_dim <- names(dim(obs)) obs_time_len <- dim(obs)[time_dim] - + + if (isFALSE(clim.cross.val)) { ## Without cross-validation ref <- array(data = rep(obs, each = obs_time_len), dim = c(obs_time_len, obs_time_len)) - names(dim(ref)) <- c(time_dim, memb_dim) - # ref: [sdate, memb] - # obs: [sdate] + } else if (isTRUE(clim.cross.val)) { ## With cross-validation (excluding the value of that year to create ref for that year) + ref <- array(data = NA, dim = c(obs_time_len, obs_time_len - 1)) + for (i in 1:obs_time_len) { + ref[i, ] <- obs[-i] + } + } + + names(dim(ref)) <- c(time_dim, memb_dim) + # ref: [sdate, memb] + # obs: [sdate] crps_ref <- s2dv:::.CRPS(exp = ref, obs = obs, time_dim = time_dim, memb_dim = memb_dim, - dat_dim = NULL, Fair = FALSE) - # crps_ref should be [sdate] - return(mean(crps_ref)) + dat_dim = NULL, Fair = FALSE) + + # crps_ref should be [sdate] + if (return_mean == TRUE) { + return(mean(crps_ref)) + } else { + return(crps_ref) + } } diff --git a/modules/Skill/R/RPS_clim.R b/modules/Skill/R/RPS_clim.R index e8b6452d..4a079cd4 100644 --- a/modules/Skill/R/RPS_clim.R +++ b/modules/Skill/R/RPS_clim.R @@ -1,12 +1,12 @@ # RPS version for climatology -RPS_clim <- function(obs, indices_for_clim = NULL, prob_thresholds = c(1/3, 2/3)) { +RPS_clim <- function(obs, indices_for_clim = NULL, prob_thresholds = c(1/3, 2/3), cross.val = TRUE, return_mean = TRUE) { if (is.null(indices_for_clim)){ indices_for_clim <- 1:length(obs) } obs_probs <- .GetProbs(data = obs, indices_for_quantiles = indices_for_clim, ## temporarily removed s2dv::: - prob_thresholds = prob_thresholds, weights = NULL, cross.val = T) ## here! + prob_thresholds = prob_thresholds, weights = NULL, cross.val = cross.val) # clim_probs: [bin, sdate] clim_probs <- c(prob_thresholds[1], diff(prob_thresholds), 1 - prob_thresholds[length(prob_thresholds)]) clim_probs <- array(clim_probs, dim = dim(obs_probs)) @@ -15,6 +15,10 @@ RPS_clim <- function(obs, indices_for_clim = NULL, prob_thresholds = c(1/3, 2/3) probs_clim_cumsum <- apply(clim_probs, 2, cumsum) probs_obs_cumsum <- apply(obs_probs, 2, cumsum) rps_ref <- apply((probs_clim_cumsum - probs_obs_cumsum)^2, 2, sum) - - return(mean(rps_ref)) + + if (return_mean == TRUE) { + return(mean(rps_ref)) + } else { + return(rps_ref) + } } diff --git a/modules/Skill/Skill.R b/modules/Skill/Skill.R index fd54f17f..dcd28eb7 100644 --- a/modules/Skill/Skill.R +++ b/modules/Skill/Skill.R @@ -89,7 +89,8 @@ Skill <- function(recipe, data, agg = 'global') { skill_metrics[[ metric ]] <- skill rps_clim <- Apply(list(data$obs$data), target_dims = c(time_dim, memb_dim), - RPS_clim)$output1 + cross.val = cross.val, + fun = RPS_clim)$output1 rps_clim <- .drop_dims(rps_clim) skill_metrics[[paste0(metric, "_clim")]] <- rps_clim # Ranked Probability Skill Score and Fair version @@ -140,7 +141,8 @@ Skill <- function(recipe, data, agg = 'global') { skill <- .drop_dims(skill) skill_metrics[[ metric ]] <- skill crps_clim <- Apply(list(data$obs$data), target_dims = time_dim, - fun = CRPS_clim, memb_dim = memb_dim)$output1 + fun = CRPS_clim, memb_dim = memb_dim, + clim.cross.val = cross.val)$output1 crps_clim <- .drop_dims(crps_clim) skill_metrics[['crps_clim']] <- crps_clim # CRPSS and FCRPSS diff --git a/recipes/atomic_recipes/recipe_scorecards_s2s-suite.yml b/recipes/atomic_recipes/recipe_scorecards_s2s-suite.yml index ae17f9cd..1fd6adc3 100644 --- a/recipes/atomic_recipes/recipe_scorecards_s2s-suite.yml +++ b/recipes/atomic_recipes/recipe_scorecards_s2s-suite.yml @@ -19,9 +19,9 @@ Analysis: hcst_start: '1993' # Mandatory, int: Hindcast start year 'YYYY' hcst_end: '2016' # Mandatory, int: Hindcast end year 'YYYY' ftime_min: 1 # Mandatory, int: First leadtime time step in months - ftime_max: 6 # Mandatory, int: Last leadtime time step in months + ftime_max: 2 # Mandatory, int: Last leadtime time step in months Region: - latmin: -90 # Mandatory, int: minimum latitude + latmin: 30 # Mandatory, int: minimum latitude latmax: 90 # Mandatory, int: maximum latitude lonmin: 0 # Mandatory, int: minimum longitude lonmax: 359.9 # Mandatory, int: maximum longitude @@ -31,13 +31,18 @@ Analysis: Workflow: Calibration: method: raw # Mandatory, str: Calibration method. See docu. + save: 'none' Anomalies: compute: yes - cross_validation: no + cross_validation: no + save: 'none' Skill: - metric: mean_bias EnsCorr rps rpss crps crpss EnsSprErr # str: Skill metric or list of skill metrics. See docu. + metric: rps rpss crps crpss # str: Skill metric or list of skill metrics. See docu. + cross_validation: yes + save: 'all' Probabilities: percentiles: [[1/3, 2/3], [1/10], [9/10]] # frac: Quantile thresholds. + save: 'none' Indicators: index: no ncores: 15 # Optional, int: number of cores, defaults to 1 @@ -46,5 +51,5 @@ Analysis: Run: Loglevel: INFO Terminal: yes - output_dir: /esarchive/scratch/nmilders/scorecards_data/to_system/cross_validation/tercile_cross_val/ECMWF-SEAS5/prlr/ + output_dir: /esarchive/scratch/nmilders/scorecards_data/test/ code_dir: /esarchive/scratch/nmilders/gitlab/git_clones/s2s-suite/ -- GitLab From cdb70ec6ffa65c751f2362505b02efce615ab74a Mon Sep 17 00:00:00 2001 From: Nadia Milders Date: Wed, 8 Nov 2023 14:22:10 +0100 Subject: [PATCH 3/6] Included cross-validation in RMSSS --- modules/Skill/R/tmp/RMSSS.R | 482 ++++++++++++++++++++++++++++++++++++ modules/Skill/Skill.R | 5 + 2 files changed, 487 insertions(+) create mode 100644 modules/Skill/R/tmp/RMSSS.R diff --git a/modules/Skill/R/tmp/RMSSS.R b/modules/Skill/R/tmp/RMSSS.R new file mode 100644 index 00000000..e2fc4a80 --- /dev/null +++ b/modules/Skill/R/tmp/RMSSS.R @@ -0,0 +1,482 @@ +#'Compute root mean square error skill score +#' +#'Compute the root mean square error skill score (RMSSS) between an array of +#'forecast 'exp' and an array of observation 'obs'. The two arrays should +#'have the same dimensions except along 'dat_dim' and 'memb_dim'. The RMSSSs +#'are computed along 'time_dim', the dimension which corresponds to the start +#'date dimension. +#'RMSSS computes the root mean square error skill score of each exp in 1:nexp +#'against each obs in 1:nobs which gives nexp * nobs RMSSS for each grid point +#'of the array.\cr +#'The p-value and significance test are optionally provided by an one-sided +#'Fisher test or Random Walk test.\cr +#' +#'@param exp A named numeric array of experimental data which contains at least +#' time dimension (time_dim). It can also be a vector with the same length as +#' 'obs', then the vector will automatically be 'time_dim'. +#'@param obs A named numeric array of observational data which contains at least +#' time dimension (time_dim). The dimensions should be the same as parameter +#' 'exp' except the length of 'dat_dim' and 'memb_dim' dimension. It can also +#' be a vector with the same length as 'exp', then the vector will +#' automatically be 'time_dim'. +#'@param ref A named numerical array of the reference forecast data with at +#' least time dimension, or 0 (typical climatological forecast) or 1 +#' (normalized climatological forecast). If it is an array, the dimensions must +#' be the same as 'exp' except 'memb_dim' and 'dat_dim'. If there is only one +#' reference dataset, it should not have dataset dimension. If there is +#' corresponding reference for each experiment, the dataset dimension must +#' have the same length as in 'exp'. If 'ref' is NULL, the typical +#' climatological forecast is used as reference forecast (equivalent to 0.) +#' The default value is NULL. +#'@param dat_dim A character string indicating the name of dataset (nobs/nexp) +#' dimension. The default value is NULL. +#'@param time_dim A character string indicating the name of dimension along +#' which the RMSSS are computed. The default value is 'sdate'. +#'@param memb_dim A character string indicating the name of the member dimension +#' to compute the ensemble mean; it should be set to NULL if the data are +#' already the ensemble mean. The default value is NULL. +#'@param pval A logical value indicating whether to compute or not the p-value +#' of the test Ho: RMSSS = 0. The default value is TRUE. +#'@param sign A logical value indicating whether to compute or not the +#' statistical significance of the test Ho: RMSSS = 0. The default value is +#' FALSE. +#'@param alpha A numeric of the significance level to be used in the +#' statistical significance test. The default value is 0.05. +#'@param sig_method A character string indicating the significance method. The +#' options are "one-sided Fisher" (default) and "Random Walk". +#'@param sig_method.type A character string indicating the test type of the +#' significance method. Check \code{RandomWalkTest()} parameter +#' \code{test.type} for details if parameter "sig_method" is "Random Walk". The +#' default is NULL (since "one-sided Fisher" doesn't have different test +#' types.) +#'@param clim.cross.val A logical indicating whether to build the climatological +#' forecast in cross-validation (i.e. excluding the observed value of the time +#' step when building the probabilistic distribution function for that +#' particular time step). Only used if 'ref' is NULL. The default value is +#' NULL, in which case the climatological reference is set to zero. FALSE +#' indicates calculating the ref from the obs data without cross-validation, +#' and TRUE indicates calculating the ref from the obs data applying +#' cross-validation. +#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. +#' +#'@return +#'A list containing the numeric arrays with dimension:\cr +#' c(nexp, nobs, all other dimensions of exp except time_dim).\cr +#'nexp is the number of experiment (i.e., dat_dim in exp), and nobs is the +#'number of observation (i.e., dat_dim in obs). If dat_dim is NULL, nexp and +#'nobs are omitted.\cr +#'\item{$rmsss}{ +#' A numerical array of the root mean square error skill score. +#'} +#'\item{$p.val}{ +#' A numerical array of the p-value with the same dimensions as $rmsss. +#' Only present if \code{pval = TRUE}. +#'} +#'\item{sign}{ +#' A logical array of the statistical significance of the RMSSS with the same +#' dimensions as $rmsss. Only present if \code{sign = TRUE}. +#'} +#' +#'@examples +#'# Load sample data as in Load() example: +#'example(Load) +#'clim <- Clim(sampleData$mod, sampleData$obs) +#'ano_exp <- Ano(sampleData$mod, clim$clim_exp) +#'ano_obs <- Ano(sampleData$obs, clim$clim_obs) +#'rmsss <- RMSSS(ano_exp, ano_obs, dat_dim = 'dataset', memb_dim = 'member') +#' +#' set.seed(1) +#' exp1 <- array(rnorm(30), dim = c(dataset = 2, time = 3, memb = 5)) +#' set.seed(2) +#' obs1 <- array(rnorm(15), dim = c(time = 3, memb = 5, dataset = 1)) +#' res1 <- RMSSS(exp1, obs1, time_dim = 'time', dat_dim = 'dataset') +#' +#' exp2 <- array(rnorm(30), dim = c(lat = 2, time = 3, memb = 5)) +#' obs2 <- array(rnorm(15), dim = c(time = 3, lat = 2)) +#' res2 <- RMSSS(exp2, obs2, time_dim = 'time', memb_dim = 'memb') +#' +#' exp3 <- array(rnorm(30), dim = c(lat = 2, time = 3)) +#' obs3 <- array(rnorm(15), dim = c(time = 3, lat = 2)) +#' res3 <- RMSSS(exp3, obs3, time_dim = 'time') +#' +#'@rdname RMSSS +#'@import multiApply +#'@importFrom stats pf +#'@export +RMSSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', dat_dim = NULL, + memb_dim = NULL, pval = TRUE, clim.cross.val = NULL, sign = FALSE, alpha = 0.05, + sig_method = 'one-sided Fisher', sig_method.type = NULL, ncores = NULL) { + + # Check inputs + ## exp, obs, and ref (1) + if (is.null(exp) | is.null(obs)) { + stop("Parameter 'exp' and 'obs' cannot be NULL.") + } + if (!is.numeric(exp) | !is.numeric(obs)) { + stop("Parameter 'exp' and 'obs' must be a numeric array.") + } + if (is.null(dim(exp)) & is.null(dim(obs))) { #is vector + if (length(exp) == length(obs)) { + exp <- array(exp, dim = c(length(exp))) + names(dim(exp)) <- c(time_dim) + obs <- array(obs, dim = c(length(obs))) + names(dim(obs)) <- c(time_dim) + } else { + stop(paste0("Parameter 'exp' and 'obs' must be array with as least two ", + "dimensions time_dim and dat_dim, or vector of same length.")) + } + } else if (is.null(dim(exp)) | is.null(dim(obs))) { + stop(paste0("Parameter 'exp' and 'obs' must be array with as least two ", + "dimensions time_dim and dat_dim, or vector of same length.")) + } + if (any(is.null(names(dim(exp)))) | any(nchar(names(dim(exp))) == 0) | + any(is.null(names(dim(obs)))) | any(nchar(names(dim(obs))) == 0)) { + stop("Parameter 'exp' and 'obs' must have dimension names.") + } + if (!is.null(ref)) { + if (!is.numeric(ref)) { + stop("Parameter 'ref' must be numeric.") + } + if (is.array(ref)) { + if (any(is.null(names(dim(ref))))| any(nchar(names(dim(ref))) == 0)) { + stop("Parameter 'ref' must have dimension names.") + } + } else if (length(ref) != 1 | any(!ref %in% c(0, 1))) { + stop("Parameter 'ref' must be a numeric array or number 0 or 1.") + } + } else { + ref <- 0 + } + if (!is.array(ref)) { # 0 or 1 + ref <- array(data = ref, dim = dim(exp)) + } + + ## time_dim + if (!is.character(time_dim) | length(time_dim) > 1) { + stop("Parameter 'time_dim' must be a character string.") + } + if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) { + stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.") + } + ## dat_dim + if (!is.null(dat_dim)) { + if (!is.character(dat_dim) | length(dat_dim) > 1) { + stop("Parameter 'dat_dim' must be a character string or NULL.") + } + if (!dat_dim %in% names(dim(exp)) | !dat_dim %in% names(dim(obs))) { + stop("Parameter 'dat_dim' is not found in 'exp' or 'obs' dimension.", + " Set it as NULL if there is no dataset dimension.") + } + } + ## memb_dim + if (!is.null(memb_dim)) { + if (!is.character(memb_dim) | length(memb_dim) > 1) { + stop("Parameter 'memb_dim' must be a character string.") + } + if (!memb_dim %in% names(dim(exp))) { + stop("Parameter 'memb_dim' is not found in 'exp' dimension.") + } + } + ## pval + if (!is.logical(pval) | length(pval) > 1) { + stop("Parameter 'pval' must be one logical value.") + } + ## sign + if (!is.logical(sign) | length(sign) > 1) { + stop("Parameter 'sign' must be one logical value.") + } + ## alpha + if (!is.numeric(alpha) | length(alpha) > 1) { + stop("Parameter 'alpha' must be one numeric value.") + } + ## sig_method + if (length(sig_method) != 1 | !any(sig_method %in% c('one-sided Fisher', 'Random Walk'))) { + stop("Parameter 'sig_method' must be one of 'one-sided Fisher' or 'Random Walk'.") + } + ## sig_method.type + if (sig_method == 'Random Walk') { + if (is.null(sig_method.type)) { + .warning("Parameter 'sig_method.type' must be specified if 'sig_method' is ", + "Random Walk. Assign it as 'two.sided'.") + .warning("Note that in s2dv <= 1.4.1, Random Walk uses 'two.sided.approx' method.", + "If you want to retain the same functionality, please specify parameter ", + "'sig_method.type' as 'two.sided.approx'.") + sig_method.type <- "two.sided" + } + if (!any(sig_method.type %in% c('two.sided.approx', 'two.sided', 'greater', 'less'))) { + stop("Parameter 'sig_method.type' must be a method accepted by RandomWalkTest() parameter 'test.type'.") + } + if (sig_method.type == 'two.sided.approx' & pval == T) { + .warning("p-value cannot be calculated by Random Walk 'two.sided.approx' method.") + pval <- FALSE + if (alpha != 0.05) { + .warning("DelSole and Tippett (2016) aproximation is valid for alpha ", + "= 0.05 only. Returning the significance at the 0.05 significance level.") + alpha <- 0.05 + } + } + } + ## ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | length(ncores) > 1) { + stop("Parameter 'ncores' must be a positive integer.") + } + } + ## exp, obs, and ref (2) + name_exp <- sort(names(dim(exp))) + name_obs <- sort(names(dim(obs))) + if (!is.null(memb_dim)) { + if (memb_dim %in% name_exp) { + name_exp <- name_exp[-which(name_exp == memb_dim)] + } + if (memb_dim %in% name_obs) { + name_obs <- name_obs[-which(name_obs == memb_dim)] + } + } + if (!is.null(dat_dim)) { + name_exp <- name_exp[-which(name_exp == dat_dim)] + name_obs <- name_obs[-which(name_obs == dat_dim)] + } + if (!all(name_exp == name_obs)) { + stop("Parameter 'exp' and 'obs' must have the same dimension names.") + } + if (!all(dim(exp)[name_exp] == dim(obs)[name_obs])) { + stop(paste0("Parameter 'exp' and 'obs' must have same length of ", + "all dimensions except 'dat_dim' and 'memb_dim'.")) + } + + name_ref <- sort(names(dim(ref))) + if (!is.null(memb_dim) && memb_dim %in% name_ref) { + name_ref <- name_ref[-which(name_ref == memb_dim)] + } + if (!is.null(dat_dim)) { + if (dat_dim %in% name_ref) { + if (!identical(dim(exp)[dat_dim], dim(ref)[dat_dim])) { + stop(paste0("If parameter 'ref' has dataset dimension, it must be ", + "equal to dataset dimension of 'exp'.")) + } + name_ref <- name_ref[-which(name_ref == dat_dim)] + } + } + if (!identical(length(name_exp), length(name_ref)) | + !identical(dim(exp)[name_exp], dim(ref)[name_ref])) { + stop(paste0("Parameter 'exp' and 'ref' must have the same length of ", + "all dimensions except 'memb_dim' and 'dat_dim' if there is ", + "only one reference dataset.")) + } + + if (dim(exp)[time_dim] <= 2) { + stop("The length of time_dim must be more than 2 to compute RMSSS.") + } + + + ############################### + # # Sort dimension + # name_exp <- names(dim(exp)) + # name_obs <- names(dim(obs)) + # order_obs <- match(name_exp, name_obs) + # obs <- Reorder(obs, order_obs) + + ############################### + ## Ensemble mean + if (!is.null(memb_dim)) { + if (memb_dim %in% names(dim(exp))) { + exp <- MeanDims(exp, memb_dim, na.rm = T) + } + if (memb_dim %in% names(dim(obs))) { + obs <- MeanDims(obs, memb_dim, na.rm = T) + } + if (memb_dim %in% names(dim(ref))) { + ref <- MeanDims(ref, memb_dim, na.rm = T) + } + } + + ############################### + # Calculate RMSSS + + data <- list(exp = exp, obs = obs, ref = ref) + if (!is.null(dat_dim)) { + if (dat_dim %in% names(dim(ref))) { + target_dims <- list(exp = c(time_dim, dat_dim), + obs = c(time_dim, dat_dim), + ref = c(time_dim, dat_dim)) + } else { + target_dims <- list(exp = c(time_dim, dat_dim), + obs = c(time_dim, dat_dim), + ref = c(time_dim)) + } + } else { + target_dims <- list(exp = time_dim, obs = time_dim, ref = time_dim) + } + + res <- Apply(data, + target_dims = target_dims, + fun = .RMSSS, + time_dim = time_dim, dat_dim = dat_dim, + pval = pval, sign = sign, alpha = alpha, + sig_method = sig_method, sig_method.type = sig_method.type, + ncores = ncores) + + return(res) +} + +.RMSSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', dat_dim = NULL, pval = TRUE, + sign = FALSE, alpha = 0.05, sig_method = 'one-sided Fisher', + sig_method.type = NULL) { + # exp: [sdate, (dat)] + # obs: [sdate, (dat)] + # ref: [sdate, (dat)] or NULL + + ## Previous ref code + # if (is.null(ref)) { + # ref <- array(data = 0, dim = dim(obs)) + # } else if (identical(ref, 0) | identical(ref, 1)) { + # ref <- array(ref, dim = dim(exp)) + # } + + ## Include ref condition for cross-validation + obs_time_len <- dim(obs)[time_dim] + + if (is.null(ref)) { + if (is.null(clim.cross.val)) { ## If clim.cross.val is NULL reference is set to 0 + ref <- array(data = 0, dim = dim(obs)) + } else if (isFALSE(clim.cross.val)) { ## Without cross-validation + ref <- array(data = rep(obs, each = obs_time_len), dim = c(obs_time_len, obs_time_len)) + } else if (isTRUE(clim.cross.val)) { ## With cross-validation (excluding the value of that year to create ref for that year) + ref <- array(data = NA, dim = c(obs_time_len, obs_time_len - 1)) + for (i in 1:obs_time_len) { + ref[i, ] <- obs[-i] + } + } + } else if (identical(ref, 0) | identical(ref, 1)) { + ref <- array(ref, dim = dim(exp)) + } + + if (is.null(dat_dim)) { + # exp: [sdate] + # obs: [sdate] + nexp <- 1 + nobs <- 1 + nref <- 1 + # Add dat dim back temporarily + dim(exp) <- c(dim(exp), dat = 1) + dim(obs) <- c(dim(obs), dat = 1) + dim(ref) <- c(dim(ref), dat = 1) + + } else { + # exp: [sdate, dat_exp] + # obs: [sdate, dat_obs] + nexp <- as.numeric(dim(exp)[2]) + nobs <- as.numeric(dim(obs)[2]) + if (dat_dim %in% names(dim(ref))) { + nref <- as.numeric(dim(ref)[2]) + } else { + dim(ref) <- c(dim(ref), dat = 1) + nref <- 1 + } + } + + nsdate <- as.numeric(dim(exp)[1]) + + # RMS of forecast + dif1 <- array(dim = c(nsdate, nexp, nobs)) + names(dim(dif1)) <- c(time_dim, 'nexp', 'nobs') + + for (i in 1:nobs) { + dif1[, , i] <- sapply(1:nexp, function(x) {exp[, x] - obs[, i]}) + } + + rms_exp <- colMeans(dif1^2, na.rm = TRUE)^0.5 # [nexp, nobs] + + # RMS of reference + dif2 <- array(dim = c(nsdate, nref, nobs)) + names(dim(dif2)) <- c(time_dim, 'nexp', 'nobs') + for (i in 1:nobs) { + dif2[, , i] <- sapply(1:nref, function(x) {ref[, x] - obs[, i]}) + } + rms_ref <- colMeans(dif2^2, na.rm = TRUE)^0.5 # [nref, nobs] + if (nexp != nref) { + # expand rms_ref to nexp (nref is 1) + rms_ref <- array(rms_ref, dim = c(nobs = nobs, nexp = nexp)) + rms_ref <- Reorder(rms_ref, c(2, 1)) + } + + rmsss <- 1 - rms_exp / rms_ref + + ################################################# + + if (sig_method == 'one-sided Fisher') { + p_val <- array(dim = c(nexp = nexp, nobs = nobs)) + ## pval and sign + if (pval || sign) { + eno1 <- Eno(dif1, time_dim) + if (is.null(ref)) { + eno2 <- Eno(obs, time_dim) + eno2 <- array(eno2, dim = c(nobs = nobs, nexp = nexp)) + eno2 <- Reorder(eno2, c(2, 1)) + } else { + eno2 <- Eno(dif2, time_dim) + if (nref != nexp) { + eno2 <- array(eno2, dim = c(nobs = nobs, nexp = nexp)) + eno2 <- Reorder(eno2, c(2, 1)) + } + } + + F.stat <- (eno2 * rms_ref^2 / (eno2 - 1)) / ((eno1 * rms_exp^2 / (eno1- 1))) + tmp <- !is.na(eno1) & !is.na(eno2) & eno1 > 2 & eno2 > 2 + p_val <- 1 - pf(F.stat, eno1 - 1, eno2 - 1) + if (sign) signif <- p_val <= alpha + # If there isn't enough valid data, return NA + p_val[which(!tmp)] <- NA + if (sign) signif[which(!tmp)] <- NA + + # change not enough valid data rmsss to NA + rmsss[which(!tmp)] <- NA + } + + } else if (sig_method == "Random Walk") { + + if (sign) signif <- array(dim = c(nexp = nexp, nobs = nobs)) + if (pval) p_val <- array(dim = c(nexp = nexp, nobs = nobs)) + + for (i in 1:nexp) { + for (j in 1:nobs) { + error_exp <- array(data = abs(exp[, i] - obs[, j]), dim = c(time = nsdate)) + if (nref == nexp) { + error_ref <- array(data = abs(ref[, i] - obs[, j]), dim = c(time = nsdate)) + } else { + # nref = 1 + error_ref <- array(data = abs(ref - obs[, j]), dim = c(time = nsdate)) + } + aux <- .RandomWalkTest(skill_A = error_exp, skill_B = error_ref, + test.type = sig_method.type, + pval = pval, sign = sign, alpha = alpha) + if (sign) signif[i, j] <- aux$sign + if (pval) p_val[i, j] <- aux$p.val + } + } + } + + ################################### + # Remove extra dimensions if dat_dim = NULL + if (is.null(dat_dim)) { + dim(rmsss) <- NULL + if (pval) dim(p_val) <- NULL + if (sign) dim(signif) <- NULL + } + ################################### + + # output + res <- list(rmsss = rmsss) + if (pval) { + p.val <- list(p.val = p_val) + res <- c(res, p.val) + } + if (sign) { + signif <- list(sign = signif) + res <- c(res, signif) + } + + return(res) +} diff --git a/modules/Skill/Skill.R b/modules/Skill/Skill.R index dcd28eb7..84f9233e 100644 --- a/modules/Skill/Skill.R +++ b/modules/Skill/Skill.R @@ -19,6 +19,10 @@ source("modules/Skill/R/tmp/GetProbs.R") source("modules/Skill/compute_skill_metrics.R") source("modules/Skill/compute_probabilities.R") +## Temporary +source("modules/Skill/R/tmp/RMSSS.R") +.RandomWalkTest <- s2dv:::.RandomWalkTest + Skill <- function(recipe, data, agg = 'global') { # data$hcst: s2dv_cube containing the hindcast @@ -233,6 +237,7 @@ Skill <- function(recipe, data, agg = 'global') { pval = FALSE, sign = TRUE, sig_method = 'Random Walk', + clim.cross.val = cross.val, ncores = ncores) # Compute ensemble mean and modify dimensions skill <- lapply(skill, function(x) { -- GitLab From a8990e703a875859c23124577132ee6b1c66f44d Mon Sep 17 00:00:00 2001 From: Nadia Milders Date: Thu, 9 Nov 2023 16:29:14 +0100 Subject: [PATCH 4/6] changed scorecards check --- tools/check_recipe.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/check_recipe.R b/tools/check_recipe.R index 339c8111..dadaa822 100644 --- a/tools/check_recipe.R +++ b/tools/check_recipe.R @@ -590,7 +590,7 @@ check_recipe <- function(recipe) { } } if ('enscorr' %in% tolower(sc_metrics)) { - recipe$Analysis$Workflow$Statistics <- c('variance', 'covariance') + recipe$Analysis$Workflow$Statistics <- c('standard_deviation', 'covariance') } recipe$Analysis$Workflow$Skill$metric <- requested_metrics } -- GitLab From 330deb317c4f4e1ff7d51408d0893ea3e1b401c9 Mon Sep 17 00:00:00 2001 From: Nadia Milders Date: Thu, 9 Nov 2023 16:35:41 +0100 Subject: [PATCH 5/6] removed RMSSS cross-val --- modules/Skill/R/tmp/RMSSS.R | 482 ------------------------------------ modules/Skill/Skill.R | 5 - 2 files changed, 487 deletions(-) delete mode 100644 modules/Skill/R/tmp/RMSSS.R diff --git a/modules/Skill/R/tmp/RMSSS.R b/modules/Skill/R/tmp/RMSSS.R deleted file mode 100644 index e2fc4a80..00000000 --- a/modules/Skill/R/tmp/RMSSS.R +++ /dev/null @@ -1,482 +0,0 @@ -#'Compute root mean square error skill score -#' -#'Compute the root mean square error skill score (RMSSS) between an array of -#'forecast 'exp' and an array of observation 'obs'. The two arrays should -#'have the same dimensions except along 'dat_dim' and 'memb_dim'. The RMSSSs -#'are computed along 'time_dim', the dimension which corresponds to the start -#'date dimension. -#'RMSSS computes the root mean square error skill score of each exp in 1:nexp -#'against each obs in 1:nobs which gives nexp * nobs RMSSS for each grid point -#'of the array.\cr -#'The p-value and significance test are optionally provided by an one-sided -#'Fisher test or Random Walk test.\cr -#' -#'@param exp A named numeric array of experimental data which contains at least -#' time dimension (time_dim). It can also be a vector with the same length as -#' 'obs', then the vector will automatically be 'time_dim'. -#'@param obs A named numeric array of observational data which contains at least -#' time dimension (time_dim). The dimensions should be the same as parameter -#' 'exp' except the length of 'dat_dim' and 'memb_dim' dimension. It can also -#' be a vector with the same length as 'exp', then the vector will -#' automatically be 'time_dim'. -#'@param ref A named numerical array of the reference forecast data with at -#' least time dimension, or 0 (typical climatological forecast) or 1 -#' (normalized climatological forecast). If it is an array, the dimensions must -#' be the same as 'exp' except 'memb_dim' and 'dat_dim'. If there is only one -#' reference dataset, it should not have dataset dimension. If there is -#' corresponding reference for each experiment, the dataset dimension must -#' have the same length as in 'exp'. If 'ref' is NULL, the typical -#' climatological forecast is used as reference forecast (equivalent to 0.) -#' The default value is NULL. -#'@param dat_dim A character string indicating the name of dataset (nobs/nexp) -#' dimension. The default value is NULL. -#'@param time_dim A character string indicating the name of dimension along -#' which the RMSSS are computed. The default value is 'sdate'. -#'@param memb_dim A character string indicating the name of the member dimension -#' to compute the ensemble mean; it should be set to NULL if the data are -#' already the ensemble mean. The default value is NULL. -#'@param pval A logical value indicating whether to compute or not the p-value -#' of the test Ho: RMSSS = 0. The default value is TRUE. -#'@param sign A logical value indicating whether to compute or not the -#' statistical significance of the test Ho: RMSSS = 0. The default value is -#' FALSE. -#'@param alpha A numeric of the significance level to be used in the -#' statistical significance test. The default value is 0.05. -#'@param sig_method A character string indicating the significance method. The -#' options are "one-sided Fisher" (default) and "Random Walk". -#'@param sig_method.type A character string indicating the test type of the -#' significance method. Check \code{RandomWalkTest()} parameter -#' \code{test.type} for details if parameter "sig_method" is "Random Walk". The -#' default is NULL (since "one-sided Fisher" doesn't have different test -#' types.) -#'@param clim.cross.val A logical indicating whether to build the climatological -#' forecast in cross-validation (i.e. excluding the observed value of the time -#' step when building the probabilistic distribution function for that -#' particular time step). Only used if 'ref' is NULL. The default value is -#' NULL, in which case the climatological reference is set to zero. FALSE -#' indicates calculating the ref from the obs data without cross-validation, -#' and TRUE indicates calculating the ref from the obs data applying -#' cross-validation. -#'@param ncores An integer indicating the number of cores to use for parallel -#' computation. The default value is NULL. -#' -#'@return -#'A list containing the numeric arrays with dimension:\cr -#' c(nexp, nobs, all other dimensions of exp except time_dim).\cr -#'nexp is the number of experiment (i.e., dat_dim in exp), and nobs is the -#'number of observation (i.e., dat_dim in obs). If dat_dim is NULL, nexp and -#'nobs are omitted.\cr -#'\item{$rmsss}{ -#' A numerical array of the root mean square error skill score. -#'} -#'\item{$p.val}{ -#' A numerical array of the p-value with the same dimensions as $rmsss. -#' Only present if \code{pval = TRUE}. -#'} -#'\item{sign}{ -#' A logical array of the statistical significance of the RMSSS with the same -#' dimensions as $rmsss. Only present if \code{sign = TRUE}. -#'} -#' -#'@examples -#'# Load sample data as in Load() example: -#'example(Load) -#'clim <- Clim(sampleData$mod, sampleData$obs) -#'ano_exp <- Ano(sampleData$mod, clim$clim_exp) -#'ano_obs <- Ano(sampleData$obs, clim$clim_obs) -#'rmsss <- RMSSS(ano_exp, ano_obs, dat_dim = 'dataset', memb_dim = 'member') -#' -#' set.seed(1) -#' exp1 <- array(rnorm(30), dim = c(dataset = 2, time = 3, memb = 5)) -#' set.seed(2) -#' obs1 <- array(rnorm(15), dim = c(time = 3, memb = 5, dataset = 1)) -#' res1 <- RMSSS(exp1, obs1, time_dim = 'time', dat_dim = 'dataset') -#' -#' exp2 <- array(rnorm(30), dim = c(lat = 2, time = 3, memb = 5)) -#' obs2 <- array(rnorm(15), dim = c(time = 3, lat = 2)) -#' res2 <- RMSSS(exp2, obs2, time_dim = 'time', memb_dim = 'memb') -#' -#' exp3 <- array(rnorm(30), dim = c(lat = 2, time = 3)) -#' obs3 <- array(rnorm(15), dim = c(time = 3, lat = 2)) -#' res3 <- RMSSS(exp3, obs3, time_dim = 'time') -#' -#'@rdname RMSSS -#'@import multiApply -#'@importFrom stats pf -#'@export -RMSSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', dat_dim = NULL, - memb_dim = NULL, pval = TRUE, clim.cross.val = NULL, sign = FALSE, alpha = 0.05, - sig_method = 'one-sided Fisher', sig_method.type = NULL, ncores = NULL) { - - # Check inputs - ## exp, obs, and ref (1) - if (is.null(exp) | is.null(obs)) { - stop("Parameter 'exp' and 'obs' cannot be NULL.") - } - if (!is.numeric(exp) | !is.numeric(obs)) { - stop("Parameter 'exp' and 'obs' must be a numeric array.") - } - if (is.null(dim(exp)) & is.null(dim(obs))) { #is vector - if (length(exp) == length(obs)) { - exp <- array(exp, dim = c(length(exp))) - names(dim(exp)) <- c(time_dim) - obs <- array(obs, dim = c(length(obs))) - names(dim(obs)) <- c(time_dim) - } else { - stop(paste0("Parameter 'exp' and 'obs' must be array with as least two ", - "dimensions time_dim and dat_dim, or vector of same length.")) - } - } else if (is.null(dim(exp)) | is.null(dim(obs))) { - stop(paste0("Parameter 'exp' and 'obs' must be array with as least two ", - "dimensions time_dim and dat_dim, or vector of same length.")) - } - if (any(is.null(names(dim(exp)))) | any(nchar(names(dim(exp))) == 0) | - any(is.null(names(dim(obs)))) | any(nchar(names(dim(obs))) == 0)) { - stop("Parameter 'exp' and 'obs' must have dimension names.") - } - if (!is.null(ref)) { - if (!is.numeric(ref)) { - stop("Parameter 'ref' must be numeric.") - } - if (is.array(ref)) { - if (any(is.null(names(dim(ref))))| any(nchar(names(dim(ref))) == 0)) { - stop("Parameter 'ref' must have dimension names.") - } - } else if (length(ref) != 1 | any(!ref %in% c(0, 1))) { - stop("Parameter 'ref' must be a numeric array or number 0 or 1.") - } - } else { - ref <- 0 - } - if (!is.array(ref)) { # 0 or 1 - ref <- array(data = ref, dim = dim(exp)) - } - - ## time_dim - if (!is.character(time_dim) | length(time_dim) > 1) { - stop("Parameter 'time_dim' must be a character string.") - } - if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) { - stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.") - } - ## dat_dim - if (!is.null(dat_dim)) { - if (!is.character(dat_dim) | length(dat_dim) > 1) { - stop("Parameter 'dat_dim' must be a character string or NULL.") - } - if (!dat_dim %in% names(dim(exp)) | !dat_dim %in% names(dim(obs))) { - stop("Parameter 'dat_dim' is not found in 'exp' or 'obs' dimension.", - " Set it as NULL if there is no dataset dimension.") - } - } - ## memb_dim - if (!is.null(memb_dim)) { - if (!is.character(memb_dim) | length(memb_dim) > 1) { - stop("Parameter 'memb_dim' must be a character string.") - } - if (!memb_dim %in% names(dim(exp))) { - stop("Parameter 'memb_dim' is not found in 'exp' dimension.") - } - } - ## pval - if (!is.logical(pval) | length(pval) > 1) { - stop("Parameter 'pval' must be one logical value.") - } - ## sign - if (!is.logical(sign) | length(sign) > 1) { - stop("Parameter 'sign' must be one logical value.") - } - ## alpha - if (!is.numeric(alpha) | length(alpha) > 1) { - stop("Parameter 'alpha' must be one numeric value.") - } - ## sig_method - if (length(sig_method) != 1 | !any(sig_method %in% c('one-sided Fisher', 'Random Walk'))) { - stop("Parameter 'sig_method' must be one of 'one-sided Fisher' or 'Random Walk'.") - } - ## sig_method.type - if (sig_method == 'Random Walk') { - if (is.null(sig_method.type)) { - .warning("Parameter 'sig_method.type' must be specified if 'sig_method' is ", - "Random Walk. Assign it as 'two.sided'.") - .warning("Note that in s2dv <= 1.4.1, Random Walk uses 'two.sided.approx' method.", - "If you want to retain the same functionality, please specify parameter ", - "'sig_method.type' as 'two.sided.approx'.") - sig_method.type <- "two.sided" - } - if (!any(sig_method.type %in% c('two.sided.approx', 'two.sided', 'greater', 'less'))) { - stop("Parameter 'sig_method.type' must be a method accepted by RandomWalkTest() parameter 'test.type'.") - } - if (sig_method.type == 'two.sided.approx' & pval == T) { - .warning("p-value cannot be calculated by Random Walk 'two.sided.approx' method.") - pval <- FALSE - if (alpha != 0.05) { - .warning("DelSole and Tippett (2016) aproximation is valid for alpha ", - "= 0.05 only. Returning the significance at the 0.05 significance level.") - alpha <- 0.05 - } - } - } - ## ncores - if (!is.null(ncores)) { - if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | length(ncores) > 1) { - stop("Parameter 'ncores' must be a positive integer.") - } - } - ## exp, obs, and ref (2) - name_exp <- sort(names(dim(exp))) - name_obs <- sort(names(dim(obs))) - if (!is.null(memb_dim)) { - if (memb_dim %in% name_exp) { - name_exp <- name_exp[-which(name_exp == memb_dim)] - } - if (memb_dim %in% name_obs) { - name_obs <- name_obs[-which(name_obs == memb_dim)] - } - } - if (!is.null(dat_dim)) { - name_exp <- name_exp[-which(name_exp == dat_dim)] - name_obs <- name_obs[-which(name_obs == dat_dim)] - } - if (!all(name_exp == name_obs)) { - stop("Parameter 'exp' and 'obs' must have the same dimension names.") - } - if (!all(dim(exp)[name_exp] == dim(obs)[name_obs])) { - stop(paste0("Parameter 'exp' and 'obs' must have same length of ", - "all dimensions except 'dat_dim' and 'memb_dim'.")) - } - - name_ref <- sort(names(dim(ref))) - if (!is.null(memb_dim) && memb_dim %in% name_ref) { - name_ref <- name_ref[-which(name_ref == memb_dim)] - } - if (!is.null(dat_dim)) { - if (dat_dim %in% name_ref) { - if (!identical(dim(exp)[dat_dim], dim(ref)[dat_dim])) { - stop(paste0("If parameter 'ref' has dataset dimension, it must be ", - "equal to dataset dimension of 'exp'.")) - } - name_ref <- name_ref[-which(name_ref == dat_dim)] - } - } - if (!identical(length(name_exp), length(name_ref)) | - !identical(dim(exp)[name_exp], dim(ref)[name_ref])) { - stop(paste0("Parameter 'exp' and 'ref' must have the same length of ", - "all dimensions except 'memb_dim' and 'dat_dim' if there is ", - "only one reference dataset.")) - } - - if (dim(exp)[time_dim] <= 2) { - stop("The length of time_dim must be more than 2 to compute RMSSS.") - } - - - ############################### - # # Sort dimension - # name_exp <- names(dim(exp)) - # name_obs <- names(dim(obs)) - # order_obs <- match(name_exp, name_obs) - # obs <- Reorder(obs, order_obs) - - ############################### - ## Ensemble mean - if (!is.null(memb_dim)) { - if (memb_dim %in% names(dim(exp))) { - exp <- MeanDims(exp, memb_dim, na.rm = T) - } - if (memb_dim %in% names(dim(obs))) { - obs <- MeanDims(obs, memb_dim, na.rm = T) - } - if (memb_dim %in% names(dim(ref))) { - ref <- MeanDims(ref, memb_dim, na.rm = T) - } - } - - ############################### - # Calculate RMSSS - - data <- list(exp = exp, obs = obs, ref = ref) - if (!is.null(dat_dim)) { - if (dat_dim %in% names(dim(ref))) { - target_dims <- list(exp = c(time_dim, dat_dim), - obs = c(time_dim, dat_dim), - ref = c(time_dim, dat_dim)) - } else { - target_dims <- list(exp = c(time_dim, dat_dim), - obs = c(time_dim, dat_dim), - ref = c(time_dim)) - } - } else { - target_dims <- list(exp = time_dim, obs = time_dim, ref = time_dim) - } - - res <- Apply(data, - target_dims = target_dims, - fun = .RMSSS, - time_dim = time_dim, dat_dim = dat_dim, - pval = pval, sign = sign, alpha = alpha, - sig_method = sig_method, sig_method.type = sig_method.type, - ncores = ncores) - - return(res) -} - -.RMSSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', dat_dim = NULL, pval = TRUE, - sign = FALSE, alpha = 0.05, sig_method = 'one-sided Fisher', - sig_method.type = NULL) { - # exp: [sdate, (dat)] - # obs: [sdate, (dat)] - # ref: [sdate, (dat)] or NULL - - ## Previous ref code - # if (is.null(ref)) { - # ref <- array(data = 0, dim = dim(obs)) - # } else if (identical(ref, 0) | identical(ref, 1)) { - # ref <- array(ref, dim = dim(exp)) - # } - - ## Include ref condition for cross-validation - obs_time_len <- dim(obs)[time_dim] - - if (is.null(ref)) { - if (is.null(clim.cross.val)) { ## If clim.cross.val is NULL reference is set to 0 - ref <- array(data = 0, dim = dim(obs)) - } else if (isFALSE(clim.cross.val)) { ## Without cross-validation - ref <- array(data = rep(obs, each = obs_time_len), dim = c(obs_time_len, obs_time_len)) - } else if (isTRUE(clim.cross.val)) { ## With cross-validation (excluding the value of that year to create ref for that year) - ref <- array(data = NA, dim = c(obs_time_len, obs_time_len - 1)) - for (i in 1:obs_time_len) { - ref[i, ] <- obs[-i] - } - } - } else if (identical(ref, 0) | identical(ref, 1)) { - ref <- array(ref, dim = dim(exp)) - } - - if (is.null(dat_dim)) { - # exp: [sdate] - # obs: [sdate] - nexp <- 1 - nobs <- 1 - nref <- 1 - # Add dat dim back temporarily - dim(exp) <- c(dim(exp), dat = 1) - dim(obs) <- c(dim(obs), dat = 1) - dim(ref) <- c(dim(ref), dat = 1) - - } else { - # exp: [sdate, dat_exp] - # obs: [sdate, dat_obs] - nexp <- as.numeric(dim(exp)[2]) - nobs <- as.numeric(dim(obs)[2]) - if (dat_dim %in% names(dim(ref))) { - nref <- as.numeric(dim(ref)[2]) - } else { - dim(ref) <- c(dim(ref), dat = 1) - nref <- 1 - } - } - - nsdate <- as.numeric(dim(exp)[1]) - - # RMS of forecast - dif1 <- array(dim = c(nsdate, nexp, nobs)) - names(dim(dif1)) <- c(time_dim, 'nexp', 'nobs') - - for (i in 1:nobs) { - dif1[, , i] <- sapply(1:nexp, function(x) {exp[, x] - obs[, i]}) - } - - rms_exp <- colMeans(dif1^2, na.rm = TRUE)^0.5 # [nexp, nobs] - - # RMS of reference - dif2 <- array(dim = c(nsdate, nref, nobs)) - names(dim(dif2)) <- c(time_dim, 'nexp', 'nobs') - for (i in 1:nobs) { - dif2[, , i] <- sapply(1:nref, function(x) {ref[, x] - obs[, i]}) - } - rms_ref <- colMeans(dif2^2, na.rm = TRUE)^0.5 # [nref, nobs] - if (nexp != nref) { - # expand rms_ref to nexp (nref is 1) - rms_ref <- array(rms_ref, dim = c(nobs = nobs, nexp = nexp)) - rms_ref <- Reorder(rms_ref, c(2, 1)) - } - - rmsss <- 1 - rms_exp / rms_ref - - ################################################# - - if (sig_method == 'one-sided Fisher') { - p_val <- array(dim = c(nexp = nexp, nobs = nobs)) - ## pval and sign - if (pval || sign) { - eno1 <- Eno(dif1, time_dim) - if (is.null(ref)) { - eno2 <- Eno(obs, time_dim) - eno2 <- array(eno2, dim = c(nobs = nobs, nexp = nexp)) - eno2 <- Reorder(eno2, c(2, 1)) - } else { - eno2 <- Eno(dif2, time_dim) - if (nref != nexp) { - eno2 <- array(eno2, dim = c(nobs = nobs, nexp = nexp)) - eno2 <- Reorder(eno2, c(2, 1)) - } - } - - F.stat <- (eno2 * rms_ref^2 / (eno2 - 1)) / ((eno1 * rms_exp^2 / (eno1- 1))) - tmp <- !is.na(eno1) & !is.na(eno2) & eno1 > 2 & eno2 > 2 - p_val <- 1 - pf(F.stat, eno1 - 1, eno2 - 1) - if (sign) signif <- p_val <= alpha - # If there isn't enough valid data, return NA - p_val[which(!tmp)] <- NA - if (sign) signif[which(!tmp)] <- NA - - # change not enough valid data rmsss to NA - rmsss[which(!tmp)] <- NA - } - - } else if (sig_method == "Random Walk") { - - if (sign) signif <- array(dim = c(nexp = nexp, nobs = nobs)) - if (pval) p_val <- array(dim = c(nexp = nexp, nobs = nobs)) - - for (i in 1:nexp) { - for (j in 1:nobs) { - error_exp <- array(data = abs(exp[, i] - obs[, j]), dim = c(time = nsdate)) - if (nref == nexp) { - error_ref <- array(data = abs(ref[, i] - obs[, j]), dim = c(time = nsdate)) - } else { - # nref = 1 - error_ref <- array(data = abs(ref - obs[, j]), dim = c(time = nsdate)) - } - aux <- .RandomWalkTest(skill_A = error_exp, skill_B = error_ref, - test.type = sig_method.type, - pval = pval, sign = sign, alpha = alpha) - if (sign) signif[i, j] <- aux$sign - if (pval) p_val[i, j] <- aux$p.val - } - } - } - - ################################### - # Remove extra dimensions if dat_dim = NULL - if (is.null(dat_dim)) { - dim(rmsss) <- NULL - if (pval) dim(p_val) <- NULL - if (sign) dim(signif) <- NULL - } - ################################### - - # output - res <- list(rmsss = rmsss) - if (pval) { - p.val <- list(p.val = p_val) - res <- c(res, p.val) - } - if (sign) { - signif <- list(sign = signif) - res <- c(res, signif) - } - - return(res) -} diff --git a/modules/Skill/Skill.R b/modules/Skill/Skill.R index 84f9233e..dcd28eb7 100644 --- a/modules/Skill/Skill.R +++ b/modules/Skill/Skill.R @@ -19,10 +19,6 @@ source("modules/Skill/R/tmp/GetProbs.R") source("modules/Skill/compute_skill_metrics.R") source("modules/Skill/compute_probabilities.R") -## Temporary -source("modules/Skill/R/tmp/RMSSS.R") -.RandomWalkTest <- s2dv:::.RandomWalkTest - Skill <- function(recipe, data, agg = 'global') { # data$hcst: s2dv_cube containing the hindcast @@ -237,7 +233,6 @@ Skill <- function(recipe, data, agg = 'global') { pval = FALSE, sign = TRUE, sig_method = 'Random Walk', - clim.cross.val = cross.val, ncores = ncores) # Compute ensemble mean and modify dimensions skill <- lapply(skill, function(x) { -- GitLab From 83e97c77cd08bc4ccc100389c63162c6cf79648f Mon Sep 17 00:00:00 2001 From: Nadia Milders Date: Fri, 10 Nov 2023 17:47:23 +0100 Subject: [PATCH 6/6] Included _syear metrics --- modules/Skill/R/tmp/CRPS.R | 177 ++++++++ modules/Skill/R/tmp/RPS.R | 380 ++++++++++++++++++ modules/Skill/Skill.R | 70 +++- modules/Statistics/Statistics.R | 89 ++++ .../recipe_scorecards_s2s-suite.yml | 11 +- tools/check_recipe.R | 3 +- 6 files changed, 715 insertions(+), 15 deletions(-) create mode 100644 modules/Skill/R/tmp/CRPS.R create mode 100644 modules/Skill/R/tmp/RPS.R create mode 100644 modules/Statistics/Statistics.R diff --git a/modules/Skill/R/tmp/CRPS.R b/modules/Skill/R/tmp/CRPS.R new file mode 100644 index 00000000..c08375c4 --- /dev/null +++ b/modules/Skill/R/tmp/CRPS.R @@ -0,0 +1,177 @@ +#'Compute the Continuous Ranked Probability Score +#' +#'The Continuous Ranked Probability Score (CRPS; Wilks, 2011) is the continuous +#'version of the Ranked Probability Score (RPS; Wilks, 2011). It is a skill +#'metric to evaluate the full distribution of probabilistic forecasts. It has a +#'negative orientation (i.e., the higher-quality forecast the smaller CRPS) and +#'it rewards the forecast that has probability concentration around the observed +#'value. In case of a deterministic forecast, the CRPS is reduced to the mean +#'absolute error. It has the same units as the data. The function is based on +#'enscrps_cpp from SpecsVerification. If there is more than one dataset, CRPS +#'will be computed for each pair of exp and obs data. +#' +#'@param exp A named numerical array of the forecast with at least time +#' dimension. +#'@param obs A named numerical array of the observation with at least time +#' dimension. The dimensions must be the same as 'exp' except 'memb_dim' and +#' 'dat_dim'. +#'@param time_dim A character string indicating the name of the time dimension. +#' The default value is 'sdate'. +#'@param memb_dim A character string indicating the name of the member dimension +#' to compute the probabilities of the forecast. The default value is 'member'. +#'@param dat_dim A character string indicating the name of dataset dimension. +#' The length of this dimension can be different between 'exp' and 'obs'. The +#' default value is NULL. +#'@param Fair A logical indicating whether to compute the FairCRPS (the +#' potential CRPS that the forecast would have with an infinite ensemble size). +#' The default value is FALSE. +#'@return_mean A logical idicating whether to return the temporal mean of CRPS +#' or not. When TRUE the temporal mean is calculated, when FALSE the time +#' dimension is not aggregated. The default is TRUE. +#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. +#' +#'@return +#'A numerical array of CRPS with dimensions c(nexp, nobs, the rest dimensions of +#''exp' except 'time_dim' and 'memb_dim' dimensions). nexp is the number of +#'experiment (i.e., dat_dim in exp), and nobs is the number of observation +#'(i.e., dat_dim in obs). If dat_dim is NULL, nexp and nobs are omitted. +#' +#'@references +#'Wilks, 2011; https://doi.org/10.1016/B978-0-12-385022-5.00008-7 +#' +#'@examples +#'exp <- array(rnorm(1000), dim = c(lat = 3, lon = 2, member = 10, sdate = 50)) +#'obs <- array(rnorm(1000), dim = c(lat = 3, lon = 2, sdate = 50)) +#'res <- CRPS(exp = exp, obs = obs) +#' +#'@import multiApply +#'@importFrom SpecsVerification enscrps_cpp +#'@importFrom ClimProjDiags Subset +#'@export +CRPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', dat_dim = NULL, + Fair = FALSE, return_mean = TRUE, ncores = NULL) { + # Check inputs + ## exp and obs (1) + if (!is.array(exp) | !is.numeric(exp)) + stop("Parameter 'exp' must be a numeric array.") + if (!is.array(obs) | !is.numeric(obs)) + stop("Parameter 'obs' must be a numeric array.") + if(any(is.null(names(dim(exp))))| any(nchar(names(dim(exp))) == 0) | + any(is.null(names(dim(obs))))| any(nchar(names(dim(obs))) == 0)) { + stop("Parameter 'exp' and 'obs' must have dimension names.") + } + ## time_dim + if (!is.character(time_dim) | length(time_dim) != 1) + stop("Parameter 'time_dim' must be a character string.") + if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) { + stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.") + } + ## memb_dim + if (!is.character(memb_dim) | length(memb_dim) > 1) { + stop("Parameter 'memb_dim' must be a character string.") + } + if (!memb_dim %in% names(dim(exp))) { + stop("Parameter 'memb_dim' is not found in 'exp' dimension.") + } + ## dat_dim + if (!is.null(dat_dim)) { + if (!is.character(dat_dim) | length(dat_dim) > 1) { + stop("Parameter 'dat_dim' must be a character string.") + } + if (!dat_dim %in% names(dim(exp)) | !dat_dim %in% names(dim(obs))) { + stop("Parameter 'dat_dim' is not found in 'exp' or 'obs' dimension.", + " Set it as NULL if there is no dataset dimension.") + } + } + ## exp and obs (2) + if (memb_dim %in% names(dim(obs))) { + if (identical(as.numeric(dim(obs)[memb_dim]), 1)) { + obs <- ClimProjDiags::Subset(x = obs, along = memb_dim, indices = 1, drop = 'selected') + } else { + stop("Not implemented for observations with members ('obs' can have 'memb_dim', but it should be of length = 1).") + } + } + name_exp <- sort(names(dim(exp))) + name_obs <- sort(names(dim(obs))) + name_exp <- name_exp[-which(name_exp == memb_dim)] + if (!is.null(dat_dim)) { + name_exp <- name_exp[-which(name_exp == dat_dim)] + name_obs <- name_obs[-which(name_obs == dat_dim)] + } + if (!identical(length(name_exp), length(name_obs)) | + !identical(dim(exp)[name_exp], dim(obs)[name_obs])) { + stop(paste0("Parameter 'exp' and 'obs' must have same length of ", + "all dimensions except 'memb_dim' and 'dat_dim'.")) + } + ## Fair + if (!is.logical(Fair) | length(Fair) > 1) { + stop("Parameter 'Fair' must be either TRUE or FALSE.") + } + ## ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | length(ncores) > 1) { + stop("Parameter 'ncores' must be either NULL or a positive integer.") + } + } + + ############################### + + # Compute CRPS + crps <- Apply(data = list(exp = exp, obs = obs), + target_dims = list(exp = c(time_dim, memb_dim, dat_dim), + obs = c(time_dim, dat_dim)), + fun = .CRPS, + time_dim = time_dim, memb_dim = memb_dim, dat_dim = dat_dim, + Fair = Fair, + ncores = ncores)$output1 + + if (return_mean == TRUE) { + crps <- MeanDims(crps, time_dim, na.rm = FALSE) + } else { + crps <- crps + } + + return(crps) +} + +.CRPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', dat_dim = NULL, + Fair = FALSE) { + # exp: [sdate, memb, (dat_dim)] + # obs: [sdate, (dat_dim)] + + # Adjust dimensions if needed + if (is.null(dat_dim)) { + nexp <- 1 + nobs <- 1 + dim(exp) <- c(dim(exp), nexp = nexp) + dim(obs) <- c(dim(obs), nobs = nobs) + } else { + nexp <- as.numeric(dim(exp)[dat_dim]) + nobs <- as.numeric(dim(obs)[dat_dim]) + } + + # for FairCRPS + R_new <- ifelse(Fair, Inf, NA) + + CRPS <- array(dim = c(dim(exp)[time_dim], nexp = nexp, nobs = nobs)) + + for (i in 1:nexp) { + for (j in 1:nobs) { + exp_data <- exp[ , , i] + obs_data <- obs[ , j] + + if (is.null(dim(exp_data))) dim(exp_data) <- c(dim(exp)[1:2]) + if (is.null(dim(obs_data))) dim(obs_data) <- c(dim(obs)[1]) + + crps <- SpecsVerification::enscrps_cpp(ens = exp_data, obs = obs_data, R_new = R_new) + CRPS[ , i, j] <- crps + } + } + + if (is.null(dat_dim)) { + dim(CRPS) <- c(dim(CRPS)[time_dim]) + } + + return(CRPS) +} diff --git a/modules/Skill/R/tmp/RPS.R b/modules/Skill/R/tmp/RPS.R new file mode 100644 index 00000000..54ec8440 --- /dev/null +++ b/modules/Skill/R/tmp/RPS.R @@ -0,0 +1,380 @@ +#'Compute the Ranked Probability Score +#' +#'The Ranked Probability Score (RPS; Wilks, 2011) is defined as the sum of the +#'squared differences between the cumulative forecast probabilities (computed +#'from the ensemble members) and the observations (defined as 0% if the category +#'did not happen and 100% if it happened). It can be used to evaluate the skill +#'of multi-categorical probabilistic forecasts. The RPS ranges between 0 +#'(perfect forecast) and n-1 (worst possible forecast), where n is the number of +#'categories. In the case of a forecast divided into two categories (the lowest +#'number of categories that a probabilistic forecast can have), the RPS +#'corresponds to the Brier Score (BS; Wilks, 2011), therefore ranging between 0 +#'and 1.\cr +#'The function first calculates the probabilities for forecasts and observations, +#'then use them to calculate RPS. Or, the probabilities of exp and obs can be +#'provided directly to compute the score. If there is more than one dataset, RPS +#'will be computed for each pair of exp and obs data. The fraction of acceptable +#'NAs can be adjusted. +#' +#'@param exp A named numerical array of either the forecasts with at least time +#' and member dimensions, or the probabilities with at least time and category +#' dimensions. The probabilities can be generated by \code{s2dv::GetProbs}. +#'@param obs A named numerical array of either the observation with at least +#' time dimension, or the probabilities with at least time and category +#' dimensions. The probabilities can be generated by \code{s2dv::GetProbs}. The +#' dimensions must be the same as 'exp' except 'memb_dim' and 'dat_dim'. +#'@param time_dim A character string indicating the name of the time dimension. +#' The default value is 'sdate'. +#'@param memb_dim A character string indicating the name of the member dimension +#' to compute the probabilities of the forecast. The default value is 'member'. +#' If the data are probabilities, set memb_dim as NULL. +#'@param cat_dim A character string indicating the name of the category +#' dimension that is needed when the exp and obs are probabilities. The default +#' value is NULL, which means that the data are not probabilities. +#'@param dat_dim A character string indicating the name of dataset dimension. +#' The length of this dimension can be different between 'exp' and 'obs'. +#' The default value is NULL. +#'@param prob_thresholds A numeric vector of the relative thresholds (from 0 to +#' 1) between the categories. The default value is c(1/3, 2/3), which +#' corresponds to tercile equiprobable categories. +#'@param indices_for_clim A vector of the indices to be taken along 'time_dim' +#' for computing the thresholds between the probabilistic categories. If NULL, +#' the whole period is used. The default value is NULL. +#'@param Fair A logical indicating whether to compute the FairRPS (the +#' potential RPS that the forecast would have with an infinite ensemble size). +#' The default value is FALSE. +#'@param weights A named numerical array of the weights for 'exp' probability +#' calculation. If 'dat_dim' is NULL, the dimensions should include 'memb_dim' +#' and 'time_dim'. Else, the dimension should also include 'dat_dim'. The +#' default value is NULL. The ensemble should have at least 70 members or span +#' at least 10 time steps and have more than 45 members if consistency between +#' the weighted and unweighted methodologies is desired. +#'@param cross.val A logical indicating whether to compute the thresholds +#' between probabilistic categories in cross-validation. The default value is +#' FALSE. +#'@return_mean A logical idicating whether to return the temporal mean of CRPS +#' or not. When TRUE the temporal mean is calculated, when FALSE the time +#' dimension is not aggregated. The default is TRUE. +#'@param na.rm A logical or numeric value between 0 and 1. If it is numeric, it +#' means the lower limit for the fraction of the non-NA values. 1 is equal to +#' FALSE (no NA is acceptable), 0 is equal to TRUE (all NAs are acceptable). +# The function returns NA if the fraction of non-NA values in the data is less +#' than na.rm. Otherwise, RPS will be calculated. The default value is FALSE. +#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. +#' +#'@return +#'A numerical array of RPS with dimensions c(nexp, nobs, the rest dimensions of +#''exp' except 'time_dim' and 'memb_dim' dimensions). nexp is the number of +#'experiment (i.e., dat_dim in exp), and nobs is the number of observation +#'(i.e., dat_dim in obs). If dat_dim is NULL, nexp and nobs are omitted. +#' +#'@references +#'Wilks, 2011; https://doi.org/10.1016/B978-0-12-385022-5.00008-7 +#' +#'@examples +#'# Use synthetic data +#'exp <- array(rnorm(1000), dim = c(lat = 3, lon = 2, member = 10, sdate = 50)) +#'obs <- array(rnorm(1000), dim = c(lat = 3, lon = 2, sdate = 50)) +#'res <- RPS(exp = exp, obs = obs) +#'# Use probabilities as inputs +#'exp_probs <- GetProbs(exp, time_dim = 'sdate', memb_dim = 'member') +#'obs_probs <- GetProbs(obs, time_dim = 'sdate', memb_dim = NULL) +#'res2 <- RPS(exp = exp_probs, obs = obs_probs, memb_dim = NULL, cat_dim = 'bin') +#' +#' +#'@import multiApply +#'@importFrom easyVerification convert2prob +#'@export +RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NULL, + dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, + Fair = FALSE, weights = NULL, cross.val = FALSE, return_mean = TRUE, + na.rm = FALSE, ncores = NULL) { + + # Check inputs + ## exp and obs (1) + if (!is.array(exp) | !is.numeric(exp)) + stop('Parameter "exp" must be a numeric array.') + if (!is.array(obs) | !is.numeric(obs)) + stop('Parameter "obs" must be a numeric array.') + if(any(is.null(names(dim(exp))))| any(nchar(names(dim(exp))) == 0) | + any(is.null(names(dim(obs))))| any(nchar(names(dim(obs))) == 0)) { + stop("Parameter 'exp' and 'obs' must have dimension names.") + } + ## time_dim + if (!is.character(time_dim) | length(time_dim) != 1) + stop('Parameter "time_dim" must be a character string.') + if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) { + stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.") + } + ## memb_dim & cat_dim + if (is.null(memb_dim) + is.null(cat_dim) != 1) { + stop("Only one of the two parameters 'memb_dim' and 'cat_dim' can have value.") + } + ## memb_dim + if (!is.null(memb_dim)) { + if (!is.character(memb_dim) | length(memb_dim) > 1) { + stop("Parameter 'memb_dim' must be a character string.") + } + if (!memb_dim %in% names(dim(exp))) { + stop("Parameter 'memb_dim' is not found in 'exp' dimension.") + } + } + ## cat_dim + if (!is.null(cat_dim)) { + if (!is.character(cat_dim) | length(cat_dim) > 1) { + stop("Parameter 'cat_dim' must be a character string.") + } + if (!cat_dim %in% names(dim(exp)) | !cat_dim %in% names(dim(obs))) { + stop("Parameter 'cat_dim' is not found in 'exp' or 'obs' dimension.") + } + } + ## dat_dim + if (!is.null(dat_dim)) { + if (!is.character(dat_dim) | length(dat_dim) > 1) { + stop("Parameter 'dat_dim' must be a character string.") + } + if (!dat_dim %in% names(dim(exp)) | !dat_dim %in% names(dim(obs))) { + stop("Parameter 'dat_dim' is not found in 'exp' or 'obs' dimension.", + " Set it as NULL if there is no dataset dimension.") + } + } + ## exp and obs (2) + name_exp <- sort(names(dim(exp))) + name_obs <- sort(names(dim(obs))) + if (!is.null(memb_dim)) { + name_exp <- name_exp[-which(name_exp == memb_dim)] + if (memb_dim %in% name_obs) { + name_obs <- name_obs[-which(name_obs == memb_dim)] + } + } + if (!is.null(dat_dim)) { + name_exp <- name_exp[-which(name_exp == dat_dim)] + name_obs <- name_obs[-which(name_obs == dat_dim)] + } + if (!identical(length(name_exp), length(name_obs)) | + !identical(dim(exp)[name_exp], dim(obs)[name_obs])) { + stop(paste0("Parameter 'exp' and 'obs' must have same length of ", + "all dimensions except 'memb_dim' and 'dat_dim'.")) + } + ## prob_thresholds + if (!is.numeric(prob_thresholds) | !is.vector(prob_thresholds) | + any(prob_thresholds <= 0) | any(prob_thresholds >= 1)) { + stop("Parameter 'prob_thresholds' must be a numeric vector between 0 and 1.") + } + ## indices_for_clim + if (is.null(indices_for_clim)) { + indices_for_clim <- 1:dim(obs)[time_dim] + } else { + if (!is.numeric(indices_for_clim) | !is.vector(indices_for_clim)) { + stop("Parameter 'indices_for_clim' must be NULL or a numeric vector.") + } else if (length(indices_for_clim) > dim(obs)[time_dim] | + max(indices_for_clim) > dim(obs)[time_dim] | + any(indices_for_clim) < 1) { + stop("Parameter 'indices_for_clim' should be the indices of 'time_dim'.") + } + } + ## Fair + if (!is.logical(Fair) | length(Fair) > 1) { + stop("Parameter 'Fair' must be either TRUE or FALSE.") + } + ## cross.val + if (!is.logical(cross.val) | length(cross.val) > 1) { + stop("Parameter 'cross.val' must be either TRUE or FALSE.") + } + ## weights + if (!is.null(weights) & is.null(cat_dim)) { + if (!is.array(weights) | !is.numeric(weights)) + stop("Parameter 'weights' must be a named numeric array.") + if (is.null(dat_dim)) { + if (length(dim(weights)) != 2 | any(!names(dim(weights)) %in% c(memb_dim, time_dim))) + stop("Parameter 'weights' must have two dimensions with the names of 'memb_dim' and 'time_dim'.") + if (dim(weights)[memb_dim] != dim(exp)[memb_dim] | + dim(weights)[time_dim] != dim(exp)[time_dim]) { + stop(paste0("Parameter 'weights' must have the same dimension lengths ", + "as 'memb_dim' and 'time_dim' in 'exp'.")) + } + weights <- Reorder(weights, c(time_dim, memb_dim)) + + } else { + if (length(dim(weights)) != 3 | any(!names(dim(weights)) %in% c(memb_dim, time_dim, dat_dim))) + stop("Parameter 'weights' must have three dimensions with the names of 'memb_dim', 'time_dim' and 'dat_dim'.") + if (dim(weights)[memb_dim] != dim(exp)[memb_dim] | + dim(weights)[time_dim] != dim(exp)[time_dim] | + dim(weights)[dat_dim] != dim(exp)[dat_dim]) { + stop(paste0("Parameter 'weights' must have the same dimension lengths ", + "as 'memb_dim', 'time_dim' and 'dat_dim' in 'exp'.")) + } + weights <- Reorder(weights, c(time_dim, memb_dim, dat_dim)) + + } + } else if (!is.null(weights) & !is.null(cat_dim)) { + .warning(paste0("Parameter 'exp' and 'obs' are probabilities already, so parameter ", + "'weights' is not used. Change 'weights' to NULL.")) + weights <- NULL + } + ## na.rm + if (!isTRUE(na.rm) & !isFALSE(na.rm) & !(is.numeric(na.rm) & na.rm >= 0 & na.rm <= 1)) { + stop('"na.rm" should be TRUE, FALSE or a numeric between 0 and 1') + } + ## ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | + length(ncores) > 1) { + stop("Parameter 'ncores' must be either NULL or a positive integer.") + } + } + + ############################### + + # Compute RPS + + ## Decide target_dims + if (!is.null(memb_dim)) { + target_dims_exp <- c(time_dim, memb_dim, dat_dim) + if (!memb_dim %in% names(dim(obs))) { + target_dims_obs <- c(time_dim, dat_dim) + } else { + target_dims_obs <- c(time_dim, memb_dim, dat_dim) + } + } else { # cat_dim + target_dims_exp <- target_dims_obs <- c(time_dim, cat_dim, dat_dim) + } + + rps <- Apply(data = list(exp = exp, obs = obs), + target_dims = list(exp = target_dims_exp, + obs = target_dims_obs), + fun = .RPS, + dat_dim = dat_dim, time_dim = time_dim, + memb_dim = memb_dim, cat_dim = cat_dim, + prob_thresholds = prob_thresholds, + indices_for_clim = indices_for_clim, Fair = Fair, + weights = weights, cross.val = cross.val, + na.rm = na.rm, ncores = ncores)$output1 + + if (return_mean == TRUE) { + rps <- MeanDims(rps, time_dim, na.rm = TRUE) + } else { + rps <- rps + } + + return(rps) +} + + +.RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NULL, + dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, + Fair = FALSE, weights = NULL, cross.val = FALSE, na.rm = FALSE) { + #--- if memb_dim: + # exp: [sdate, memb, (dat)] + # obs: [sdate, (memb), (dat)] + # weights: NULL or same as exp + #--- if cat_dim: + # exp: [sdate, bin, (dat)] + # obs: [sdate, bin, (dat)] + + # Adjust dimensions to be [sdate, memb, dat] for both exp and obs + if (!is.null(memb_dim)) { + if (!memb_dim %in% names(dim(obs))) { + obs <- InsertDim(obs, posdim = 2, lendim = 1, name = memb_dim) + } + } + + if (is.null(dat_dim)) { + nexp <- 1 + nobs <- 1 + dim(exp) <- c(dim(exp), nexp = nexp) + dim(obs) <- c(dim(obs), nobs = nobs) + if (!is.null(weights)) dim(weights) <- c(dim(weights), nexp = nexp) + } else { + nexp <- as.numeric(dim(exp)[dat_dim]) + nobs <- as.numeric(dim(obs)[dat_dim]) + } + + rps <- array(dim = c(dim(exp)[time_dim], nexp = nexp, nobs = nobs)) + + for (i in 1:nexp) { + for (j in 1:nobs) { + exp_data <- exp[ , , i] + obs_data <- obs[ , , j] + + if (is.null(dim(exp_data))) dim(exp_data) <- c(dim(exp)[1:2]) + if (is.null(dim(obs_data))) dim(obs_data) <- c(dim(obs)[1:2]) + + # Find the fraction of NAs + ## If any member/bin is NA at this time step, it is not good value. + exp_mean <- rowMeans(exp_data) + obs_mean <- rowMeans(obs_data) + good_values <- !is.na(exp_mean) & !is.na(obs_mean) + + if (isTRUE(na.rm)) { + f_NAs <- 0 + } else if (isFALSE(na.rm)) { + f_NAs <- 1 + } else { + f_NAs <- na.rm + } + + if (f_NAs <= sum(good_values) / length(obs_mean)) { + + exp_data <- exp_data[good_values, , drop = F] + obs_data <- obs_data[good_values, , drop = F] + + # If the data inputs are forecast/observation, calculate probabilities + if (is.null(cat_dim)) { + if (!is.null(weights)) { + weights_data <- weights[which(good_values) , , i] + if (is.null(dim(weights_data))) dim(weights_data) <- c(dim(weights)[1:2]) + } else { + weights_data <- weights #NULL + } + + # Subset indices_for_clim + dum <- match(indices_for_clim, which(good_values)) + good_indices_for_clim <- dum[!is.na(dum)] + + exp_probs <- .GetProbs(data = exp_data, indices_for_quantiles = good_indices_for_clim, + prob_thresholds = prob_thresholds, weights = weights_data, cross.val = cross.val) + # exp_probs: [bin, sdate] + obs_probs <- .GetProbs(data = obs_data, indices_for_quantiles = good_indices_for_clim, + prob_thresholds = prob_thresholds, weights = NULL, cross.val = cross.val) + # obs_probs: [bin, sdate] + + } else { # inputs are probabilities already + exp_probs <- t(exp_data) + obs_probs <- t(obs_data) + } + + probs_exp_cumsum <- apply(exp_probs, 2, cumsum) + probs_obs_cumsum <- apply(obs_probs, 2, cumsum) + + # rps: [sdate, nexp, nobs] + rps [good_values, i, j] <- colSums((probs_exp_cumsum - probs_obs_cumsum)^2) + + if (Fair) { # FairRPS + ## adjustment <- rowSums(-1 * (1/R - 1/R.new) * ens.cum * (R - ens.cum)/R/(R - 1)) [formula taken from SpecsVerification::EnsRps] + R <- dim(exp)[2] #memb + R_new <- Inf + adjustment <- (-1) / (R - 1) * probs_exp_cumsum * (1 - probs_exp_cumsum) + adjustment <- colSums(adjustment) + rps[ , i, j] <- rps[ , i, j] + adjustment + } + + } else { ## not enough values different from NA + + rps[ , i, j] <- as.numeric(NA) + + } + + } + } + + if (is.null(dat_dim)) { + dim(rps) <- dim(exp)[time_dim] + } + + return(rps) +} + + + diff --git a/modules/Skill/Skill.R b/modules/Skill/Skill.R index dcd28eb7..d3f60d4e 100644 --- a/modules/Skill/Skill.R +++ b/modules/Skill/Skill.R @@ -19,6 +19,10 @@ source("modules/Skill/R/tmp/GetProbs.R") source("modules/Skill/compute_skill_metrics.R") source("modules/Skill/compute_probabilities.R") +## Temporary +source("modules/Skill/R/tmp/RPS.R") +source("modules/Skill/R/tmp/CRPS.R") + Skill <- function(recipe, data, agg = 'global') { # data$hcst: s2dv_cube containing the hindcast @@ -84,15 +88,39 @@ Skill <- function(recipe, data, agg = 'global') { memb_dim = memb_dim, Fair = Fair, cross.val = cross.val, + return_mean = TRUE, ncores = ncores) skill <- .drop_dims(skill) skill_metrics[[ metric ]] <- skill - rps_clim <- Apply(list(data$obs$data), - target_dims = c(time_dim, memb_dim), - cross.val = cross.val, - fun = RPS_clim)$output1 - rps_clim <- .drop_dims(rps_clim) - skill_metrics[[paste0(metric, "_clim")]] <- rps_clim + # RPS_clim + } else if (metric %in% c('rps_clim')) { + skill <- Apply(list(data$obs$data), + target_dims = c(time_dim, memb_dim), + cross.val = cross.val, + return_mean = TRUE, + fun = RPS_clim)$output1 + skill <- .drop_dims(skill) + skill_metrics[[metric]] <- skill + # RPS_syear and FRPS_syear + } else if (metric %in% c('rps_syear', 'frps_syear')) { + skill <- RPS(data$hcst$data, data$obs$data, + time_dim = time_dim, + memb_dim = memb_dim, + Fair = Fair, + cross.val = cross.val, + return_mean = FALSE, + ncores = ncores) + skill <- .drop_dims(skill) + skill_metrics[[metric]] <- skill + # RPS_clim_syear + } else if (metric %in% c('rps_clim_syear')) { ## not returning syear dimension name + skill <- Apply(list(data$obs$data), + target_dims = c(time_dim, memb_dim), + cross.val = cross.val, + fun = RPS_clim, + return_mean = FALSE, output_dims = 'syear')$output1 + skill <- .drop_dims(skill) + skill_metrics[[ metric ]] <- skill # Ranked Probability Skill Score and Fair version } else if (metric %in% c('rpss', 'frpss')) { skill <- RPSS(data$hcst$data, data$obs$data, @@ -137,14 +165,36 @@ Skill <- function(recipe, data, agg = 'global') { time_dim = time_dim, memb_dim = memb_dim, Fair = Fair, + return_mean = TRUE, ncores = ncores) skill <- .drop_dims(skill) skill_metrics[[ metric ]] <- skill - crps_clim <- Apply(list(data$obs$data), target_dims = time_dim, + # CRPS_clim + } else if (metric %in% c('crps_clim')) { + skill <- Apply(list(data$obs$data), target_dims = time_dim, + fun = CRPS_clim, memb_dim = memb_dim, + clim.cross.val = cross.val, + return_mean = TRUE)$output1 + skill <- .drop_dims(skill) + skill_metrics[[ metric ]] <- skill + # CRPS_syear and FCRPS_syear + } else if (metric %in% c('crps_syear', 'fcrps_syear')) { + skill <- CRPS(data$hcst$data, data$obs$data, + time_dim = time_dim, + memb_dim = memb_dim, + Fair = Fair, + return_mean = FALSE, + ncores = ncores) + skill <- .drop_dims(skill) + skill_metrics[[ metric ]] <- skill + # CRPS_clim_syear + } else if (metric %in% c('crps_clim_syear')) { + skill <- Apply(list(data$obs$data), target_dims = time_dim, fun = CRPS_clim, memb_dim = memb_dim, - clim.cross.val = cross.val)$output1 - crps_clim <- .drop_dims(crps_clim) - skill_metrics[['crps_clim']] <- crps_clim + clim.cross.val = cross.val, + return_mean = FALSE)$output1 + skill <- .drop_dims(skill) + skill_metrics[[metric]] <- skill # CRPSS and FCRPSS } else if (metric %in% c('crpss', 'fcrpss')) { skill <- CRPSS(data$hcst$data, data$obs$data, diff --git a/modules/Statistics/Statistics.R b/modules/Statistics/Statistics.R new file mode 100644 index 00000000..40f1889b --- /dev/null +++ b/modules/Statistics/Statistics.R @@ -0,0 +1,89 @@ + + +compute_statistics <- function(recipe, data, agg = 'global'){ + + # data$hcst: s2dv_cube containing the hindcast + + # obs: s2dv_cube containing the observations + # recipe: auto-s2s recipe as provided by read_yaml + + time_dim <- 'syear' + memb_dim <- 'ensemble' + + + ## Duplicate obs along hcst ensemble dimension + obs_data <- adrop(data$obs$data, drop = 9) + obs_data <- InsertDim(data = obs_data, pos = 9, lendim = 25, name = 'ensemble') + + + statistics_list <- tolower(recipe$Analysis$Workflow$Statistics$metric) + + statistics <- list() + + for (stat in strsplit(statistics_list, ", | |,")[[1]]) { + # Whether the fair version of the metric is to be computed + if (stat %in% c('cov', 'covariance')) { + + covariance <- Apply(data = list(x= obs_data, y=data$hcst$data), + target_dims = c(time_dim, memb_dim), + fun = function(x,y){cov(as.vector(x),as.vector(y), + use = "everything", + method = "pearson")})$output1 + + statistics[[ stat ]] <- covariance + + } ## close if on cov + + + if (stat %in% c('std', 'standard_deviation')) { + + ## Calculate standard deviation + std_hcst <- Apply(data = data$hcst$data, + target_dims = c(time_dim, memb_dim), + fun = 'sd')$output1 + + std_obs <- Apply(data = data$obs$data, + target_dims = c(time_dim, memb_dim), + fun = 'sd')$output1 + + statistics[[ stat ]] <- list('std_hcst' = std_hcst, 'std_obs' = std_obs) + + } ## close if on std + + if (stat %in% c('var', 'variance')) { + + ## Calculate standard deviation + var_hcst <- (Apply(data = data$hcst$data, + target_dims = c(time_dim, memb_dim), + fun = 'sd')$output1)^2 + + var_obs <- (Apply(data = data$obs$data, + target_dims = c(time_dim, memb_dim), + fun = 'sd')$output1)^2 + + statistics[[ stat ]] <- list('var_hcst' = var_hcst, 'var_obs' = var_obs) + + } ## close if on var + + } + + info(recipe$Run$logger, "##### STATISTICS COMPUTATION COMPLETE #####") + .log_memory_usage(recipe$Run$logger, when = "After statistics computation") + + # Save outputs + if (recipe$Analysis$Workflow$Skill$save != 'none') { + info(recipe$Run$logger, "##### START SAVING STATISTICS #####") + } + recipe$Run$output_dir <- paste0(recipe$Run$output_dir, + "/outputs/Statistics/") + + if (recipe$Analysis$Workflow$Statistics$save == 'all') { + # Save all statistics + save_metrics(recipe = recipe, skill = statistics, ## Not able to save data with these dimensions + data_cube = data$hcst, agg = agg) ## The length of parameter 'order' should be the same with the dimension length of parameter 'data'. + } + + # Return results + return(statistics) +} + diff --git a/recipes/atomic_recipes/recipe_scorecards_s2s-suite.yml b/recipes/atomic_recipes/recipe_scorecards_s2s-suite.yml index 1fd6adc3..17c1acae 100644 --- a/recipes/atomic_recipes/recipe_scorecards_s2s-suite.yml +++ b/recipes/atomic_recipes/recipe_scorecards_s2s-suite.yml @@ -16,15 +16,15 @@ Analysis: Time: sdate: '0101' ## MMDD fcst_year: # Optional, int: Forecast year 'YYYY' - hcst_start: '1993' # Mandatory, int: Hindcast start year 'YYYY' + hcst_start: '2014' # Mandatory, int: Hindcast start year 'YYYY' hcst_end: '2016' # Mandatory, int: Hindcast end year 'YYYY' ftime_min: 1 # Mandatory, int: First leadtime time step in months ftime_max: 2 # Mandatory, int: Last leadtime time step in months Region: latmin: 30 # Mandatory, int: minimum latitude - latmax: 90 # Mandatory, int: maximum latitude + latmax: 35 # Mandatory, int: maximum latitude lonmin: 0 # Mandatory, int: minimum longitude - lonmax: 359.9 # Mandatory, int: maximum longitude + lonmax: 10 # Mandatory, int: maximum longitude Regrid: method: conservative # conservative for prlr, bilinear for tas, psl, sfcWind type: to_system @@ -37,9 +37,12 @@ Analysis: cross_validation: no save: 'none' Skill: - metric: rps rpss crps crpss # str: Skill metric or list of skill metrics. See docu. + metric: rps_syear rps_clim_syear crps_syear crps_clim_syear # str: Skill metric or list of skill metrics. See docu. cross_validation: yes save: 'all' + Statistics: + metric: cov std var + save: 'all' Probabilities: percentiles: [[1/3, 2/3], [1/10], [9/10]] # frac: Quantile thresholds. save: 'none' diff --git a/tools/check_recipe.R b/tools/check_recipe.R index dadaa822..e68f0b90 100644 --- a/tools/check_recipe.R +++ b/tools/check_recipe.R @@ -446,7 +446,8 @@ check_recipe <- function(recipe) { "rpss", "frps", "frpss", "crps", "crps_syear", "crpss", "bss10", "bss90", "mean_bias", "mean_bias_ss", "enssprerr", "rps_clim", - "crps_clim", "enscorr_specs", "frps_specs", "rpss_specs", + "rps_clim_syear", "crps_clim", "crps_clim_syear", + "enscorr_specs", "frps_specs", "rpss_specs", "frpss_specs", "bss10_specs", "bss90_specs") if ("Skill" %in% names(recipe$Analysis$Workflow)) { if (is.null(recipe$Analysis$Workflow$Skill$metric)) { -- GitLab