diff --git a/modules/Skill/R/CRPS_clim.R b/modules/Skill/R/CRPS_clim.R index 36db4e945fcfc7256ed66c05b98bf9db8c6892e3..b66cab7872bf91102766a9124c82830deba90d24 100644 --- a/modules/Skill/R/CRPS_clim.R +++ b/modules/Skill/R/CRPS_clim.R @@ -1,13 +1,27 @@ -CRPS_clim <- function(obs, memb_dim ='ensemble'){ +# CRPS version for climatology +CRPS_clim <- function(obs, memb_dim ='ensemble', return_mean = TRUE, clim.cross.val= TRUE){ time_dim <- names(dim(obs)) obs_time_len <- dim(obs)[time_dim] - + + if (isFALSE(clim.cross.val)) { ## Without cross-validation ref <- array(data = rep(obs, each = obs_time_len), dim = c(obs_time_len, obs_time_len)) - names(dim(ref)) <- c(time_dim, memb_dim) - # ref: [sdate, memb] - # obs: [sdate] + } else if (isTRUE(clim.cross.val)) { ## With cross-validation (excluding the value of that year to create ref for that year) + ref <- array(data = NA, dim = c(obs_time_len, obs_time_len - 1)) + for (i in 1:obs_time_len) { + ref[i, ] <- obs[-i] + } + } + + names(dim(ref)) <- c(time_dim, memb_dim) + # ref: [sdate, memb] + # obs: [sdate] crps_ref <- s2dv:::.CRPS(exp = ref, obs = obs, time_dim = time_dim, memb_dim = memb_dim, - dat_dim = NULL, Fair = FALSE) - # crps_ref should be [sdate] - return(mean(crps_ref)) + dat_dim = NULL, Fair = FALSE) + + # crps_ref should be [sdate] + if (return_mean == TRUE) { + return(mean(crps_ref)) + } else { + return(crps_ref) + } } diff --git a/modules/Skill/R/RPS_clim.R b/modules/Skill/R/RPS_clim.R index e8b6452d14f4bfa2972af651b16f56011cf20b13..4a079cd4855fea557f67b6f99e878e90a83398b3 100644 --- a/modules/Skill/R/RPS_clim.R +++ b/modules/Skill/R/RPS_clim.R @@ -1,12 +1,12 @@ # RPS version for climatology -RPS_clim <- function(obs, indices_for_clim = NULL, prob_thresholds = c(1/3, 2/3)) { +RPS_clim <- function(obs, indices_for_clim = NULL, prob_thresholds = c(1/3, 2/3), cross.val = TRUE, return_mean = TRUE) { if (is.null(indices_for_clim)){ indices_for_clim <- 1:length(obs) } obs_probs <- .GetProbs(data = obs, indices_for_quantiles = indices_for_clim, ## temporarily removed s2dv::: - prob_thresholds = prob_thresholds, weights = NULL, cross.val = T) ## here! + prob_thresholds = prob_thresholds, weights = NULL, cross.val = cross.val) # clim_probs: [bin, sdate] clim_probs <- c(prob_thresholds[1], diff(prob_thresholds), 1 - prob_thresholds[length(prob_thresholds)]) clim_probs <- array(clim_probs, dim = dim(obs_probs)) @@ -15,6 +15,10 @@ RPS_clim <- function(obs, indices_for_clim = NULL, prob_thresholds = c(1/3, 2/3) probs_clim_cumsum <- apply(clim_probs, 2, cumsum) probs_obs_cumsum <- apply(obs_probs, 2, cumsum) rps_ref <- apply((probs_clim_cumsum - probs_obs_cumsum)^2, 2, sum) - - return(mean(rps_ref)) + + if (return_mean == TRUE) { + return(mean(rps_ref)) + } else { + return(rps_ref) + } } diff --git a/modules/Skill/R/tmp/CRPS.R b/modules/Skill/R/tmp/CRPS.R new file mode 100644 index 0000000000000000000000000000000000000000..c08375c4c310d1a135a25a7ea98bb50a4e08ca59 --- /dev/null +++ b/modules/Skill/R/tmp/CRPS.R @@ -0,0 +1,177 @@ +#'Compute the Continuous Ranked Probability Score +#' +#'The Continuous Ranked Probability Score (CRPS; Wilks, 2011) is the continuous +#'version of the Ranked Probability Score (RPS; Wilks, 2011). It is a skill +#'metric to evaluate the full distribution of probabilistic forecasts. It has a +#'negative orientation (i.e., the higher-quality forecast the smaller CRPS) and +#'it rewards the forecast that has probability concentration around the observed +#'value. In case of a deterministic forecast, the CRPS is reduced to the mean +#'absolute error. It has the same units as the data. The function is based on +#'enscrps_cpp from SpecsVerification. If there is more than one dataset, CRPS +#'will be computed for each pair of exp and obs data. +#' +#'@param exp A named numerical array of the forecast with at least time +#' dimension. +#'@param obs A named numerical array of the observation with at least time +#' dimension. The dimensions must be the same as 'exp' except 'memb_dim' and +#' 'dat_dim'. +#'@param time_dim A character string indicating the name of the time dimension. +#' The default value is 'sdate'. +#'@param memb_dim A character string indicating the name of the member dimension +#' to compute the probabilities of the forecast. The default value is 'member'. +#'@param dat_dim A character string indicating the name of dataset dimension. +#' The length of this dimension can be different between 'exp' and 'obs'. The +#' default value is NULL. +#'@param Fair A logical indicating whether to compute the FairCRPS (the +#' potential CRPS that the forecast would have with an infinite ensemble size). +#' The default value is FALSE. +#'@return_mean A logical idicating whether to return the temporal mean of CRPS +#' or not. When TRUE the temporal mean is calculated, when FALSE the time +#' dimension is not aggregated. The default is TRUE. +#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. +#' +#'@return +#'A numerical array of CRPS with dimensions c(nexp, nobs, the rest dimensions of +#''exp' except 'time_dim' and 'memb_dim' dimensions). nexp is the number of +#'experiment (i.e., dat_dim in exp), and nobs is the number of observation +#'(i.e., dat_dim in obs). If dat_dim is NULL, nexp and nobs are omitted. +#' +#'@references +#'Wilks, 2011; https://doi.org/10.1016/B978-0-12-385022-5.00008-7 +#' +#'@examples +#'exp <- array(rnorm(1000), dim = c(lat = 3, lon = 2, member = 10, sdate = 50)) +#'obs <- array(rnorm(1000), dim = c(lat = 3, lon = 2, sdate = 50)) +#'res <- CRPS(exp = exp, obs = obs) +#' +#'@import multiApply +#'@importFrom SpecsVerification enscrps_cpp +#'@importFrom ClimProjDiags Subset +#'@export +CRPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', dat_dim = NULL, + Fair = FALSE, return_mean = TRUE, ncores = NULL) { + # Check inputs + ## exp and obs (1) + if (!is.array(exp) | !is.numeric(exp)) + stop("Parameter 'exp' must be a numeric array.") + if (!is.array(obs) | !is.numeric(obs)) + stop("Parameter 'obs' must be a numeric array.") + if(any(is.null(names(dim(exp))))| any(nchar(names(dim(exp))) == 0) | + any(is.null(names(dim(obs))))| any(nchar(names(dim(obs))) == 0)) { + stop("Parameter 'exp' and 'obs' must have dimension names.") + } + ## time_dim + if (!is.character(time_dim) | length(time_dim) != 1) + stop("Parameter 'time_dim' must be a character string.") + if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) { + stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.") + } + ## memb_dim + if (!is.character(memb_dim) | length(memb_dim) > 1) { + stop("Parameter 'memb_dim' must be a character string.") + } + if (!memb_dim %in% names(dim(exp))) { + stop("Parameter 'memb_dim' is not found in 'exp' dimension.") + } + ## dat_dim + if (!is.null(dat_dim)) { + if (!is.character(dat_dim) | length(dat_dim) > 1) { + stop("Parameter 'dat_dim' must be a character string.") + } + if (!dat_dim %in% names(dim(exp)) | !dat_dim %in% names(dim(obs))) { + stop("Parameter 'dat_dim' is not found in 'exp' or 'obs' dimension.", + " Set it as NULL if there is no dataset dimension.") + } + } + ## exp and obs (2) + if (memb_dim %in% names(dim(obs))) { + if (identical(as.numeric(dim(obs)[memb_dim]), 1)) { + obs <- ClimProjDiags::Subset(x = obs, along = memb_dim, indices = 1, drop = 'selected') + } else { + stop("Not implemented for observations with members ('obs' can have 'memb_dim', but it should be of length = 1).") + } + } + name_exp <- sort(names(dim(exp))) + name_obs <- sort(names(dim(obs))) + name_exp <- name_exp[-which(name_exp == memb_dim)] + if (!is.null(dat_dim)) { + name_exp <- name_exp[-which(name_exp == dat_dim)] + name_obs <- name_obs[-which(name_obs == dat_dim)] + } + if (!identical(length(name_exp), length(name_obs)) | + !identical(dim(exp)[name_exp], dim(obs)[name_obs])) { + stop(paste0("Parameter 'exp' and 'obs' must have same length of ", + "all dimensions except 'memb_dim' and 'dat_dim'.")) + } + ## Fair + if (!is.logical(Fair) | length(Fair) > 1) { + stop("Parameter 'Fair' must be either TRUE or FALSE.") + } + ## ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | length(ncores) > 1) { + stop("Parameter 'ncores' must be either NULL or a positive integer.") + } + } + + ############################### + + # Compute CRPS + crps <- Apply(data = list(exp = exp, obs = obs), + target_dims = list(exp = c(time_dim, memb_dim, dat_dim), + obs = c(time_dim, dat_dim)), + fun = .CRPS, + time_dim = time_dim, memb_dim = memb_dim, dat_dim = dat_dim, + Fair = Fair, + ncores = ncores)$output1 + + if (return_mean == TRUE) { + crps <- MeanDims(crps, time_dim, na.rm = FALSE) + } else { + crps <- crps + } + + return(crps) +} + +.CRPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', dat_dim = NULL, + Fair = FALSE) { + # exp: [sdate, memb, (dat_dim)] + # obs: [sdate, (dat_dim)] + + # Adjust dimensions if needed + if (is.null(dat_dim)) { + nexp <- 1 + nobs <- 1 + dim(exp) <- c(dim(exp), nexp = nexp) + dim(obs) <- c(dim(obs), nobs = nobs) + } else { + nexp <- as.numeric(dim(exp)[dat_dim]) + nobs <- as.numeric(dim(obs)[dat_dim]) + } + + # for FairCRPS + R_new <- ifelse(Fair, Inf, NA) + + CRPS <- array(dim = c(dim(exp)[time_dim], nexp = nexp, nobs = nobs)) + + for (i in 1:nexp) { + for (j in 1:nobs) { + exp_data <- exp[ , , i] + obs_data <- obs[ , j] + + if (is.null(dim(exp_data))) dim(exp_data) <- c(dim(exp)[1:2]) + if (is.null(dim(obs_data))) dim(obs_data) <- c(dim(obs)[1]) + + crps <- SpecsVerification::enscrps_cpp(ens = exp_data, obs = obs_data, R_new = R_new) + CRPS[ , i, j] <- crps + } + } + + if (is.null(dat_dim)) { + dim(CRPS) <- c(dim(CRPS)[time_dim]) + } + + return(CRPS) +} diff --git a/modules/Skill/R/tmp/RPS.R b/modules/Skill/R/tmp/RPS.R new file mode 100644 index 0000000000000000000000000000000000000000..54ec8440e440702ba9a51abd3690f9ff4ac29458 --- /dev/null +++ b/modules/Skill/R/tmp/RPS.R @@ -0,0 +1,380 @@ +#'Compute the Ranked Probability Score +#' +#'The Ranked Probability Score (RPS; Wilks, 2011) is defined as the sum of the +#'squared differences between the cumulative forecast probabilities (computed +#'from the ensemble members) and the observations (defined as 0% if the category +#'did not happen and 100% if it happened). It can be used to evaluate the skill +#'of multi-categorical probabilistic forecasts. The RPS ranges between 0 +#'(perfect forecast) and n-1 (worst possible forecast), where n is the number of +#'categories. In the case of a forecast divided into two categories (the lowest +#'number of categories that a probabilistic forecast can have), the RPS +#'corresponds to the Brier Score (BS; Wilks, 2011), therefore ranging between 0 +#'and 1.\cr +#'The function first calculates the probabilities for forecasts and observations, +#'then use them to calculate RPS. Or, the probabilities of exp and obs can be +#'provided directly to compute the score. If there is more than one dataset, RPS +#'will be computed for each pair of exp and obs data. The fraction of acceptable +#'NAs can be adjusted. +#' +#'@param exp A named numerical array of either the forecasts with at least time +#' and member dimensions, or the probabilities with at least time and category +#' dimensions. The probabilities can be generated by \code{s2dv::GetProbs}. +#'@param obs A named numerical array of either the observation with at least +#' time dimension, or the probabilities with at least time and category +#' dimensions. The probabilities can be generated by \code{s2dv::GetProbs}. The +#' dimensions must be the same as 'exp' except 'memb_dim' and 'dat_dim'. +#'@param time_dim A character string indicating the name of the time dimension. +#' The default value is 'sdate'. +#'@param memb_dim A character string indicating the name of the member dimension +#' to compute the probabilities of the forecast. The default value is 'member'. +#' If the data are probabilities, set memb_dim as NULL. +#'@param cat_dim A character string indicating the name of the category +#' dimension that is needed when the exp and obs are probabilities. The default +#' value is NULL, which means that the data are not probabilities. +#'@param dat_dim A character string indicating the name of dataset dimension. +#' The length of this dimension can be different between 'exp' and 'obs'. +#' The default value is NULL. +#'@param prob_thresholds A numeric vector of the relative thresholds (from 0 to +#' 1) between the categories. The default value is c(1/3, 2/3), which +#' corresponds to tercile equiprobable categories. +#'@param indices_for_clim A vector of the indices to be taken along 'time_dim' +#' for computing the thresholds between the probabilistic categories. If NULL, +#' the whole period is used. The default value is NULL. +#'@param Fair A logical indicating whether to compute the FairRPS (the +#' potential RPS that the forecast would have with an infinite ensemble size). +#' The default value is FALSE. +#'@param weights A named numerical array of the weights for 'exp' probability +#' calculation. If 'dat_dim' is NULL, the dimensions should include 'memb_dim' +#' and 'time_dim'. Else, the dimension should also include 'dat_dim'. The +#' default value is NULL. The ensemble should have at least 70 members or span +#' at least 10 time steps and have more than 45 members if consistency between +#' the weighted and unweighted methodologies is desired. +#'@param cross.val A logical indicating whether to compute the thresholds +#' between probabilistic categories in cross-validation. The default value is +#' FALSE. +#'@return_mean A logical idicating whether to return the temporal mean of CRPS +#' or not. When TRUE the temporal mean is calculated, when FALSE the time +#' dimension is not aggregated. The default is TRUE. +#'@param na.rm A logical or numeric value between 0 and 1. If it is numeric, it +#' means the lower limit for the fraction of the non-NA values. 1 is equal to +#' FALSE (no NA is acceptable), 0 is equal to TRUE (all NAs are acceptable). +# The function returns NA if the fraction of non-NA values in the data is less +#' than na.rm. Otherwise, RPS will be calculated. The default value is FALSE. +#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. +#' +#'@return +#'A numerical array of RPS with dimensions c(nexp, nobs, the rest dimensions of +#''exp' except 'time_dim' and 'memb_dim' dimensions). nexp is the number of +#'experiment (i.e., dat_dim in exp), and nobs is the number of observation +#'(i.e., dat_dim in obs). If dat_dim is NULL, nexp and nobs are omitted. +#' +#'@references +#'Wilks, 2011; https://doi.org/10.1016/B978-0-12-385022-5.00008-7 +#' +#'@examples +#'# Use synthetic data +#'exp <- array(rnorm(1000), dim = c(lat = 3, lon = 2, member = 10, sdate = 50)) +#'obs <- array(rnorm(1000), dim = c(lat = 3, lon = 2, sdate = 50)) +#'res <- RPS(exp = exp, obs = obs) +#'# Use probabilities as inputs +#'exp_probs <- GetProbs(exp, time_dim = 'sdate', memb_dim = 'member') +#'obs_probs <- GetProbs(obs, time_dim = 'sdate', memb_dim = NULL) +#'res2 <- RPS(exp = exp_probs, obs = obs_probs, memb_dim = NULL, cat_dim = 'bin') +#' +#' +#'@import multiApply +#'@importFrom easyVerification convert2prob +#'@export +RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NULL, + dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, + Fair = FALSE, weights = NULL, cross.val = FALSE, return_mean = TRUE, + na.rm = FALSE, ncores = NULL) { + + # Check inputs + ## exp and obs (1) + if (!is.array(exp) | !is.numeric(exp)) + stop('Parameter "exp" must be a numeric array.') + if (!is.array(obs) | !is.numeric(obs)) + stop('Parameter "obs" must be a numeric array.') + if(any(is.null(names(dim(exp))))| any(nchar(names(dim(exp))) == 0) | + any(is.null(names(dim(obs))))| any(nchar(names(dim(obs))) == 0)) { + stop("Parameter 'exp' and 'obs' must have dimension names.") + } + ## time_dim + if (!is.character(time_dim) | length(time_dim) != 1) + stop('Parameter "time_dim" must be a character string.') + if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) { + stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.") + } + ## memb_dim & cat_dim + if (is.null(memb_dim) + is.null(cat_dim) != 1) { + stop("Only one of the two parameters 'memb_dim' and 'cat_dim' can have value.") + } + ## memb_dim + if (!is.null(memb_dim)) { + if (!is.character(memb_dim) | length(memb_dim) > 1) { + stop("Parameter 'memb_dim' must be a character string.") + } + if (!memb_dim %in% names(dim(exp))) { + stop("Parameter 'memb_dim' is not found in 'exp' dimension.") + } + } + ## cat_dim + if (!is.null(cat_dim)) { + if (!is.character(cat_dim) | length(cat_dim) > 1) { + stop("Parameter 'cat_dim' must be a character string.") + } + if (!cat_dim %in% names(dim(exp)) | !cat_dim %in% names(dim(obs))) { + stop("Parameter 'cat_dim' is not found in 'exp' or 'obs' dimension.") + } + } + ## dat_dim + if (!is.null(dat_dim)) { + if (!is.character(dat_dim) | length(dat_dim) > 1) { + stop("Parameter 'dat_dim' must be a character string.") + } + if (!dat_dim %in% names(dim(exp)) | !dat_dim %in% names(dim(obs))) { + stop("Parameter 'dat_dim' is not found in 'exp' or 'obs' dimension.", + " Set it as NULL if there is no dataset dimension.") + } + } + ## exp and obs (2) + name_exp <- sort(names(dim(exp))) + name_obs <- sort(names(dim(obs))) + if (!is.null(memb_dim)) { + name_exp <- name_exp[-which(name_exp == memb_dim)] + if (memb_dim %in% name_obs) { + name_obs <- name_obs[-which(name_obs == memb_dim)] + } + } + if (!is.null(dat_dim)) { + name_exp <- name_exp[-which(name_exp == dat_dim)] + name_obs <- name_obs[-which(name_obs == dat_dim)] + } + if (!identical(length(name_exp), length(name_obs)) | + !identical(dim(exp)[name_exp], dim(obs)[name_obs])) { + stop(paste0("Parameter 'exp' and 'obs' must have same length of ", + "all dimensions except 'memb_dim' and 'dat_dim'.")) + } + ## prob_thresholds + if (!is.numeric(prob_thresholds) | !is.vector(prob_thresholds) | + any(prob_thresholds <= 0) | any(prob_thresholds >= 1)) { + stop("Parameter 'prob_thresholds' must be a numeric vector between 0 and 1.") + } + ## indices_for_clim + if (is.null(indices_for_clim)) { + indices_for_clim <- 1:dim(obs)[time_dim] + } else { + if (!is.numeric(indices_for_clim) | !is.vector(indices_for_clim)) { + stop("Parameter 'indices_for_clim' must be NULL or a numeric vector.") + } else if (length(indices_for_clim) > dim(obs)[time_dim] | + max(indices_for_clim) > dim(obs)[time_dim] | + any(indices_for_clim) < 1) { + stop("Parameter 'indices_for_clim' should be the indices of 'time_dim'.") + } + } + ## Fair + if (!is.logical(Fair) | length(Fair) > 1) { + stop("Parameter 'Fair' must be either TRUE or FALSE.") + } + ## cross.val + if (!is.logical(cross.val) | length(cross.val) > 1) { + stop("Parameter 'cross.val' must be either TRUE or FALSE.") + } + ## weights + if (!is.null(weights) & is.null(cat_dim)) { + if (!is.array(weights) | !is.numeric(weights)) + stop("Parameter 'weights' must be a named numeric array.") + if (is.null(dat_dim)) { + if (length(dim(weights)) != 2 | any(!names(dim(weights)) %in% c(memb_dim, time_dim))) + stop("Parameter 'weights' must have two dimensions with the names of 'memb_dim' and 'time_dim'.") + if (dim(weights)[memb_dim] != dim(exp)[memb_dim] | + dim(weights)[time_dim] != dim(exp)[time_dim]) { + stop(paste0("Parameter 'weights' must have the same dimension lengths ", + "as 'memb_dim' and 'time_dim' in 'exp'.")) + } + weights <- Reorder(weights, c(time_dim, memb_dim)) + + } else { + if (length(dim(weights)) != 3 | any(!names(dim(weights)) %in% c(memb_dim, time_dim, dat_dim))) + stop("Parameter 'weights' must have three dimensions with the names of 'memb_dim', 'time_dim' and 'dat_dim'.") + if (dim(weights)[memb_dim] != dim(exp)[memb_dim] | + dim(weights)[time_dim] != dim(exp)[time_dim] | + dim(weights)[dat_dim] != dim(exp)[dat_dim]) { + stop(paste0("Parameter 'weights' must have the same dimension lengths ", + "as 'memb_dim', 'time_dim' and 'dat_dim' in 'exp'.")) + } + weights <- Reorder(weights, c(time_dim, memb_dim, dat_dim)) + + } + } else if (!is.null(weights) & !is.null(cat_dim)) { + .warning(paste0("Parameter 'exp' and 'obs' are probabilities already, so parameter ", + "'weights' is not used. Change 'weights' to NULL.")) + weights <- NULL + } + ## na.rm + if (!isTRUE(na.rm) & !isFALSE(na.rm) & !(is.numeric(na.rm) & na.rm >= 0 & na.rm <= 1)) { + stop('"na.rm" should be TRUE, FALSE or a numeric between 0 and 1') + } + ## ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | + length(ncores) > 1) { + stop("Parameter 'ncores' must be either NULL or a positive integer.") + } + } + + ############################### + + # Compute RPS + + ## Decide target_dims + if (!is.null(memb_dim)) { + target_dims_exp <- c(time_dim, memb_dim, dat_dim) + if (!memb_dim %in% names(dim(obs))) { + target_dims_obs <- c(time_dim, dat_dim) + } else { + target_dims_obs <- c(time_dim, memb_dim, dat_dim) + } + } else { # cat_dim + target_dims_exp <- target_dims_obs <- c(time_dim, cat_dim, dat_dim) + } + + rps <- Apply(data = list(exp = exp, obs = obs), + target_dims = list(exp = target_dims_exp, + obs = target_dims_obs), + fun = .RPS, + dat_dim = dat_dim, time_dim = time_dim, + memb_dim = memb_dim, cat_dim = cat_dim, + prob_thresholds = prob_thresholds, + indices_for_clim = indices_for_clim, Fair = Fair, + weights = weights, cross.val = cross.val, + na.rm = na.rm, ncores = ncores)$output1 + + if (return_mean == TRUE) { + rps <- MeanDims(rps, time_dim, na.rm = TRUE) + } else { + rps <- rps + } + + return(rps) +} + + +.RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NULL, + dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, + Fair = FALSE, weights = NULL, cross.val = FALSE, na.rm = FALSE) { + #--- if memb_dim: + # exp: [sdate, memb, (dat)] + # obs: [sdate, (memb), (dat)] + # weights: NULL or same as exp + #--- if cat_dim: + # exp: [sdate, bin, (dat)] + # obs: [sdate, bin, (dat)] + + # Adjust dimensions to be [sdate, memb, dat] for both exp and obs + if (!is.null(memb_dim)) { + if (!memb_dim %in% names(dim(obs))) { + obs <- InsertDim(obs, posdim = 2, lendim = 1, name = memb_dim) + } + } + + if (is.null(dat_dim)) { + nexp <- 1 + nobs <- 1 + dim(exp) <- c(dim(exp), nexp = nexp) + dim(obs) <- c(dim(obs), nobs = nobs) + if (!is.null(weights)) dim(weights) <- c(dim(weights), nexp = nexp) + } else { + nexp <- as.numeric(dim(exp)[dat_dim]) + nobs <- as.numeric(dim(obs)[dat_dim]) + } + + rps <- array(dim = c(dim(exp)[time_dim], nexp = nexp, nobs = nobs)) + + for (i in 1:nexp) { + for (j in 1:nobs) { + exp_data <- exp[ , , i] + obs_data <- obs[ , , j] + + if (is.null(dim(exp_data))) dim(exp_data) <- c(dim(exp)[1:2]) + if (is.null(dim(obs_data))) dim(obs_data) <- c(dim(obs)[1:2]) + + # Find the fraction of NAs + ## If any member/bin is NA at this time step, it is not good value. + exp_mean <- rowMeans(exp_data) + obs_mean <- rowMeans(obs_data) + good_values <- !is.na(exp_mean) & !is.na(obs_mean) + + if (isTRUE(na.rm)) { + f_NAs <- 0 + } else if (isFALSE(na.rm)) { + f_NAs <- 1 + } else { + f_NAs <- na.rm + } + + if (f_NAs <= sum(good_values) / length(obs_mean)) { + + exp_data <- exp_data[good_values, , drop = F] + obs_data <- obs_data[good_values, , drop = F] + + # If the data inputs are forecast/observation, calculate probabilities + if (is.null(cat_dim)) { + if (!is.null(weights)) { + weights_data <- weights[which(good_values) , , i] + if (is.null(dim(weights_data))) dim(weights_data) <- c(dim(weights)[1:2]) + } else { + weights_data <- weights #NULL + } + + # Subset indices_for_clim + dum <- match(indices_for_clim, which(good_values)) + good_indices_for_clim <- dum[!is.na(dum)] + + exp_probs <- .GetProbs(data = exp_data, indices_for_quantiles = good_indices_for_clim, + prob_thresholds = prob_thresholds, weights = weights_data, cross.val = cross.val) + # exp_probs: [bin, sdate] + obs_probs <- .GetProbs(data = obs_data, indices_for_quantiles = good_indices_for_clim, + prob_thresholds = prob_thresholds, weights = NULL, cross.val = cross.val) + # obs_probs: [bin, sdate] + + } else { # inputs are probabilities already + exp_probs <- t(exp_data) + obs_probs <- t(obs_data) + } + + probs_exp_cumsum <- apply(exp_probs, 2, cumsum) + probs_obs_cumsum <- apply(obs_probs, 2, cumsum) + + # rps: [sdate, nexp, nobs] + rps [good_values, i, j] <- colSums((probs_exp_cumsum - probs_obs_cumsum)^2) + + if (Fair) { # FairRPS + ## adjustment <- rowSums(-1 * (1/R - 1/R.new) * ens.cum * (R - ens.cum)/R/(R - 1)) [formula taken from SpecsVerification::EnsRps] + R <- dim(exp)[2] #memb + R_new <- Inf + adjustment <- (-1) / (R - 1) * probs_exp_cumsum * (1 - probs_exp_cumsum) + adjustment <- colSums(adjustment) + rps[ , i, j] <- rps[ , i, j] + adjustment + } + + } else { ## not enough values different from NA + + rps[ , i, j] <- as.numeric(NA) + + } + + } + } + + if (is.null(dat_dim)) { + dim(rps) <- dim(exp)[time_dim] + } + + return(rps) +} + + + diff --git a/modules/Skill/Skill.R b/modules/Skill/Skill.R index fd54f17f3471cd0ca880712eee91f64f304666cf..d3f60d4e214d6c112179f24bb7bec8cfce9c4fac 100644 --- a/modules/Skill/Skill.R +++ b/modules/Skill/Skill.R @@ -19,6 +19,10 @@ source("modules/Skill/R/tmp/GetProbs.R") source("modules/Skill/compute_skill_metrics.R") source("modules/Skill/compute_probabilities.R") +## Temporary +source("modules/Skill/R/tmp/RPS.R") +source("modules/Skill/R/tmp/CRPS.R") + Skill <- function(recipe, data, agg = 'global') { # data$hcst: s2dv_cube containing the hindcast @@ -84,14 +88,39 @@ Skill <- function(recipe, data, agg = 'global') { memb_dim = memb_dim, Fair = Fair, cross.val = cross.val, + return_mean = TRUE, ncores = ncores) skill <- .drop_dims(skill) skill_metrics[[ metric ]] <- skill - rps_clim <- Apply(list(data$obs$data), - target_dims = c(time_dim, memb_dim), - RPS_clim)$output1 - rps_clim <- .drop_dims(rps_clim) - skill_metrics[[paste0(metric, "_clim")]] <- rps_clim + # RPS_clim + } else if (metric %in% c('rps_clim')) { + skill <- Apply(list(data$obs$data), + target_dims = c(time_dim, memb_dim), + cross.val = cross.val, + return_mean = TRUE, + fun = RPS_clim)$output1 + skill <- .drop_dims(skill) + skill_metrics[[metric]] <- skill + # RPS_syear and FRPS_syear + } else if (metric %in% c('rps_syear', 'frps_syear')) { + skill <- RPS(data$hcst$data, data$obs$data, + time_dim = time_dim, + memb_dim = memb_dim, + Fair = Fair, + cross.val = cross.val, + return_mean = FALSE, + ncores = ncores) + skill <- .drop_dims(skill) + skill_metrics[[metric]] <- skill + # RPS_clim_syear + } else if (metric %in% c('rps_clim_syear')) { ## not returning syear dimension name + skill <- Apply(list(data$obs$data), + target_dims = c(time_dim, memb_dim), + cross.val = cross.val, + fun = RPS_clim, + return_mean = FALSE, output_dims = 'syear')$output1 + skill <- .drop_dims(skill) + skill_metrics[[ metric ]] <- skill # Ranked Probability Skill Score and Fair version } else if (metric %in% c('rpss', 'frpss')) { skill <- RPSS(data$hcst$data, data$obs$data, @@ -136,13 +165,36 @@ Skill <- function(recipe, data, agg = 'global') { time_dim = time_dim, memb_dim = memb_dim, Fair = Fair, + return_mean = TRUE, ncores = ncores) skill <- .drop_dims(skill) skill_metrics[[ metric ]] <- skill - crps_clim <- Apply(list(data$obs$data), target_dims = time_dim, - fun = CRPS_clim, memb_dim = memb_dim)$output1 - crps_clim <- .drop_dims(crps_clim) - skill_metrics[['crps_clim']] <- crps_clim + # CRPS_clim + } else if (metric %in% c('crps_clim')) { + skill <- Apply(list(data$obs$data), target_dims = time_dim, + fun = CRPS_clim, memb_dim = memb_dim, + clim.cross.val = cross.val, + return_mean = TRUE)$output1 + skill <- .drop_dims(skill) + skill_metrics[[ metric ]] <- skill + # CRPS_syear and FCRPS_syear + } else if (metric %in% c('crps_syear', 'fcrps_syear')) { + skill <- CRPS(data$hcst$data, data$obs$data, + time_dim = time_dim, + memb_dim = memb_dim, + Fair = Fair, + return_mean = FALSE, + ncores = ncores) + skill <- .drop_dims(skill) + skill_metrics[[ metric ]] <- skill + # CRPS_clim_syear + } else if (metric %in% c('crps_clim_syear')) { + skill <- Apply(list(data$obs$data), target_dims = time_dim, + fun = CRPS_clim, memb_dim = memb_dim, + clim.cross.val = cross.val, + return_mean = FALSE)$output1 + skill <- .drop_dims(skill) + skill_metrics[[metric]] <- skill # CRPSS and FCRPSS } else if (metric %in% c('crpss', 'fcrpss')) { skill <- CRPSS(data$hcst$data, data$obs$data, diff --git a/modules/Statistics/Statistics.R b/modules/Statistics/Statistics.R new file mode 100644 index 0000000000000000000000000000000000000000..40f1889b3a5eb536eb6899041b484d859cd58475 --- /dev/null +++ b/modules/Statistics/Statistics.R @@ -0,0 +1,89 @@ + + +compute_statistics <- function(recipe, data, agg = 'global'){ + + # data$hcst: s2dv_cube containing the hindcast + + # obs: s2dv_cube containing the observations + # recipe: auto-s2s recipe as provided by read_yaml + + time_dim <- 'syear' + memb_dim <- 'ensemble' + + + ## Duplicate obs along hcst ensemble dimension + obs_data <- adrop(data$obs$data, drop = 9) + obs_data <- InsertDim(data = obs_data, pos = 9, lendim = 25, name = 'ensemble') + + + statistics_list <- tolower(recipe$Analysis$Workflow$Statistics$metric) + + statistics <- list() + + for (stat in strsplit(statistics_list, ", | |,")[[1]]) { + # Whether the fair version of the metric is to be computed + if (stat %in% c('cov', 'covariance')) { + + covariance <- Apply(data = list(x= obs_data, y=data$hcst$data), + target_dims = c(time_dim, memb_dim), + fun = function(x,y){cov(as.vector(x),as.vector(y), + use = "everything", + method = "pearson")})$output1 + + statistics[[ stat ]] <- covariance + + } ## close if on cov + + + if (stat %in% c('std', 'standard_deviation')) { + + ## Calculate standard deviation + std_hcst <- Apply(data = data$hcst$data, + target_dims = c(time_dim, memb_dim), + fun = 'sd')$output1 + + std_obs <- Apply(data = data$obs$data, + target_dims = c(time_dim, memb_dim), + fun = 'sd')$output1 + + statistics[[ stat ]] <- list('std_hcst' = std_hcst, 'std_obs' = std_obs) + + } ## close if on std + + if (stat %in% c('var', 'variance')) { + + ## Calculate standard deviation + var_hcst <- (Apply(data = data$hcst$data, + target_dims = c(time_dim, memb_dim), + fun = 'sd')$output1)^2 + + var_obs <- (Apply(data = data$obs$data, + target_dims = c(time_dim, memb_dim), + fun = 'sd')$output1)^2 + + statistics[[ stat ]] <- list('var_hcst' = var_hcst, 'var_obs' = var_obs) + + } ## close if on var + + } + + info(recipe$Run$logger, "##### STATISTICS COMPUTATION COMPLETE #####") + .log_memory_usage(recipe$Run$logger, when = "After statistics computation") + + # Save outputs + if (recipe$Analysis$Workflow$Skill$save != 'none') { + info(recipe$Run$logger, "##### START SAVING STATISTICS #####") + } + recipe$Run$output_dir <- paste0(recipe$Run$output_dir, + "/outputs/Statistics/") + + if (recipe$Analysis$Workflow$Statistics$save == 'all') { + # Save all statistics + save_metrics(recipe = recipe, skill = statistics, ## Not able to save data with these dimensions + data_cube = data$hcst, agg = agg) ## The length of parameter 'order' should be the same with the dimension length of parameter 'data'. + } + + # Return results + return(statistics) +} + diff --git a/recipes/atomic_recipes/recipe_scorecards_s2s-suite.yml b/recipes/atomic_recipes/recipe_scorecards_s2s-suite.yml index ae17f9cd1dbbcf2ada5b47a68219272d983aff6d..17c1acae318ffe254f8781c2def99cb73fe509d3 100644 --- a/recipes/atomic_recipes/recipe_scorecards_s2s-suite.yml +++ b/recipes/atomic_recipes/recipe_scorecards_s2s-suite.yml @@ -16,28 +16,36 @@ Analysis: Time: sdate: '0101' ## MMDD fcst_year: # Optional, int: Forecast year 'YYYY' - hcst_start: '1993' # Mandatory, int: Hindcast start year 'YYYY' + hcst_start: '2014' # Mandatory, int: Hindcast start year 'YYYY' hcst_end: '2016' # Mandatory, int: Hindcast end year 'YYYY' ftime_min: 1 # Mandatory, int: First leadtime time step in months - ftime_max: 6 # Mandatory, int: Last leadtime time step in months + ftime_max: 2 # Mandatory, int: Last leadtime time step in months Region: - latmin: -90 # Mandatory, int: minimum latitude - latmax: 90 # Mandatory, int: maximum latitude + latmin: 30 # Mandatory, int: minimum latitude + latmax: 35 # Mandatory, int: maximum latitude lonmin: 0 # Mandatory, int: minimum longitude - lonmax: 359.9 # Mandatory, int: maximum longitude + lonmax: 10 # Mandatory, int: maximum longitude Regrid: method: conservative # conservative for prlr, bilinear for tas, psl, sfcWind type: to_system Workflow: Calibration: method: raw # Mandatory, str: Calibration method. See docu. + save: 'none' Anomalies: compute: yes - cross_validation: no + cross_validation: no + save: 'none' Skill: - metric: mean_bias EnsCorr rps rpss crps crpss EnsSprErr # str: Skill metric or list of skill metrics. See docu. + metric: rps_syear rps_clim_syear crps_syear crps_clim_syear # str: Skill metric or list of skill metrics. See docu. + cross_validation: yes + save: 'all' + Statistics: + metric: cov std var + save: 'all' Probabilities: percentiles: [[1/3, 2/3], [1/10], [9/10]] # frac: Quantile thresholds. + save: 'none' Indicators: index: no ncores: 15 # Optional, int: number of cores, defaults to 1 @@ -46,5 +54,5 @@ Analysis: Run: Loglevel: INFO Terminal: yes - output_dir: /esarchive/scratch/nmilders/scorecards_data/to_system/cross_validation/tercile_cross_val/ECMWF-SEAS5/prlr/ + output_dir: /esarchive/scratch/nmilders/scorecards_data/test/ code_dir: /esarchive/scratch/nmilders/gitlab/git_clones/s2s-suite/ diff --git a/tools/check_recipe.R b/tools/check_recipe.R index e4c59a5592724837a65f71ff1fe475f8e33780d5..e68f0b901c514b2ba644d87eae42bbc3f69e4c5a 100644 --- a/tools/check_recipe.R +++ b/tools/check_recipe.R @@ -442,10 +442,12 @@ check_recipe <- function(recipe) { } # Skill - AVAILABLE_METRICS <- c("enscorr", "corr_individual_members", "rps", "rpss", - "frps", "frpss", "crps", "crpss", "bss10", "bss90", + AVAILABLE_METRICS <- c("enscorr", "corr_individual_members", "rps", "rps_syear", + "rpss", "frps", "frpss", "crps", "crps_syear", + "crpss", "bss10", "bss90", "mean_bias", "mean_bias_ss", "enssprerr", "rps_clim", - "rpss_clim", "enscorr_specs", "frps_specs", "rpss_specs", + "rps_clim_syear", "crps_clim", "crps_clim_syear", + "enscorr_specs", "frps_specs", "rpss_specs", "frpss_specs", "bss10_specs", "bss90_specs") if ("Skill" %in% names(recipe$Analysis$Workflow)) { if (is.null(recipe$Analysis$Workflow$Skill$metric)) { @@ -571,6 +573,28 @@ check_recipe <- function(recipe) { } else { sc_metrics <- strsplit(recipe$Analysis$Workflow$Scorecards$metric, ", | |,")[[1]] + if (recipe$Analysis$Workflow$Scorecards$metric_aggregation == 'score') { + if ('rpss' %in% tolower(sc_metrics)) { + if (!('rps_clim_syear' %in% requested_metrics)) { + requested_metrics <- c(requested_metrics, 'rps_clim_syear') + } + if (!('rps_syear' %in% requested_metrics)) { + requested_metrics <- c(requested_metrics, 'rps_syear') + } + } + if ('crpss' %in% tolower(sc_metrics)) { + if (!('crps_clim_syear' %in% requested_metrics)) { + requested_metrics <- c(requested_metrics, 'crps_clim_syear') + } + if (!('crps_syear' %in% requested_metrics)) { + requested_metrics <- c(requested_metrics, 'crps_syear') + } + } + if ('enscorr' %in% tolower(sc_metrics)) { + recipe$Analysis$Workflow$Statistics <- c('standard_deviation', 'covariance') + } + recipe$Analysis$Workflow$Skill$metric <- requested_metrics + } if (!all(tolower(sc_metrics) %in% tolower(requested_metrics))) { error(recipe$Run$logger, paste0("All of the metrics requested under 'Scorecards' must ", @@ -718,4 +742,5 @@ check_recipe <- function(recipe) { } else { info(recipe$Run$logger, "##### RECIPE CHECK SUCCESSFULL #####") } + return(recipe) } diff --git a/tools/prepare_outputs.R b/tools/prepare_outputs.R index fa4d23ed13190021478009dee6a44582f451d9fa..c7f3e3e60c6f9c1d843ef8a14790a9438c1dc0af 100644 --- a/tools/prepare_outputs.R +++ b/tools/prepare_outputs.R @@ -94,7 +94,7 @@ prepare_outputs <- function(recipe_file, warn(recipe$Run$logger, "Recipe checks disabled. The recipe will not be checked for errors.") } else { - check_recipe(recipe) + recipe <- check_recipe(recipe) } # Restructure the recipe to make the atomic recipe more readable if (restructure) {