#'Compute the Ranked Probability Score #' #'The Ranked Probability Score (RPS; Wilks, 2011) is defined as the sum of the #'squared differences between the cumulative forecast probabilities (computed #'from the ensemble members) and the observations (defined as 0% if the category #'did not happen and 100% if it happened). It can be used to evaluate the skill #'of multi-categorical probabilistic forecasts. The RPS ranges between 0 #'(perfect forecast) and n-1 (worst possible forecast), where n is the number of #'categories. In the case of a forecast divided into two categories (the lowest #'number of categories that a probabilistic forecast can have), the RPS #'corresponds to the Brier Score (BS; Wilks, 2011), therefore ranging between 0 #'and 1.\cr #'The function first calculates the probabilities for forecasts and observations, #'then use them to calculate RPS. Or, the probabilities of exp and obs can be #'provided directly to compute the score. If there is more than one dataset, RPS #'will be computed for each pair of exp and obs data. The fraction of acceptable #'NAs can be adjusted. #' #'@param exp A named numerical array of either the forecasts with at least time #' and member dimensions, or the probabilities with at least time and category #' dimensions. The probabilities can be generated by \code{s2dv::GetProbs}. #'@param obs A named numerical array of either the observation with at least #' time dimension, or the probabilities with at least time and category #' dimensions. The probabilities can be generated by \code{s2dv::GetProbs}. The #' dimensions must be the same as 'exp' except 'memb_dim' and 'dat_dim'. #'@param time_dim A character string indicating the name of the time dimension. #' The default value is 'sdate'. #'@param memb_dim A character string indicating the name of the member dimension #' to compute the probabilities of the forecast. The default value is 'member'. #' If the data are probabilities, set memb_dim as NULL. #'@param cat_dim A character string indicating the name of the category #' dimension that is needed when the exp and obs are probabilities. The default #' value is NULL, which means that the data are not probabilities. #'@param dat_dim A character string indicating the name of dataset dimension. #' The length of this dimension can be different between 'exp' and 'obs'. #' The default value is NULL. #'@param prob_thresholds A numeric vector of the relative thresholds (from 0 to #' 1) between the categories. The default value is c(1/3, 2/3), which #' corresponds to tercile equiprobable categories. #'@param indices_for_clim A vector of the indices to be taken along 'time_dim' #' for computing the thresholds between the probabilistic categories. If NULL, #' the whole period is used. The default value is NULL. #'@param Fair A logical indicating whether to compute the FairRPS (the #' potential RPS that the forecast would have with an infinite ensemble size). #' The default value is FALSE. #'@param nmemb A numeric value indicating the number of members used to compute the probabilities. This parameter is necessary when calculating FairRPS from probabilities. Default is NULL. #'@param weights A named numerical array of the weights for 'exp' probability #' calculation. If 'dat_dim' is NULL, the dimensions should include 'memb_dim' #' and 'time_dim'. Else, the dimension should also include 'dat_dim'. The #' default value is NULL. The ensemble should have at least 70 members or span #' at least 10 time steps and have more than 45 members if consistency between #' the weighted and unweighted methodologies is desired. #'@param cross.val A logical indicating whether to compute the thresholds #' between probabilistic categories in cross-validation. The default value is #' FALSE. #'@param return_mean A logical indicating whether to return the temporal mean #' of the RPS or not. If TRUE, the temporal mean is calculated along time_dim, #' if FALSE the time dimension is not aggregated. The default is TRUE. #'@param na.rm A logical or numeric value between 0 and 1. If it is numeric, it #' means the lower limit for the fraction of the non-NA values. 1 is equal to #' FALSE (no NA is acceptable), 0 is equal to TRUE (all NAs are acceptable). # The function returns NA if the fraction of non-NA values in the data is less #' than na.rm. Otherwise, RPS will be calculated. The default value is FALSE. #'@param ncores An integer indicating the number of cores to use for parallel #' computation. The default value is NULL. #' #'@return #'A numerical array of RPS with dimensions c(nexp, nobs, the rest dimensions of #''exp' except 'time_dim' and 'memb_dim' dimensions). nexp is the number of #'experiment (i.e., dat_dim in exp), and nobs is the number of observation #'(i.e., dat_dim in obs). If dat_dim is NULL, nexp and nobs are omitted. #' #'@references #'Wilks, 2011; https://doi.org/10.1016/B978-0-12-385022-5.00008-7 #' #'@examples #'# Use synthetic data #'exp <- array(rnorm(1000), dim = c(lat = 3, lon = 2, member = 10, sdate = 50)) #'obs <- array(rnorm(1000), dim = c(lat = 3, lon = 2, sdate = 50)) #'res <- RPS(exp = exp, obs = obs) #'# Use probabilities as inputs #'exp_probs <- GetProbs(exp, time_dim = 'sdate', memb_dim = 'member') #'obs_probs <- GetProbs(obs, time_dim = 'sdate', memb_dim = NULL) #'res2 <- RPS(exp = exp_probs, obs = obs_probs, memb_dim = NULL, cat_dim = 'bin') #' #' #'@import multiApply #'@importFrom easyVerification convert2prob #'@export RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NULL, dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, Fair = FALSE, nmemb = NULL, weights = NULL, cross.val = FALSE, return_mean = TRUE, na.rm = FALSE, ncores = NULL) { # Check inputs ## exp and obs (1) if (!is.array(exp) | !is.numeric(exp)) stop('Parameter "exp" must be a numeric array.') if (!is.array(obs) | !is.numeric(obs)) stop('Parameter "obs" must be a numeric array.') if (any(is.null(names(dim(exp)))) | any(nchar(names(dim(exp))) == 0) | any(is.null(names(dim(obs)))) | any(nchar(names(dim(obs))) == 0)) { stop("Parameter 'exp' and 'obs' must have dimension names.") } ## time_dim if (!is.character(time_dim) | length(time_dim) != 1) stop('Parameter "time_dim" must be a character string.') if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) { stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.") } ## memb_dim & cat_dim if (is.null(memb_dim) + is.null(cat_dim) != 1) { stop("Only one of the two parameters 'memb_dim' and 'cat_dim' can have value.") } ## memb_dim if (!is.null(memb_dim)) { if (!is.character(memb_dim) | length(memb_dim) > 1) { stop("Parameter 'memb_dim' must be a character string.") } if (!memb_dim %in% names(dim(exp))) { stop("Parameter 'memb_dim' is not found in 'exp' dimension.") } } ## cat_dim if (!is.null(cat_dim)) { if (!is.character(cat_dim) | length(cat_dim) > 1) { stop("Parameter 'cat_dim' must be a character string.") } if (!cat_dim %in% names(dim(exp)) | !cat_dim %in% names(dim(obs))) { stop("Parameter 'cat_dim' is not found in 'exp' or 'obs' dimension.") } } ## dat_dim if (!is.null(dat_dim)) { if (!is.character(dat_dim) | length(dat_dim) > 1) { stop("Parameter 'dat_dim' must be a character string.") } if (!dat_dim %in% names(dim(exp)) | !dat_dim %in% names(dim(obs))) { stop("Parameter 'dat_dim' is not found in 'exp' or 'obs' dimension.", " Set it as NULL if there is no dataset dimension.") } } ## exp and obs (2) name_exp <- sort(names(dim(exp))) name_obs <- sort(names(dim(obs))) if (!is.null(memb_dim)) { name_exp <- name_exp[-which(name_exp == memb_dim)] if (memb_dim %in% name_obs) { name_obs <- name_obs[-which(name_obs == memb_dim)] } } if (!is.null(dat_dim)) { name_exp <- name_exp[-which(name_exp == dat_dim)] name_obs <- name_obs[-which(name_obs == dat_dim)] } if (!identical(length(name_exp), length(name_obs)) | !identical(dim(exp)[name_exp], dim(obs)[name_obs])) { stop("Parameter 'exp' and 'obs' must have same length of ", "all dimensions except 'memb_dim' and 'dat_dim'.") } ## prob_thresholds if (!is.numeric(prob_thresholds) | !is.vector(prob_thresholds) | any(prob_thresholds <= 0) | any(prob_thresholds >= 1)) { stop("Parameter 'prob_thresholds' must be a numeric vector between 0 and 1.") } ## indices_for_clim if (is.null(indices_for_clim)) { indices_for_clim <- seq_len(dim(obs)[time_dim]) } else { if (!is.numeric(indices_for_clim) | !is.vector(indices_for_clim)) { stop("Parameter 'indices_for_clim' must be NULL or a numeric vector.") } else if (length(indices_for_clim) > dim(obs)[time_dim] | max(indices_for_clim) > dim(obs)[time_dim] | any(indices_for_clim) < 1) { stop("Parameter 'indices_for_clim' should be the indices of 'time_dim'.") } } ## Fair if (!is.logical(Fair) | length(Fair) > 1) { stop("Parameter 'Fair' must be either TRUE or FALSE.") } if (Fair) { if (!is.null(cat_dim)) { if (cat_dim %in% names(dim(exp))) { if (is.null(nmemb)) { stop("Parameter 'nmemb' necessary to compute Fair", "score from probabilities") } } } } ## return_mean if (!is.logical(return_mean) | length(return_mean) > 1) { stop("Parameter 'return_mean' must be either TRUE or FALSE.") } ## cross.val if (!is.logical(cross.val) | length(cross.val) > 1) { stop("Parameter 'cross.val' must be either TRUE or FALSE.") } ## weights if (!is.null(weights) & is.null(cat_dim)) { if (!is.array(weights) | !is.numeric(weights)) stop("Parameter 'weights' must be a named numeric array.") if (is.null(dat_dim)) { if (length(dim(weights)) != 2 | !all(names(dim(weights)) %in% c(memb_dim, time_dim))) stop("Parameter 'weights' must have two dimensions with the names of ", "'memb_dim' and 'time_dim'.") if (dim(weights)[memb_dim] != dim(exp)[memb_dim] | dim(weights)[time_dim] != dim(exp)[time_dim]) { stop("Parameter 'weights' must have the same dimension lengths ", "as 'memb_dim' and 'time_dim' in 'exp'.") } weights <- Reorder(weights, c(time_dim, memb_dim)) } else { if (length(dim(weights)) != 3 | !all(names(dim(weights)) %in% c(memb_dim, time_dim, dat_dim))) stop("Parameter 'weights' must have three dimensions with the names of ", "'memb_dim', 'time_dim' and 'dat_dim'.") if (dim(weights)[memb_dim] != dim(exp)[memb_dim] | dim(weights)[time_dim] != dim(exp)[time_dim] | dim(weights)[dat_dim] != dim(exp)[dat_dim]) { stop("Parameter 'weights' must have the same dimension lengths ", "as 'memb_dim', 'time_dim' and 'dat_dim' in 'exp'.") } weights <- Reorder(weights, c(time_dim, memb_dim, dat_dim)) } } else if (!is.null(weights) & !is.null(cat_dim)) { .warning(paste0("Parameter 'exp' and 'obs' are probabilities already, so parameter ", "'weights' is not used. Change 'weights' to NULL.")) weights <- NULL } ## na.rm if (!isTRUE(na.rm) & !isFALSE(na.rm) & !(is.numeric(na.rm) & na.rm >= 0 & na.rm <= 1)) { stop('"na.rm" should be TRUE, FALSE or a numeric between 0 and 1') } ## ncores if (!is.null(ncores)) { if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | length(ncores) > 1) { stop("Parameter 'ncores' must be either NULL or a positive integer.") } } ############################### # Compute RPS ## Decide target_dims if (!is.null(memb_dim)) { target_dims_exp <- c(time_dim, memb_dim, dat_dim) if (!memb_dim %in% names(dim(obs))) { target_dims_obs <- c(time_dim, dat_dim) } else { target_dims_obs <- c(time_dim, memb_dim, dat_dim) } } else { # cat_dim target_dims_exp <- target_dims_obs <- c(time_dim, cat_dim, dat_dim) } rps <- Apply(data = list(exp = exp, obs = obs), target_dims = list(exp = target_dims_exp, obs = target_dims_obs), fun = .RPS, dat_dim = dat_dim, time_dim = time_dim, memb_dim = memb_dim, cat_dim = cat_dim, prob_thresholds = prob_thresholds, nmemb = nmemb, indices_for_clim = indices_for_clim, Fair = Fair, weights = weights, cross.val = cross.val, na.rm = na.rm, ncores = ncores)$output1 if (return_mean) { rps <- MeanDims(rps, time_dim, na.rm = TRUE) } else { rps <- rps } return(rps) } .RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NULL, dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, Fair = FALSE, nmemb = NULL, weights = NULL, cross.val = FALSE, na.rm = FALSE) { #--- if memb_dim: # exp: [sdate, memb, (dat)] # obs: [sdate, (memb), (dat)] # weights: NULL or same as exp #--- if cat_dim: # exp: [sdate, bin, (dat)] # obs: [sdate, bin, (dat)] # Adjust dimensions to be [sdate, memb, dat] for both exp and obs if (!is.null(memb_dim)) { if (!memb_dim %in% names(dim(obs))) { obs <- InsertDim(obs, posdim = 2, lendim = 1, name = memb_dim) } } if (is.null(dat_dim)) { nexp <- 1 nobs <- 1 dim(exp) <- c(dim(exp), nexp = nexp) dim(obs) <- c(dim(obs), nobs = nobs) if (!is.null(weights)) dim(weights) <- c(dim(weights), nexp = nexp) } else { nexp <- as.numeric(dim(exp)[dat_dim]) nobs <- as.numeric(dim(obs)[dat_dim]) } rps <- array(dim = c(dim(exp)[time_dim], nexp = nexp, nobs = nobs)) for (i in 1:nexp) { for (j in 1:nobs) { exp_data <- exp[, , i] obs_data <- obs[, , j] if (is.null(dim(exp_data))) dim(exp_data) <- c(dim(exp)[1:2]) if (is.null(dim(obs_data))) dim(obs_data) <- c(dim(obs)[1:2]) # Find the fraction of NAs ## If any member/bin is NA at this time step, it is not good value. exp_mean <- rowMeans(exp_data) obs_mean <- rowMeans(obs_data) good_values <- !is.na(exp_mean) & !is.na(obs_mean) if (isTRUE(na.rm)) { f_NAs <- 0 } else if (isFALSE(na.rm)) { f_NAs <- 1 } else { f_NAs <- na.rm } if (f_NAs <= sum(good_values) / length(obs_mean)) { exp_data <- exp_data[good_values, , drop = F] obs_data <- obs_data[good_values, , drop = F] # If the data inputs are forecast/observation, calculate probabilities if (is.null(cat_dim)) { if (!is.null(weights)) { weights_data <- weights[which(good_values), , i] if (is.null(dim(weights_data))) dim(weights_data) <- c(dim(weights)[1:2]) } else { weights_data <- weights #NULL } # Subset indices_for_clim dum <- match(indices_for_clim, which(good_values)) good_indices_for_clim <- dum[!is.na(dum)] exp_probs <- .GetProbs(data = exp_data, indices_for_quantiles = good_indices_for_clim, prob_thresholds = prob_thresholds, weights = weights_data, cross.val = cross.val) # exp_probs: [bin, sdate] obs_probs <- .GetProbs(data = obs_data, indices_for_quantiles = good_indices_for_clim, prob_thresholds = prob_thresholds, weights = NULL, cross.val = cross.val) # obs_probs: [bin, sdate] } else { # inputs are probabilities already if (all(names(dim(exp_data)) == c(time_dim, memb_dim)) || all(names(dim(exp_data)) == c(time_dim, cat_dim))) { exp_probs <- t(exp_data) obs_probs <- t(obs_data) } } probs_exp_cumsum <- apply(exp_probs, 2, cumsum) probs_obs_cumsum <- apply(obs_probs, 2, cumsum) # rps: [sdate, nexp, nobs] rps [good_values, i, j] <- colSums((probs_exp_cumsum - probs_obs_cumsum)^2) if (Fair) { # FairRPS if (!is.null(memb_dim)) { if (memb_dim %in% names(dim(exp))) { ## adjustment <- rowSums(-1 * (1/R - 1/R.new) * ens.cum * (R - ens.cum)/R/(R - 1)) ## [formula taken from SpecsVerification::EnsRps] R <- dim(exp)[2] #memb } } else { R <- nmemb } warning("Applying fair correction.") adjustment <- (-1) / (R - 1) * probs_exp_cumsum * (1 - probs_exp_cumsum) adjustment <- colSums(adjustment) rps[, i, j] <- rps[, i, j] + adjustment } } else { ## not enough values different from NA rps[, i, j] <- NA_real_ } } } if (is.null(dat_dim)) { dim(rps) <- dim(exp)[time_dim] } return(rps) }