RPS.R

#'Compute the Ranked Probability Score
#'
#'The Ranked Probability Score (RPS; Wilks, 2011) is defined as the sum of the
#'squared differences between the cumulative forecast probabilities (computed 
#'from the ensemble members) and the observations (defined as 0% if the category
#'did not happen and 100% if it happened). It can be used to evaluate the skill 
#'of multi-categorical probabilistic forecasts. The RPS ranges between 0 
#'(perfect forecast) and n-1 (worst possible forecast), where n is the number of
#'categories. In the case of a forecast divided into two categories (the lowest
#'number of categories that a probabilistic forecast can have), the RPS 
#'corresponds to the Brier Score (BS; Wilks, 2011), therefore ranging between 0 
#'and 1.\cr 
#'The function first calculates the probabilities for forecasts and observations, 
#'then use them to calculate RPS. Or, the probabilities of exp and obs can be 
#'provided directly to compute the score. If there is more than one dataset, RPS
#'will be computed for each pair of exp and obs data. The fraction of acceptable
#'NAs can be adjusted.
#'
#'@param exp A named numerical array of either the forecasts with at least time
#'  and member dimensions, or the probabilities with at least time and category
#'  dimensions. The probabilities can be generated by \code{s2dv::GetProbs}.
#'@param obs A named numerical array of either the observation with at least 
#'  time dimension, or the probabilities with at least time and category 
#'  dimensions. The probabilities can be generated by \code{s2dv::GetProbs}. The
#'  dimensions must be the same as 'exp' except 'memb_dim' and 'dat_dim'.
#'@param time_dim A character string indicating the name of the time dimension.
#'  The default value is 'sdate'.
#'@param memb_dim A character string indicating the name of the member dimension
#'  to compute the probabilities of the forecast. The default value is 'member'.
#'  If the data are probabilities, set memb_dim as NULL.
#'@param cat_dim A character string indicating the name of the category 
#'  dimension that is needed when the exp and obs are probabilities. The default
#'  value is NULL, which means that the data are not probabilities.
#'@param dat_dim A character string indicating the name of dataset dimension. 
#'  The length of this dimension can be different between 'exp' and 'obs'. 
#'  The default value is NULL.
#'@param prob_thresholds A numeric vector of the relative thresholds (from 0 to
#'  1) between the categories. The default value is c(1/3, 2/3), which 
#'  corresponds to tercile equiprobable categories.
#'@param indices_for_clim A vector of the indices to be taken along 'time_dim' 
#'  for computing the thresholds between the probabilistic categories. If NULL,
#'  the whole period is used. The default value is NULL.
#'@param Fair A logical indicating whether to compute the FairRPS (the 
#'  potential RPS that the forecast would have with an infinite ensemble size).
#'  The default value is FALSE.
#'@param nmemb A numeric value indicating the number of members used to compute the probabilities. This parameter is necessary when calculating FairRPS from probabilities. Default is NULL.
#'@param weights A named numerical array of the weights for 'exp' probability 
#'  calculation. If 'dat_dim' is NULL, the dimensions should include 'memb_dim' 
#'  and 'time_dim'. Else, the dimension should also include 'dat_dim'. The 
#'  default value is NULL. The ensemble should have at least 70 members or span 
#'  at least 10 time steps and have more than 45 members if consistency between
#'  the weighted and unweighted methodologies is desired. 
#'@param cross.val A logical indicating whether to compute the thresholds
#'  between probabilistic categories in cross-validation. The default value is
#'  FALSE.
#'@param return_mean A logical indicating whether to return the temporal mean
#'  of the RPS or not. If TRUE, the temporal mean is calculated along time_dim,
#'  if FALSE the time dimension is not aggregated. The default is TRUE.
#'@param na.rm A logical or numeric value between 0 and 1. If it is numeric, it 
#'  means the lower limit for the fraction of the non-NA values. 1 is equal to 
#'  FALSE (no NA is acceptable), 0 is equal to TRUE (all NAs are acceptable). 
#   The function returns NA if the fraction of non-NA values in the data is less
#'  than na.rm. Otherwise, RPS will be calculated. The default value is FALSE.
#'@param ncores An integer indicating the number of cores to use for parallel 
#'  computation. The default value is NULL.
#'
#'@return 
#'A numerical array of RPS with dimensions c(nexp, nobs, the rest dimensions of 
#''exp' except 'time_dim' and 'memb_dim' dimensions). nexp is the number of 
#'experiment (i.e., dat_dim in exp), and nobs is the number of observation 
#'(i.e., dat_dim in obs). If dat_dim is NULL, nexp and nobs are omitted.
#'
#'@references 
#'Wilks, 2011; https://doi.org/10.1016/B978-0-12-385022-5.00008-7
#'
#'@examples
#'# Use synthetic data
#'exp <- array(rnorm(1000), dim = c(lat = 3, lon = 2, member = 10, sdate = 50))
#'obs <- array(rnorm(1000), dim = c(lat = 3, lon = 2, sdate = 50))
#'res <- RPS(exp = exp, obs = obs)
#'# Use probabilities as inputs
#'exp_probs <- GetProbs(exp, time_dim = 'sdate', memb_dim = 'member')
#'obs_probs <- GetProbs(obs, time_dim = 'sdate', memb_dim = NULL)
#'res2 <- RPS(exp = exp_probs, obs = obs_probs, memb_dim = NULL, cat_dim = 'bin')
#'
#'
#'@import multiApply
#'@importFrom easyVerification convert2prob
#'@export
RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NULL, 
                dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, 
                Fair = FALSE, nmemb = NULL, weights = NULL,
		cross.val = FALSE, return_mean = TRUE,
                na.rm = FALSE, ncores = NULL) {
  
  # Check inputs
  ## exp and obs (1)
  if (!is.array(exp) | !is.numeric(exp))
    stop('Parameter "exp" must be a numeric array.')
  if (!is.array(obs) | !is.numeric(obs))
    stop('Parameter "obs" must be a numeric array.')
  if (any(is.null(names(dim(exp)))) | any(nchar(names(dim(exp))) == 0) |
      any(is.null(names(dim(obs)))) | any(nchar(names(dim(obs))) == 0)) {
    stop("Parameter 'exp' and 'obs' must have dimension names.")
  }
  ## time_dim
  if (!is.character(time_dim) | length(time_dim) != 1)
    stop('Parameter "time_dim" must be a character string.')
  if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) {
    stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.")
  }
  ## memb_dim & cat_dim
  if (is.null(memb_dim) + is.null(cat_dim) != 1) {
    stop("Only one of the two parameters 'memb_dim' and 'cat_dim' can have value.")
  }
  ## memb_dim
  if (!is.null(memb_dim)) {
    if (!is.character(memb_dim) | length(memb_dim) > 1) {
      stop("Parameter 'memb_dim' must be a character string.")
    }
    if (!memb_dim %in% names(dim(exp))) {
      stop("Parameter 'memb_dim' is not found in 'exp' dimension.")
    }
  }
  ## cat_dim
  if (!is.null(cat_dim)) {
    if (!is.character(cat_dim) | length(cat_dim) > 1) {
      stop("Parameter 'cat_dim' must be a character string.")
    }
    if (!cat_dim %in% names(dim(exp)) | !cat_dim %in% names(dim(obs))) {
      stop("Parameter 'cat_dim' is not found in 'exp' or 'obs' dimension.")
    }
  }
  ## dat_dim
  if (!is.null(dat_dim)) {
    if (!is.character(dat_dim) | length(dat_dim) > 1) {
      stop("Parameter 'dat_dim' must be a character string.")
    }
    if (!dat_dim %in% names(dim(exp)) | !dat_dim %in% names(dim(obs))) {
      stop("Parameter 'dat_dim' is not found in 'exp' or 'obs' dimension.",
           " Set it as NULL if there is no dataset dimension.")
    }
  }
  ## exp and obs (2)
  name_exp <- sort(names(dim(exp)))
  name_obs <- sort(names(dim(obs)))
  if (!is.null(memb_dim)) {
    name_exp <- name_exp[-which(name_exp == memb_dim)]
    if (memb_dim %in% name_obs) {
      name_obs <- name_obs[-which(name_obs == memb_dim)]
    }
  }
  if (!is.null(dat_dim)) {
    name_exp <- name_exp[-which(name_exp == dat_dim)]
    name_obs <- name_obs[-which(name_obs == dat_dim)]
  }
  if (!identical(length(name_exp), length(name_obs)) |
      !identical(dim(exp)[name_exp], dim(obs)[name_obs])) {
    stop("Parameter 'exp' and 'obs' must have same length of ",
         "all dimensions except 'memb_dim' and 'dat_dim'.")
  }
  ## prob_thresholds
  if (!is.numeric(prob_thresholds) | !is.vector(prob_thresholds) |
      any(prob_thresholds <= 0) | any(prob_thresholds >= 1)) {
    stop("Parameter 'prob_thresholds' must be a numeric vector between 0 and 1.")
  }
  ## indices_for_clim
  if (is.null(indices_for_clim)) {
    indices_for_clim <- seq_len(dim(obs)[time_dim])
  } else {
    if (!is.numeric(indices_for_clim) | !is.vector(indices_for_clim)) {
      stop("Parameter 'indices_for_clim' must be NULL or a numeric vector.")
    } else if (length(indices_for_clim) > dim(obs)[time_dim] |
               max(indices_for_clim) > dim(obs)[time_dim] |
               any(indices_for_clim) < 1) {
      stop("Parameter 'indices_for_clim' should be the indices of 'time_dim'.")
    }
  }
  ## Fair
  if (!is.logical(Fair) | length(Fair) > 1) {
    stop("Parameter 'Fair' must be either TRUE or FALSE.")
  }
  if (Fair) {
    if (!is.null(cat_dim)) {
      if (cat_dim %in% names(dim(exp))) {
        if (is.null(nmemb)) {
  	  stop("Parameter 'nmemb' necessary to compute Fair",
	       "score from probabilities")
        }
      }
    }
  }
  ## return_mean
  if (!is.logical(return_mean) | length(return_mean) > 1) {
    stop("Parameter 'return_mean' must be either TRUE or FALSE.")
  }
  ## cross.val
  if (!is.logical(cross.val)  | length(cross.val) > 1) {
    stop("Parameter 'cross.val' must be either TRUE or FALSE.")
  }
  ## weights
  if (!is.null(weights) & is.null(cat_dim)) {
    if (!is.array(weights) | !is.numeric(weights))
      stop("Parameter 'weights' must be a named numeric array.")
    if (is.null(dat_dim)) {
      if (length(dim(weights)) != 2 | !all(names(dim(weights)) %in% c(memb_dim, time_dim)))
        stop("Parameter 'weights' must have two dimensions with the names of ",
             "'memb_dim' and 'time_dim'.")
      if (dim(weights)[memb_dim] != dim(exp)[memb_dim] |
          dim(weights)[time_dim] != dim(exp)[time_dim]) {
        stop("Parameter 'weights' must have the same dimension lengths ", 
             "as 'memb_dim' and 'time_dim' in 'exp'.")
      }
      weights <- Reorder(weights, c(time_dim, memb_dim))
      
    } else {
      if (length(dim(weights)) != 3 | !all(names(dim(weights)) %in% c(memb_dim, time_dim, dat_dim)))
        stop("Parameter 'weights' must have three dimensions with the names of ",
             "'memb_dim', 'time_dim' and 'dat_dim'.")
      if (dim(weights)[memb_dim] != dim(exp)[memb_dim] |
          dim(weights)[time_dim] != dim(exp)[time_dim] |
          dim(weights)[dat_dim] != dim(exp)[dat_dim]) {
        stop("Parameter 'weights' must have the same dimension lengths ", 
             "as 'memb_dim', 'time_dim' and 'dat_dim' in 'exp'.")
      }
      weights <- Reorder(weights, c(time_dim, memb_dim, dat_dim))
      
    }
  } else if (!is.null(weights) & !is.null(cat_dim)) {
    .warning(paste0("Parameter 'exp' and 'obs' are probabilities already, so parameter ",
                    "'weights' is not used. Change 'weights' to NULL."))
    weights <- NULL
  }
  ## na.rm
  if (!isTRUE(na.rm) & !isFALSE(na.rm) & !(is.numeric(na.rm) & na.rm >= 0 & na.rm <= 1)) {
    stop('"na.rm" should be TRUE, FALSE or a numeric between 0 and 1')
  }
  ## ncores
  if (!is.null(ncores)) {
    if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 |
        length(ncores) > 1) {
      stop("Parameter 'ncores' must be either NULL or a positive integer.")
    }
  }
  
  ###############################
  
  # Compute RPS
  
  ## Decide target_dims
  if (!is.null(memb_dim)) {
    target_dims_exp <- c(time_dim, memb_dim, dat_dim)
    if (!memb_dim %in% names(dim(obs))) {
      target_dims_obs <- c(time_dim, dat_dim)
    } else {
      target_dims_obs <- c(time_dim, memb_dim, dat_dim)
    }
  } else {  # cat_dim
    target_dims_exp <- target_dims_obs <- c(time_dim, cat_dim, dat_dim)
  }
  
  rps <- Apply(data = list(exp = exp, obs = obs), 
               target_dims = list(exp = target_dims_exp, 
                                  obs = target_dims_obs),
               fun = .RPS,
               dat_dim = dat_dim, time_dim = time_dim,
               memb_dim = memb_dim, cat_dim = cat_dim,
               prob_thresholds = prob_thresholds, nmemb = nmemb,
               indices_for_clim = indices_for_clim, Fair = Fair,
               weights = weights, cross.val = cross.val, 
               na.rm = na.rm, ncores = ncores)$output1
  
  if (return_mean) {
    rps <- MeanDims(rps, time_dim, na.rm = TRUE)
  } else {
    rps <- rps
  }
  
  return(rps)
}


.RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NULL,
                 dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, 
                 Fair = FALSE, nmemb = NULL, weights = NULL,
		 cross.val = FALSE, na.rm = FALSE) {
  #--- if memb_dim: 
  # exp: [sdate, memb, (dat)]
  # obs: [sdate, (memb), (dat)]
  # weights: NULL or same as exp
  #--- if cat_dim:
  # exp: [sdate, bin, (dat)]
  # obs: [sdate, bin, (dat)]
  
  # Adjust dimensions to be [sdate, memb, dat] for both exp and obs
  if (!is.null(memb_dim)) {
    if (!memb_dim %in% names(dim(obs))) {
      obs <- InsertDim(obs, posdim = 2, lendim = 1, name = memb_dim)
    }
  }
  
  if (is.null(dat_dim)) {
    nexp <- 1
    nobs <- 1
    dim(exp) <- c(dim(exp), nexp = nexp)
    dim(obs) <- c(dim(obs), nobs = nobs)
    if (!is.null(weights)) dim(weights) <- c(dim(weights), nexp = nexp)
  } else {
    nexp <- as.numeric(dim(exp)[dat_dim])
    nobs <- as.numeric(dim(obs)[dat_dim])
  }
  
  rps <- array(dim = c(dim(exp)[time_dim], nexp = nexp, nobs = nobs))
  
  for (i in 1:nexp) {
    for (j in 1:nobs) {
      exp_data <- exp[, , i]
      obs_data <- obs[, , j]
      
      if (is.null(dim(exp_data))) dim(exp_data) <- c(dim(exp)[1:2])
      if (is.null(dim(obs_data))) dim(obs_data) <- c(dim(obs)[1:2])
      
      # Find the fraction of NAs
      ## If any member/bin is NA at this time step, it is not good value.
      exp_mean <- rowMeans(exp_data)
      obs_mean <- rowMeans(obs_data)
      good_values <- !is.na(exp_mean) & !is.na(obs_mean)
      
      if (isTRUE(na.rm)) {
        f_NAs <- 0
      } else if (isFALSE(na.rm)) {
        f_NAs <- 1
      } else {
        f_NAs <- na.rm
      }
      
      if (f_NAs <= sum(good_values) / length(obs_mean)) {
        
        exp_data <- exp_data[good_values, , drop = F]
        obs_data <- obs_data[good_values, , drop = F]
        
        # If the data inputs are forecast/observation, calculate probabilities
        if (is.null(cat_dim)) {
          if (!is.null(weights)) {
            weights_data <- weights[which(good_values), , i]
            if (is.null(dim(weights_data))) dim(weights_data) <- c(dim(weights)[1:2])
          } else {
            weights_data <- weights #NULL
          }
          
          # Subset indices_for_clim
          dum <- match(indices_for_clim, which(good_values))
          good_indices_for_clim <- dum[!is.na(dum)]
          
          exp_probs <- .GetProbs(data = exp_data, indices_for_quantiles = good_indices_for_clim, 
                                 prob_thresholds = prob_thresholds, weights = weights_data, 
                                 cross.val = cross.val)
          # exp_probs: [bin, sdate]
          obs_probs <- .GetProbs(data = obs_data, indices_for_quantiles = good_indices_for_clim, 
                                 prob_thresholds = prob_thresholds, weights = NULL, 
                                 cross.val = cross.val)
          # obs_probs: [bin, sdate]
          
        } else { # inputs are probabilities already
	  if (all(names(dim(exp_data)) == c(time_dim, memb_dim)) || 
	      all(names(dim(exp_data)) == c(time_dim, cat_dim))) {
            exp_probs <- t(exp_data)
            obs_probs <- t(obs_data)
	  } 
        }
        
        probs_exp_cumsum <- apply(exp_probs, 2, cumsum)
        probs_obs_cumsum <- apply(obs_probs, 2, cumsum)
        
        # rps: [sdate, nexp, nobs]
        rps [good_values, i, j] <- colSums((probs_exp_cumsum - probs_obs_cumsum)^2)
        if (Fair) { # FairRPS
          if (!is.null(memb_dim)) {
   	    if (memb_dim %in% names(dim(exp))) {
              ## adjustment <- rowSums(-1 * (1/R - 1/R.new) * ens.cum * (R - ens.cum)/R/(R - 1))
              ## [formula taken from SpecsVerification::EnsRps]
              R <- dim(exp)[2]  #memb
            }
	  } else {
	    R <- nmemb
	  }
	  warning("Applying fair correction.")
          adjustment <- (-1) / (R - 1) * probs_exp_cumsum * (1 - probs_exp_cumsum)
          adjustment <- colSums(adjustment)
          rps[, i, j] <- rps[, i, j] + adjustment
        }
        
      } else { ## not enough values different from NA
        
        rps[, i, j] <- NA_real_
        
      }
      
    }
  }
  
  if (is.null(dat_dim)) {
    dim(rps) <- dim(exp)[time_dim]
  }
  
  return(rps)
}