RPS.R 16.8 KB
Newer Older
nperez's avatar
nperez committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408
#'Compute the Ranked Probability Score
#'
#'The Ranked Probability Score (RPS; Wilks, 2011) is defined as the sum of the
#'squared differences between the cumulative forecast probabilities (computed 
#'from the ensemble members) and the observations (defined as 0% if the category
#'did not happen and 100% if it happened). It can be used to evaluate the skill 
#'of multi-categorical probabilistic forecasts. The RPS ranges between 0 
#'(perfect forecast) and n-1 (worst possible forecast), where n is the number of
#'categories. In the case of a forecast divided into two categories (the lowest
#'number of categories that a probabilistic forecast can have), the RPS 
#'corresponds to the Brier Score (BS; Wilks, 2011), therefore ranging between 0 
#'and 1.\cr 
#'The function first calculates the probabilities for forecasts and observations, 
#'then use them to calculate RPS. Or, the probabilities of exp and obs can be 
#'provided directly to compute the score. If there is more than one dataset, RPS
#'will be computed for each pair of exp and obs data. The fraction of acceptable
#'NAs can be adjusted.
#'
#'@param exp A named numerical array of either the forecasts with at least time
#'  and member dimensions, or the probabilities with at least time and category
#'  dimensions. The probabilities can be generated by \code{s2dv::GetProbs}.
#'@param obs A named numerical array of either the observation with at least 
#'  time dimension, or the probabilities with at least time and category 
#'  dimensions. The probabilities can be generated by \code{s2dv::GetProbs}. The
#'  dimensions must be the same as 'exp' except 'memb_dim' and 'dat_dim'.
#'@param time_dim A character string indicating the name of the time dimension.
#'  The default value is 'sdate'.
#'@param memb_dim A character string indicating the name of the member dimension
#'  to compute the probabilities of the forecast. The default value is 'member'.
#'  If the data are probabilities, set memb_dim as NULL.
#'@param cat_dim A character string indicating the name of the category 
#'  dimension that is needed when the exp and obs are probabilities. The default
#'  value is NULL, which means that the data are not probabilities.
#'@param dat_dim A character string indicating the name of dataset dimension. 
#'  The length of this dimension can be different between 'exp' and 'obs'. 
#'  The default value is NULL.
#'@param prob_thresholds A numeric vector of the relative thresholds (from 0 to
#'  1) between the categories. The default value is c(1/3, 2/3), which 
#'  corresponds to tercile equiprobable categories.
#'@param indices_for_clim A vector of the indices to be taken along 'time_dim' 
#'  for computing the thresholds between the probabilistic categories. If NULL,
#'  the whole period is used. The default value is NULL.
#'@param Fair A logical indicating whether to compute the FairRPS (the 
#'  potential RPS that the forecast would have with an infinite ensemble size).
#'  The default value is FALSE.
#'@param nmemb A numeric value indicating the number of members used to compute the probabilities. This parameter is necessary when calculating FairRPS from probabilities. Default is NULL.
#'@param weights A named numerical array of the weights for 'exp' probability 
#'  calculation. If 'dat_dim' is NULL, the dimensions should include 'memb_dim' 
#'  and 'time_dim'. Else, the dimension should also include 'dat_dim'. The 
#'  default value is NULL. The ensemble should have at least 70 members or span 
#'  at least 10 time steps and have more than 45 members if consistency between
#'  the weighted and unweighted methodologies is desired. 
#'@param cross.val A logical indicating whether to compute the thresholds
#'  between probabilistic categories in cross-validation. The default value is
#'  FALSE.
#'@param return_mean A logical indicating whether to return the temporal mean
#'  of the RPS or not. If TRUE, the temporal mean is calculated along time_dim,
#'  if FALSE the time dimension is not aggregated. The default is TRUE.
#'@param na.rm A logical or numeric value between 0 and 1. If it is numeric, it 
#'  means the lower limit for the fraction of the non-NA values. 1 is equal to 
#'  FALSE (no NA is acceptable), 0 is equal to TRUE (all NAs are acceptable). 
#   The function returns NA if the fraction of non-NA values in the data is less
#'  than na.rm. Otherwise, RPS will be calculated. The default value is FALSE.
#'@param ncores An integer indicating the number of cores to use for parallel 
#'  computation. The default value is NULL.
#'
#'@return 
#'A numerical array of RPS with dimensions c(nexp, nobs, the rest dimensions of 
#''exp' except 'time_dim' and 'memb_dim' dimensions). nexp is the number of 
#'experiment (i.e., dat_dim in exp), and nobs is the number of observation 
#'(i.e., dat_dim in obs). If dat_dim is NULL, nexp and nobs are omitted.
#'
#'@references 
#'Wilks, 2011; https://doi.org/10.1016/B978-0-12-385022-5.00008-7
#'
#'@examples
#'# Use synthetic data
#'exp <- array(rnorm(1000), dim = c(lat = 3, lon = 2, member = 10, sdate = 50))
#'obs <- array(rnorm(1000), dim = c(lat = 3, lon = 2, sdate = 50))
#'res <- RPS(exp = exp, obs = obs)
#'# Use probabilities as inputs
#'exp_probs <- GetProbs(exp, time_dim = 'sdate', memb_dim = 'member')
#'obs_probs <- GetProbs(obs, time_dim = 'sdate', memb_dim = NULL)
#'res2 <- RPS(exp = exp_probs, obs = obs_probs, memb_dim = NULL, cat_dim = 'bin')
#'
#'
#'@import multiApply
#'@importFrom easyVerification convert2prob
#'@export
RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NULL, 
                dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, 
                Fair = FALSE, nmemb = NULL, weights = NULL,
		cross.val = FALSE, return_mean = TRUE,
                na.rm = FALSE, ncores = NULL) {
  
  # Check inputs
  ## exp and obs (1)
  if (!is.array(exp) | !is.numeric(exp))
    stop('Parameter "exp" must be a numeric array.')
  if (!is.array(obs) | !is.numeric(obs))
    stop('Parameter "obs" must be a numeric array.')
  if (any(is.null(names(dim(exp)))) | any(nchar(names(dim(exp))) == 0) |
      any(is.null(names(dim(obs)))) | any(nchar(names(dim(obs))) == 0)) {
    stop("Parameter 'exp' and 'obs' must have dimension names.")
  }
  ## time_dim
  if (!is.character(time_dim) | length(time_dim) != 1)
    stop('Parameter "time_dim" must be a character string.')
  if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) {
    stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.")
  }
  ## memb_dim & cat_dim
  if (is.null(memb_dim) + is.null(cat_dim) != 1) {
    stop("Only one of the two parameters 'memb_dim' and 'cat_dim' can have value.")
  }
  ## memb_dim
  if (!is.null(memb_dim)) {
    if (!is.character(memb_dim) | length(memb_dim) > 1) {
      stop("Parameter 'memb_dim' must be a character string.")
    }
    if (!memb_dim %in% names(dim(exp))) {
      stop("Parameter 'memb_dim' is not found in 'exp' dimension.")
    }
  }
  ## cat_dim
  if (!is.null(cat_dim)) {
    if (!is.character(cat_dim) | length(cat_dim) > 1) {
      stop("Parameter 'cat_dim' must be a character string.")
    }
    if (!cat_dim %in% names(dim(exp)) | !cat_dim %in% names(dim(obs))) {
      stop("Parameter 'cat_dim' is not found in 'exp' or 'obs' dimension.")
    }
  }
  ## dat_dim
  if (!is.null(dat_dim)) {
    if (!is.character(dat_dim) | length(dat_dim) > 1) {
      stop("Parameter 'dat_dim' must be a character string.")
    }
    if (!dat_dim %in% names(dim(exp)) | !dat_dim %in% names(dim(obs))) {
      stop("Parameter 'dat_dim' is not found in 'exp' or 'obs' dimension.",
           " Set it as NULL if there is no dataset dimension.")
    }
  }
  ## exp and obs (2)
  name_exp <- sort(names(dim(exp)))
  name_obs <- sort(names(dim(obs)))
  if (!is.null(memb_dim)) {
    name_exp <- name_exp[-which(name_exp == memb_dim)]
    if (memb_dim %in% name_obs) {
      name_obs <- name_obs[-which(name_obs == memb_dim)]
    }
  }
  if (!is.null(dat_dim)) {
    name_exp <- name_exp[-which(name_exp == dat_dim)]
    name_obs <- name_obs[-which(name_obs == dat_dim)]
  }
  if (!identical(length(name_exp), length(name_obs)) |
      !identical(dim(exp)[name_exp], dim(obs)[name_obs])) {
    stop("Parameter 'exp' and 'obs' must have same length of ",
         "all dimensions except 'memb_dim' and 'dat_dim'.")
  }
  ## prob_thresholds
  if (!is.numeric(prob_thresholds) | !is.vector(prob_thresholds) |
      any(prob_thresholds <= 0) | any(prob_thresholds >= 1)) {
    stop("Parameter 'prob_thresholds' must be a numeric vector between 0 and 1.")
  }
  ## indices_for_clim
  if (is.null(indices_for_clim)) {
    indices_for_clim <- seq_len(dim(obs)[time_dim])
  } else {
    if (!is.numeric(indices_for_clim) | !is.vector(indices_for_clim)) {
      stop("Parameter 'indices_for_clim' must be NULL or a numeric vector.")
    } else if (length(indices_for_clim) > dim(obs)[time_dim] |
               max(indices_for_clim) > dim(obs)[time_dim] |
               any(indices_for_clim) < 1) {
      stop("Parameter 'indices_for_clim' should be the indices of 'time_dim'.")
    }
  }
  ## Fair
  if (!is.logical(Fair) | length(Fair) > 1) {
    stop("Parameter 'Fair' must be either TRUE or FALSE.")
  }
  if (Fair) {
    if (!is.null(cat_dim)) {
      if (cat_dim %in% names(dim(exp))) {
        if (is.null(nmemb)) {
  	  stop("Parameter 'nmemb' necessary to compute Fair",
	       "score from probabilities")
        }
      }
    }
  }
  ## return_mean
  if (!is.logical(return_mean) | length(return_mean) > 1) {
    stop("Parameter 'return_mean' must be either TRUE or FALSE.")
  }
  ## cross.val
  if (!is.logical(cross.val)  | length(cross.val) > 1) {
    stop("Parameter 'cross.val' must be either TRUE or FALSE.")
  }
  ## weights
  if (!is.null(weights) & is.null(cat_dim)) {
    if (!is.array(weights) | !is.numeric(weights))
      stop("Parameter 'weights' must be a named numeric array.")
    if (is.null(dat_dim)) {
      if (length(dim(weights)) != 2 | !all(names(dim(weights)) %in% c(memb_dim, time_dim)))
        stop("Parameter 'weights' must have two dimensions with the names of ",
             "'memb_dim' and 'time_dim'.")
      if (dim(weights)[memb_dim] != dim(exp)[memb_dim] |
          dim(weights)[time_dim] != dim(exp)[time_dim]) {
        stop("Parameter 'weights' must have the same dimension lengths ", 
             "as 'memb_dim' and 'time_dim' in 'exp'.")
      }
      weights <- Reorder(weights, c(time_dim, memb_dim))
      
    } else {
      if (length(dim(weights)) != 3 | !all(names(dim(weights)) %in% c(memb_dim, time_dim, dat_dim)))
        stop("Parameter 'weights' must have three dimensions with the names of ",
             "'memb_dim', 'time_dim' and 'dat_dim'.")
      if (dim(weights)[memb_dim] != dim(exp)[memb_dim] |
          dim(weights)[time_dim] != dim(exp)[time_dim] |
          dim(weights)[dat_dim] != dim(exp)[dat_dim]) {
        stop("Parameter 'weights' must have the same dimension lengths ", 
             "as 'memb_dim', 'time_dim' and 'dat_dim' in 'exp'.")
      }
      weights <- Reorder(weights, c(time_dim, memb_dim, dat_dim))
      
    }
  } else if (!is.null(weights) & !is.null(cat_dim)) {
    .warning(paste0("Parameter 'exp' and 'obs' are probabilities already, so parameter ",
                    "'weights' is not used. Change 'weights' to NULL."))
    weights <- NULL
  }
  ## na.rm
  if (!isTRUE(na.rm) & !isFALSE(na.rm) & !(is.numeric(na.rm) & na.rm >= 0 & na.rm <= 1)) {
    stop('"na.rm" should be TRUE, FALSE or a numeric between 0 and 1')
  }
  ## ncores
  if (!is.null(ncores)) {
    if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 |
        length(ncores) > 1) {
      stop("Parameter 'ncores' must be either NULL or a positive integer.")
    }
  }
  
  ###############################
  
  # Compute RPS
  
  ## Decide target_dims
  if (!is.null(memb_dim)) {
    target_dims_exp <- c(time_dim, memb_dim, dat_dim)
    if (!memb_dim %in% names(dim(obs))) {
      target_dims_obs <- c(time_dim, dat_dim)
    } else {
      target_dims_obs <- c(time_dim, memb_dim, dat_dim)
    }
  } else {  # cat_dim
    target_dims_exp <- target_dims_obs <- c(time_dim, cat_dim, dat_dim)
  }
  
  rps <- Apply(data = list(exp = exp, obs = obs), 
               target_dims = list(exp = target_dims_exp, 
                                  obs = target_dims_obs),
               fun = .RPS,
               dat_dim = dat_dim, time_dim = time_dim,
               memb_dim = memb_dim, cat_dim = cat_dim,
               prob_thresholds = prob_thresholds, nmemb = nmemb,
               indices_for_clim = indices_for_clim, Fair = Fair,
               weights = weights, cross.val = cross.val, 
               na.rm = na.rm, ncores = ncores)$output1
  
  if (return_mean) {
    rps <- MeanDims(rps, time_dim, na.rm = TRUE)
  } else {
    rps <- rps
  }
  
  return(rps)
}


.RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NULL,
                 dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, 
                 Fair = FALSE, nmemb = NULL, weights = NULL,
		 cross.val = FALSE, na.rm = FALSE) {
  #--- if memb_dim: 
  # exp: [sdate, memb, (dat)]
  # obs: [sdate, (memb), (dat)]
  # weights: NULL or same as exp
  #--- if cat_dim:
  # exp: [sdate, bin, (dat)]
  # obs: [sdate, bin, (dat)]
  
  # Adjust dimensions to be [sdate, memb, dat] for both exp and obs
  if (!is.null(memb_dim)) {
    if (!memb_dim %in% names(dim(obs))) {
      obs <- InsertDim(obs, posdim = 2, lendim = 1, name = memb_dim)
    }
  }
  
  if (is.null(dat_dim)) {
    nexp <- 1
    nobs <- 1
    dim(exp) <- c(dim(exp), nexp = nexp)
    dim(obs) <- c(dim(obs), nobs = nobs)
    if (!is.null(weights)) dim(weights) <- c(dim(weights), nexp = nexp)
  } else {
    nexp <- as.numeric(dim(exp)[dat_dim])
    nobs <- as.numeric(dim(obs)[dat_dim])
  }
  
  rps <- array(dim = c(dim(exp)[time_dim], nexp = nexp, nobs = nobs))
  
  for (i in 1:nexp) {
    for (j in 1:nobs) {
      exp_data <- exp[, , i]
      obs_data <- obs[, , j]
      
      if (is.null(dim(exp_data))) dim(exp_data) <- c(dim(exp)[1:2])
      if (is.null(dim(obs_data))) dim(obs_data) <- c(dim(obs)[1:2])
      
      # Find the fraction of NAs
      ## If any member/bin is NA at this time step, it is not good value.
      exp_mean <- rowMeans(exp_data)
      obs_mean <- rowMeans(obs_data)
      good_values <- !is.na(exp_mean) & !is.na(obs_mean)
      
      if (isTRUE(na.rm)) {
        f_NAs <- 0
      } else if (isFALSE(na.rm)) {
        f_NAs <- 1
      } else {
        f_NAs <- na.rm
      }
      
      if (f_NAs <= sum(good_values) / length(obs_mean)) {
        
        exp_data <- exp_data[good_values, , drop = F]
        obs_data <- obs_data[good_values, , drop = F]
        
        # If the data inputs are forecast/observation, calculate probabilities
        if (is.null(cat_dim)) {
          if (!is.null(weights)) {
            weights_data <- weights[which(good_values), , i]
            if (is.null(dim(weights_data))) dim(weights_data) <- c(dim(weights)[1:2])
          } else {
            weights_data <- weights #NULL
          }
          
          # Subset indices_for_clim
          dum <- match(indices_for_clim, which(good_values))
          good_indices_for_clim <- dum[!is.na(dum)]
          
          exp_probs <- .GetProbs(data = exp_data, indices_for_quantiles = good_indices_for_clim, 
                                 prob_thresholds = prob_thresholds, weights = weights_data, 
                                 cross.val = cross.val)
          # exp_probs: [bin, sdate]
          obs_probs <- .GetProbs(data = obs_data, indices_for_quantiles = good_indices_for_clim, 
                                 prob_thresholds = prob_thresholds, weights = NULL, 
                                 cross.val = cross.val)
          # obs_probs: [bin, sdate]
          
        } else { # inputs are probabilities already
	  if (all(names(dim(exp_data)) == c(time_dim, memb_dim)) || 
	      all(names(dim(exp_data)) == c(time_dim, cat_dim))) {
            exp_probs <- t(exp_data)
            obs_probs <- t(obs_data)
	  } 
        }
        
        probs_exp_cumsum <- apply(exp_probs, 2, cumsum)
        probs_obs_cumsum <- apply(obs_probs, 2, cumsum)
        
        # rps: [sdate, nexp, nobs]
        rps [good_values, i, j] <- colSums((probs_exp_cumsum - probs_obs_cumsum)^2)
        if (Fair) { # FairRPS
          if (!is.null(memb_dim)) {
   	    if (memb_dim %in% names(dim(exp))) {
              ## adjustment <- rowSums(-1 * (1/R - 1/R.new) * ens.cum * (R - ens.cum)/R/(R - 1))
              ## [formula taken from SpecsVerification::EnsRps]
              R <- dim(exp)[2]  #memb
            }
	  } else {
	    R <- nmemb
	  }
	  warning("Applying fair correction.")
          adjustment <- (-1) / (R - 1) * probs_exp_cumsum * (1 - probs_exp_cumsum)
          adjustment <- colSums(adjustment)
          rps[, i, j] <- rps[, i, j] + adjustment
        }
        
      } else { ## not enough values different from NA
        
        rps[, i, j] <- NA_real_
        
      }
      
    }
  }
  
  if (is.null(dat_dim)) {
    dim(rps) <- dim(exp)[time_dim]
  }
  
  return(rps)
}