Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
#'Compute the Ranked Probability Score
#'
#'The Ranked Probability Score (RPS; Wilks, 2011) is defined as the sum of the
#'squared differences between the cumulative forecast probabilities (computed
#'from the ensemble members) and the observations (defined as 0% if the category
#'did not happen and 100% if it happened). It can be used to evaluate the skill
#'of multi-categorical probabilistic forecasts. The RPS ranges between 0
#'(perfect forecast) and n-1 (worst possible forecast), where n is the number of
#'categories. In the case of a forecast divided into two categories (the lowest
#'number of categories that a probabilistic forecast can have), the RPS
#'corresponds to the Brier Score (BS; Wilks, 2011), therefore ranging between 0
#'and 1.\cr
#'The function first calculates the probabilities for forecasts and observations,
#'then use them to calculate RPS. Or, the probabilities of exp and obs can be
#'provided directly to compute the score. If there is more than one dataset, RPS
#'will be computed for each pair of exp and obs data. The fraction of acceptable
#'NAs can be adjusted.
#'
#'@param exp A named numerical array of either the forecasts with at least time
#' and member dimensions, or the probabilities with at least time and category
#' dimensions. The probabilities can be generated by \code{s2dv::GetProbs}.
#'@param obs A named numerical array of either the observation with at least
#' time dimension, or the probabilities with at least time and category
#' dimensions. The probabilities can be generated by \code{s2dv::GetProbs}. The
#' dimensions must be the same as 'exp' except 'memb_dim' and 'dat_dim'.
#'@param time_dim A character string indicating the name of the time dimension.
#' The default value is 'sdate'.
#'@param memb_dim A character string indicating the name of the member dimension
#' to compute the probabilities of the forecast. The default value is 'member'.
#' If the data are probabilities, set memb_dim as NULL.
#'@param cat_dim A character string indicating the name of the category
#' dimension that is needed when the exp and obs are probabilities. The default
#' value is NULL, which means that the data are not probabilities.
#'@param dat_dim A character string indicating the name of dataset dimension.
#' The length of this dimension can be different between 'exp' and 'obs'.
#' The default value is NULL.
#'@param prob_thresholds A numeric vector of the relative thresholds (from 0 to
#' 1) between the categories. The default value is c(1/3, 2/3), which
#' corresponds to tercile equiprobable categories.
#'@param indices_for_clim A vector of the indices to be taken along 'time_dim'
#' for computing the thresholds between the probabilistic categories. If NULL,
#' the whole period is used. The default value is NULL.
#'@param Fair A logical indicating whether to compute the FairRPS (the
#' potential RPS that the forecast would have with an infinite ensemble size).
#' The default value is FALSE.
#'@param nmemb A numeric value indicating the number of members used to compute the probabilities. This parameter is necessary when calculating FairRPS from probabilities. Default is NULL.
#'@param weights A named numerical array of the weights for 'exp' probability
#' calculation. If 'dat_dim' is NULL, the dimensions should include 'memb_dim'
#' and 'time_dim'. Else, the dimension should also include 'dat_dim'. The
#' default value is NULL. The ensemble should have at least 70 members or span
#' at least 10 time steps and have more than 45 members if consistency between
#' the weighted and unweighted methodologies is desired.
#'@param cross.val A logical indicating whether to compute the thresholds
#' between probabilistic categories in cross-validation. The default value is
#' FALSE.
#'@param return_mean A logical indicating whether to return the temporal mean
#' of the RPS or not. If TRUE, the temporal mean is calculated along time_dim,
#' if FALSE the time dimension is not aggregated. The default is TRUE.
#'@param na.rm A logical or numeric value between 0 and 1. If it is numeric, it
#' means the lower limit for the fraction of the non-NA values. 1 is equal to
#' FALSE (no NA is acceptable), 0 is equal to TRUE (all NAs are acceptable).
# The function returns NA if the fraction of non-NA values in the data is less
#' than na.rm. Otherwise, RPS will be calculated. The default value is FALSE.
#'@param ncores An integer indicating the number of cores to use for parallel
#' computation. The default value is NULL.
#'
#'@return
#'A numerical array of RPS with dimensions c(nexp, nobs, the rest dimensions of
#''exp' except 'time_dim' and 'memb_dim' dimensions). nexp is the number of
#'experiment (i.e., dat_dim in exp), and nobs is the number of observation
#'(i.e., dat_dim in obs). If dat_dim is NULL, nexp and nobs are omitted.
#'
#'@references
#'Wilks, 2011; https://doi.org/10.1016/B978-0-12-385022-5.00008-7
#'
#'@examples
#'# Use synthetic data
#'exp <- array(rnorm(1000), dim = c(lat = 3, lon = 2, member = 10, sdate = 50))
#'obs <- array(rnorm(1000), dim = c(lat = 3, lon = 2, sdate = 50))
#'res <- RPS(exp = exp, obs = obs)
#'# Use probabilities as inputs
#'exp_probs <- GetProbs(exp, time_dim = 'sdate', memb_dim = 'member')
#'obs_probs <- GetProbs(obs, time_dim = 'sdate', memb_dim = NULL)
#'res2 <- RPS(exp = exp_probs, obs = obs_probs, memb_dim = NULL, cat_dim = 'bin')
#'
#'
#'@import multiApply
#'@importFrom easyVerification convert2prob
#'@export
RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NULL,
dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL,
Fair = FALSE, nmemb = NULL, weights = NULL,
cross.val = FALSE, return_mean = TRUE,
na.rm = FALSE, ncores = NULL) {
# Check inputs
## exp and obs (1)
if (!is.array(exp) | !is.numeric(exp))
stop('Parameter "exp" must be a numeric array.')
if (!is.array(obs) | !is.numeric(obs))
stop('Parameter "obs" must be a numeric array.')
if (any(is.null(names(dim(exp)))) | any(nchar(names(dim(exp))) == 0) |
any(is.null(names(dim(obs)))) | any(nchar(names(dim(obs))) == 0)) {
stop("Parameter 'exp' and 'obs' must have dimension names.")
}
## time_dim
if (!is.character(time_dim) | length(time_dim) != 1)
stop('Parameter "time_dim" must be a character string.')
if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) {
stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.")
}
## memb_dim & cat_dim
if (is.null(memb_dim) + is.null(cat_dim) != 1) {
stop("Only one of the two parameters 'memb_dim' and 'cat_dim' can have value.")
}
## memb_dim
if (!is.null(memb_dim)) {
if (!is.character(memb_dim) | length(memb_dim) > 1) {
stop("Parameter 'memb_dim' must be a character string.")
}
if (!memb_dim %in% names(dim(exp))) {
stop("Parameter 'memb_dim' is not found in 'exp' dimension.")
}
}
## cat_dim
if (!is.null(cat_dim)) {
if (!is.character(cat_dim) | length(cat_dim) > 1) {
stop("Parameter 'cat_dim' must be a character string.")
}
if (!cat_dim %in% names(dim(exp)) | !cat_dim %in% names(dim(obs))) {
stop("Parameter 'cat_dim' is not found in 'exp' or 'obs' dimension.")
}
}
## dat_dim
if (!is.null(dat_dim)) {
if (!is.character(dat_dim) | length(dat_dim) > 1) {
stop("Parameter 'dat_dim' must be a character string.")
}
if (!dat_dim %in% names(dim(exp)) | !dat_dim %in% names(dim(obs))) {
stop("Parameter 'dat_dim' is not found in 'exp' or 'obs' dimension.",
" Set it as NULL if there is no dataset dimension.")
}
}
## exp and obs (2)
name_exp <- sort(names(dim(exp)))
name_obs <- sort(names(dim(obs)))
if (!is.null(memb_dim)) {
name_exp <- name_exp[-which(name_exp == memb_dim)]
if (memb_dim %in% name_obs) {
name_obs <- name_obs[-which(name_obs == memb_dim)]
}
}
if (!is.null(dat_dim)) {
name_exp <- name_exp[-which(name_exp == dat_dim)]
name_obs <- name_obs[-which(name_obs == dat_dim)]
}
if (!identical(length(name_exp), length(name_obs)) |
!identical(dim(exp)[name_exp], dim(obs)[name_obs])) {
stop("Parameter 'exp' and 'obs' must have same length of ",
"all dimensions except 'memb_dim' and 'dat_dim'.")
}
## prob_thresholds
if (!is.numeric(prob_thresholds) | !is.vector(prob_thresholds) |
any(prob_thresholds <= 0) | any(prob_thresholds >= 1)) {
stop("Parameter 'prob_thresholds' must be a numeric vector between 0 and 1.")
}
## indices_for_clim
if (is.null(indices_for_clim)) {
indices_for_clim <- seq_len(dim(obs)[time_dim])
} else {
if (!is.numeric(indices_for_clim) | !is.vector(indices_for_clim)) {
stop("Parameter 'indices_for_clim' must be NULL or a numeric vector.")
} else if (length(indices_for_clim) > dim(obs)[time_dim] |
max(indices_for_clim) > dim(obs)[time_dim] |
any(indices_for_clim) < 1) {
stop("Parameter 'indices_for_clim' should be the indices of 'time_dim'.")
}
}
## Fair
if (!is.logical(Fair) | length(Fair) > 1) {
stop("Parameter 'Fair' must be either TRUE or FALSE.")
}
if (Fair) {
if (!is.null(cat_dim)) {
if (cat_dim %in% names(dim(exp))) {
if (is.null(nmemb)) {
stop("Parameter 'nmemb' necessary to compute Fair",
"score from probabilities")
}
}
}
}
## return_mean
if (!is.logical(return_mean) | length(return_mean) > 1) {
stop("Parameter 'return_mean' must be either TRUE or FALSE.")
}
## cross.val
if (!is.logical(cross.val) | length(cross.val) > 1) {
stop("Parameter 'cross.val' must be either TRUE or FALSE.")
}
## weights
if (!is.null(weights) & is.null(cat_dim)) {
if (!is.array(weights) | !is.numeric(weights))
stop("Parameter 'weights' must be a named numeric array.")
if (is.null(dat_dim)) {
if (length(dim(weights)) != 2 | !all(names(dim(weights)) %in% c(memb_dim, time_dim)))
stop("Parameter 'weights' must have two dimensions with the names of ",
"'memb_dim' and 'time_dim'.")
if (dim(weights)[memb_dim] != dim(exp)[memb_dim] |
dim(weights)[time_dim] != dim(exp)[time_dim]) {
stop("Parameter 'weights' must have the same dimension lengths ",
"as 'memb_dim' and 'time_dim' in 'exp'.")
}
weights <- Reorder(weights, c(time_dim, memb_dim))
} else {
if (length(dim(weights)) != 3 | !all(names(dim(weights)) %in% c(memb_dim, time_dim, dat_dim)))
stop("Parameter 'weights' must have three dimensions with the names of ",
"'memb_dim', 'time_dim' and 'dat_dim'.")
if (dim(weights)[memb_dim] != dim(exp)[memb_dim] |
dim(weights)[time_dim] != dim(exp)[time_dim] |
dim(weights)[dat_dim] != dim(exp)[dat_dim]) {
stop("Parameter 'weights' must have the same dimension lengths ",
"as 'memb_dim', 'time_dim' and 'dat_dim' in 'exp'.")
}
weights <- Reorder(weights, c(time_dim, memb_dim, dat_dim))
}
} else if (!is.null(weights) & !is.null(cat_dim)) {
.warning(paste0("Parameter 'exp' and 'obs' are probabilities already, so parameter ",
"'weights' is not used. Change 'weights' to NULL."))
weights <- NULL
}
## na.rm
if (!isTRUE(na.rm) & !isFALSE(na.rm) & !(is.numeric(na.rm) & na.rm >= 0 & na.rm <= 1)) {
stop('"na.rm" should be TRUE, FALSE or a numeric between 0 and 1')
}
## ncores
if (!is.null(ncores)) {
if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 |
length(ncores) > 1) {
stop("Parameter 'ncores' must be either NULL or a positive integer.")
}
}
###############################
# Compute RPS
## Decide target_dims
if (!is.null(memb_dim)) {
target_dims_exp <- c(time_dim, memb_dim, dat_dim)
if (!memb_dim %in% names(dim(obs))) {
target_dims_obs <- c(time_dim, dat_dim)
} else {
target_dims_obs <- c(time_dim, memb_dim, dat_dim)
}
} else { # cat_dim
target_dims_exp <- target_dims_obs <- c(time_dim, cat_dim, dat_dim)
}
rps <- Apply(data = list(exp = exp, obs = obs),
target_dims = list(exp = target_dims_exp,
obs = target_dims_obs),
fun = .RPS,
dat_dim = dat_dim, time_dim = time_dim,
memb_dim = memb_dim, cat_dim = cat_dim,
prob_thresholds = prob_thresholds, nmemb = nmemb,
indices_for_clim = indices_for_clim, Fair = Fair,
weights = weights, cross.val = cross.val,
na.rm = na.rm, ncores = ncores)$output1
if (return_mean) {
rps <- MeanDims(rps, time_dim, na.rm = TRUE)
} else {
rps <- rps
}
return(rps)
}
.RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', cat_dim = NULL,
dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL,
Fair = FALSE, nmemb = NULL, weights = NULL,
cross.val = FALSE, na.rm = FALSE) {
#--- if memb_dim:
# exp: [sdate, memb, (dat)]
# obs: [sdate, (memb), (dat)]
# weights: NULL or same as exp
#--- if cat_dim:
# exp: [sdate, bin, (dat)]
# obs: [sdate, bin, (dat)]
# Adjust dimensions to be [sdate, memb, dat] for both exp and obs
if (!is.null(memb_dim)) {
if (!memb_dim %in% names(dim(obs))) {
obs <- InsertDim(obs, posdim = 2, lendim = 1, name = memb_dim)
}
}
if (is.null(dat_dim)) {
nexp <- 1
nobs <- 1
dim(exp) <- c(dim(exp), nexp = nexp)
dim(obs) <- c(dim(obs), nobs = nobs)
if (!is.null(weights)) dim(weights) <- c(dim(weights), nexp = nexp)
} else {
nexp <- as.numeric(dim(exp)[dat_dim])
nobs <- as.numeric(dim(obs)[dat_dim])
}
rps <- array(dim = c(dim(exp)[time_dim], nexp = nexp, nobs = nobs))
for (i in 1:nexp) {
for (j in 1:nobs) {
exp_data <- exp[, , i]
obs_data <- obs[, , j]
if (is.null(dim(exp_data))) dim(exp_data) <- c(dim(exp)[1:2])
if (is.null(dim(obs_data))) dim(obs_data) <- c(dim(obs)[1:2])
# Find the fraction of NAs
## If any member/bin is NA at this time step, it is not good value.
exp_mean <- rowMeans(exp_data)
obs_mean <- rowMeans(obs_data)
good_values <- !is.na(exp_mean) & !is.na(obs_mean)
if (isTRUE(na.rm)) {
f_NAs <- 0
} else if (isFALSE(na.rm)) {
f_NAs <- 1
} else {
f_NAs <- na.rm
}
if (f_NAs <= sum(good_values) / length(obs_mean)) {
exp_data <- exp_data[good_values, , drop = F]
obs_data <- obs_data[good_values, , drop = F]
# If the data inputs are forecast/observation, calculate probabilities
if (is.null(cat_dim)) {
if (!is.null(weights)) {
weights_data <- weights[which(good_values), , i]
if (is.null(dim(weights_data))) dim(weights_data) <- c(dim(weights)[1:2])
} else {
weights_data <- weights #NULL
}
# Subset indices_for_clim
dum <- match(indices_for_clim, which(good_values))
good_indices_for_clim <- dum[!is.na(dum)]
exp_probs <- .GetProbs(data = exp_data, indices_for_quantiles = good_indices_for_clim,
prob_thresholds = prob_thresholds, weights = weights_data,
cross.val = cross.val)
# exp_probs: [bin, sdate]
obs_probs <- .GetProbs(data = obs_data, indices_for_quantiles = good_indices_for_clim,
prob_thresholds = prob_thresholds, weights = NULL,
cross.val = cross.val)
# obs_probs: [bin, sdate]
} else { # inputs are probabilities already
if (all(names(dim(exp_data)) == c(time_dim, memb_dim)) ||
all(names(dim(exp_data)) == c(time_dim, cat_dim))) {
exp_probs <- t(exp_data)
obs_probs <- t(obs_data)
}
}
probs_exp_cumsum <- apply(exp_probs, 2, cumsum)
probs_obs_cumsum <- apply(obs_probs, 2, cumsum)
# rps: [sdate, nexp, nobs]
rps [good_values, i, j] <- colSums((probs_exp_cumsum - probs_obs_cumsum)^2)
if (Fair) { # FairRPS
if (!is.null(memb_dim)) {
if (memb_dim %in% names(dim(exp))) {
## adjustment <- rowSums(-1 * (1/R - 1/R.new) * ens.cum * (R - ens.cum)/R/(R - 1))
## [formula taken from SpecsVerification::EnsRps]
R <- dim(exp)[2] #memb
}
} else {
R <- nmemb
}
warning("Applying fair correction.")
adjustment <- (-1) / (R - 1) * probs_exp_cumsum * (1 - probs_exp_cumsum)
adjustment <- colSums(adjustment)
rps[, i, j] <- rps[, i, j] + adjustment
}
} else { ## not enough values different from NA
rps[, i, j] <- NA_real_
}
}
}
if (is.null(dat_dim)) {
dim(rps) <- dim(exp)[time_dim]
}
return(rps)
}