From 2c9ca69f6859d7cc77fb5035f346733451b3c1c2 Mon Sep 17 00:00:00 2001
From: jcos
Date: Fri, 20 May 2022 15:59:28 +0200
Subject: [PATCH 1/5] basic workflow

---
 R/RPS.R | 84 ++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 71 insertions(+), 13 deletions(-)

diff --git a/R/RPS.R b/R/RPS.R
index 84956ac..a5d54d7 100644
--- a/R/RPS.R
+++ b/R/RPS.R
@@ -48,7 +48,7 @@
 #'@export
 RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
                 prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, Fair = FALSE,
-                ncores = NULL) {
+                ncores = NULL, weights = NULL) {
 
   # Check inputs
   ## exp and obs (1)
@@ -130,8 +130,8 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
                fun = .RPS,
                prob_thresholds = prob_thresholds,
                indices_for_clim = indices_for_clim, Fair = Fair,
-               ncores = ncores)$output1
-
+               ncores = ncores, weights = weights)$output1
+  browser()
   # Return only the mean RPS
   rps <- MeanDims(rps, time_dim, na.rm = FALSE)
 
@@ -140,12 +140,10 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
 
 .RPS <- function(exp, obs, prob_thresholds = c(1/3, 2/3),
-                 indices_for_clim = NULL, Fair = FALSE) {
+                 indices_for_clim = NULL, Fair = FALSE, weights = NULL) {
   # exp: [sdate, memb]
   # obs: [sdate, (memb)]
-
-  exp_probs <- .get_probs(data = exp, indices_for_quantiles = indices_for_clim,
-                          prob_thresholds = prob_thresholds)
+  exp_probs <- .get_probs(data = exp, indices_for_quantiles = indices_for_clim, prob_thresholds = prob_thresholds, weights = weights)
   # exp_probs: [bin, sdate]
   obs_probs <- .get_probs(data = obs, indices_for_quantiles = indices_for_clim,
                           prob_thresholds = prob_thresholds)
@@ -168,7 +166,7 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   return(rps)
 }
 
-.get_probs <- function(data, indices_for_quantiles, prob_thresholds) {
+.get_probs <- function(data, indices_for_quantiles, prob_thresholds, weights = NULL) {
   # if exp: [sdate, memb]
   # if obs: [sdate, (memb)]
 
@@ -176,18 +174,78 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   if (length(dim(data)) == 1) dim(data) <- c(dim(data), 1)
 
   # Absolute thresholds
-  quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE)
-
+  if (is.null(weights)){
+    quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE)
+  } else {
+    # linear approach to compute weighted distribution thresholds.
+    weights <- s2dv::Reorder(data = weights, order = names(dim(data))) # move to check or in .rps *******
+    sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], weights[indices_for_quantiles, ])
+    sorted_data <- sorted_arrays$data
+    sorted_weights <- sorted_arrays$weights
+    # approach 1 (interpolate from discrete cumulative weights distribution. NOT type 8)
+    # function vvvvvvvvvvvvvvvvvvv
+    # cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
+    # cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
+    # cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
+    # # function ^^^^^^^^^^^^^^^^^^^
+    # quantiles <- interp1(cumulative_weights, sorted_data, probs, "cubic") # library(pracma)
+    # approach 2 (interpolate from the polynomial fit of the cumulative weights distribution)
+    order <- 5
+    fit_model <- lm(sorted_data ~ poly(cumsum(sorted_weights), order, raw=TRUE)) # order 10, but we should think what order we want to use
+    quantiles <- prob_thresholds*0 # make a list of zeros the length of prob_thresholds
+    for (i in 1:(order+1)) {quantiles <- quantiles+fit_model$coefficients[i]*prob_thresholds^(i-1)}
+  }
 
   # Probabilities
   probs <- array(dim = c(bin = length(quantiles) + 1, dim(data)[1])) # [bin, sdate]
   for (i_time in 1:dim(data)[1]) {
-    if (anyNA(data[i_time, ])) {
-      probs[, i_time] <- rep(NA, dim = length(quantiles) + 1)
+    if (is.null(weights)){
+      if (anyNA(data[i_time, ])) {
+        probs[, i_time] <- rep(NA, dim = length(quantiles) + 1)
+      } else {
+        probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
+      }
     } else {
-      probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
+      sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
+      sorted_data <- sorted_arrays$data
+      sorted_weights <- sorted_arrays$weights
+      # approach 1. interpolate
+      cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
+      cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
+      cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
+      # Solve issues for quantiles outside the sorted_data range:
+      xtra_length <- 10
+      if (min(quantiles) < min(sorted_data)){
+        xtra <- seq(min(quantiles), min(sorted_data), length.out=xtra_length)
+        sorted_data <- c(xtra, sorted_data)
+        cumulative_weights <- c(numeric(xtra_length), cumulative_weights)
+      }
+      if (max(quantiles) > max(sorted_data)){
+        xtra <- seq(max(sorted_data), max(quantiles), length.out=xtra_length)
+        sorted_data <- c(sorted_data,xtra)
+        cumulative_weights <- c(cumulative_weights,numeric(xtra_length))
+      }
+      integrated_probs <- interp1(sorted_data, cumulative_weights, quantiles, "cubic") # library(pracma)
+      probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
+      if (min(probs[, i_time])<0) {print(paste0("ERROR negative probability in i_time=",i_time))} ######################## checks
+      if (max(probs[, i_time])>1) {print(paste0("ERROR >1 probability in i_time=",i_time))} ######################## checks
+
+      # approach 2. might give erroneous results for quantiles that fall outside the data range. Try normal distro
+      # order <- 5
+      # fit_model <- lm(cumsum(sorted_weights) ~ poly(sorted_data, order, raw=TRUE)) # order 10, but we should think what order we want to use
+      # integrated_probs <- quantiles*0
+      # for (i in 1:(order+1)) {integrated_probs <- integrated_probs+fit_model$coefficients[i]*quantiles^(i-1)}
+      # probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
     }
+
   }
   return(probs)
 }
+
+.sorted_distributions <- function(data_vector, weights_vector){
+  weights_vector <- as.vector(weights_vector)
+  data_vector <- as.vector(data_vector)
+  weights_vector <- weights_vector / sum(weights_vector) # normalize to 1
+  sorter <- order(data_vector)
+  return(list("data" = data_vector[sorter], "weights" = weights_vector[sorter]))
+}
\ No newline at end of file
-- 
GitLab
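
Stepping back from the diff: the weighted branch above turns the ensemble into an empirical CDF by sorting the members, assigning each one a midpoint cumulative weight, and pinning that curve to [0, 1]; the thresholds are then read off the curve. A minimal standalone sketch of the construction (the name weighted_quantiles and the toy data are illustrative, not package code; plain linear interpolation stands in for the interp1() and polynomial variants tried in the commit):

# Midpoint cumulative weights, rescaled to [0, 1], then inverted at the
# requested probabilities to obtain the weighted thresholds.
weighted_quantiles <- function(x, w, probs) {
  sorter <- order(x)
  x <- x[sorter]
  w <- w[sorter] / sum(w)                        # normalize weights to 1
  cw <- cumsum(w) - 0.5 * w                      # midpoint cumulative weights
  cw <- (cw - cw[1]) / (cw[length(cw)] - cw[1])  # pin the curve to 0 and 1
  approx(cw, x, probs, method = "linear")$y      # invert the weighted CDF
}

set.seed(1)
members <- rnorm(50)
w <- abs(rnorm(50))
weighted_quantiles(members, w, probs = c(1/3, 2/3))

With equal weights the midpoint spacing reduces to ordinary plotting positions, so the result approaches quantile(members, c(1/3, 2/3)) as the ensemble grows.
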
From 7536e289fd004435716c600611bafa05273ae37b Mon Sep 17 00:00:00 2001
From: jcos
Date: Thu, 26 May 2022 12:34:59 +0200
Subject: [PATCH 2/5] switch to approx() and use it in the quantiles and probs

---
 R/RPS.R | 105 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 56 insertions(+), 49 deletions(-)

diff --git a/R/RPS.R b/R/RPS.R
index a5d54d7..e323936 100644
--- a/R/RPS.R
+++ b/R/RPS.R
@@ -131,7 +131,7 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
                prob_thresholds = prob_thresholds,
                indices_for_clim = indices_for_clim, Fair = Fair,
                ncores = ncores, weights = weights)$output1
-  browser()
+  # browser()
   # Return only the mean RPS
   rps <- MeanDims(rps, time_dim, na.rm = FALSE)
 
@@ -178,67 +178,70 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
     quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE)
   } else {
     # linear approach to compute weighted distribution thresholds.
+    # browser()
+    #weights<-as.array(weights)
     weights <- s2dv::Reorder(data = weights, order = names(dim(data))) # move to check or in .rps *******
+    # is there a better way of doing this: vvvvvvvvvv
     sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], weights[indices_for_quantiles, ])
     sorted_data <- sorted_arrays$data
-    sorted_weights <- sorted_arrays$weights
-    # approach 1 (interpolate from discrete cumulative weights distribution. NOT type 8)
-    # function vvvvvvvvvvvvvvvvvvv
-    # cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
-    # cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
-    # cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
-    # # function ^^^^^^^^^^^^^^^^^^^
-    # quantiles <- interp1(cumulative_weights, sorted_data, probs, "cubic") # library(pracma)
+    cumulative_weights <- sorted_arrays$cumulative_weights
+    quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y # library(pracma)
+
     # approach 2 (interpolate from the polynomial fit of the cumulative weights distribution)
-    order <- 5
-    fit_model <- lm(sorted_data ~ poly(cumsum(sorted_weights), order, raw=TRUE)) # order 10, but we should think what order we want to use
-    quantiles <- prob_thresholds*0 # make a list of zeros the length of prob_thresholds
-    for (i in 1:(order+1)) {quantiles <- quantiles+fit_model$coefficients[i]*prob_thresholds^(i-1)}
+    # order <- 5 # should it be fixed? should it be provided? should it be computed according to the number of points available?
+    # fit_model <- lm(sorted_data ~ poly(cumsum(sorted_weights), order, raw=TRUE))
+    # quantiles <- numeric(length(prob_thresholds)) # make a list of zeros the length of prob_thresholds
+    # for (i in 1:(order+1)) {quantiles <- quantiles+fit_model$coefficients[i]*prob_thresholds^(i-1)}
   }
 
   # Probabilities
+  # browser()
   probs <- array(dim = c(bin = length(quantiles) + 1, dim(data)[1])) # [bin, sdate]
   for (i_time in 1:dim(data)[1]) {
-    if (is.null(weights)){
-      if (anyNA(data[i_time, ])) {
+    if (anyNA(data[i_time, ])) {
       probs[, i_time] <- rep(NA, dim = length(quantiles) + 1)
-      } else {
-        probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
-      }
     } else {
-      sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
-      sorted_data <- sorted_arrays$data
-      sorted_weights <- sorted_arrays$weights
-      # approach 1. interpolate
-      cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
-      cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
-      cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
-      # Solve issues for quantiles outside the sorted_data range:
-      xtra_length <- 10
-      if (min(quantiles) < min(sorted_data)){
-        xtra <- seq(min(quantiles), min(sorted_data), length.out=xtra_length)
-        sorted_data <- c(xtra, sorted_data)
-        cumulative_weights <- c(numeric(xtra_length), cumulative_weights)
-      }
-      if (max(quantiles) > max(sorted_data)){
-        xtra <- seq(max(sorted_data), max(quantiles), length.out=xtra_length)
-        sorted_data <- c(sorted_data,xtra)
-        cumulative_weights <- c(cumulative_weights,numeric(xtra_length))
+      if (is.null(weights)){
+        probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
+        print("............probs...................")
+        print(probs[, i_time])
+        print("...............................")
+      } else {
+        # function vvvv
+        sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
+        sorted_data <- sorted_arrays$data
+        cumulative_weights <- sorted_arrays$cumulative_weights
+        # find any quantiles that are outside the data range
+        integrated_probs <- array(dim = c(bin=length(quantiles)))
+        for (i_quant in 1:length(quantiles)) {
+          if (quantiles[i_quant] < min(sorted_data)) {
+            integrated_probs[i_quant] = 0 # append
+          } else if (max(sorted_data) < quantiles[i_quant]) {
+            integrated_probs[i_quant] = 1 # append
+          } else {
+            integrated_probs[i_quant] = approx(sorted_data, cumulative_weights, quantiles[i_quant], "linear")$y
+          }
+        }
+        probs[, i_time] <- append(integrated_probs, 1) - append(0, integrated_probs)
+        # print("..........quantiles.........")
+        # print(quantiles)
+        # print("..........integrated probs.........")
+        # print(integrated_probs)
+        print("..........probs.........")
+        print(probs[, i_time])
+        print("..............................")
+        print("..............................")
+        if (min(probs[, i_time])<0) {print(paste0("ERROR negative probability in i_time=",i_time))} ######################## checks
+        if (max(probs[, i_time])>1) {print(paste0("ERROR >1 probability in i_time=",i_time))} ######################## checks
+        # approach 2. might give erroneous results for quantiles that fall outside the data range. Try normal distro
+        # order <- 5
+        # fit_model <- lm(cumsum(sorted_weights) ~ poly(sorted_data, order, raw=TRUE)) # order 10, but we should think what order we want to use
+        # integrated_probs <- quantiles*0
+        # for (i in 1:(order+1)) {integrated_probs <- integrated_probs+fit_model$coefficients[i]*quantiles^(i-1)}
+        # probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
       }
-      integrated_probs <- interp1(sorted_data, cumulative_weights, quantiles, "cubic")
-      probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
-      if (min(probs[, i_time])<0) {print(paste0("ERROR negative probability in i_time=",i_time))} ######################## checks
-      if (max(probs[, i_time])>1) {print(paste0("ERROR >1 probability in i_time=",i_time))} ######################## checks
-
-      # approach 2. might give erroneous results for quantiles that fall outside the data range. Try normal distro
-      # order <- 5
-      # fit_model <- lm(cumsum(sorted_weights) ~ poly(sorted_data, order, raw=TRUE)) # order 10, but we should think what order we want to use
-      # integrated_probs <- quantiles*0
-      # for (i in 1:(order+1)) {integrated_probs <- integrated_probs+fit_model$coefficients[i]*quantiles^(i-1)}
-      # probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
     }
   }
-
   return(probs)
 }
 
@@ -247,5 +250,9 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   data_vector <- as.vector(data_vector)
   weights_vector <- weights_vector / sum(weights_vector) # normalize to 1
   sorter <- order(data_vector)
-  return(list("data" = data_vector[sorter], "weights" = weights_vector[sorter]))
+  sorted_weights <- weights_vector[sorter]
+  cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
+  cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
+  cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
+  return(list("data" = data_vector[sorter], "cumulative_weights" = cumulative_weights))
 }
\ No newline at end of file
-- 
GitLab
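
The reworked loop now does three things per start date: evaluate the cumulative-weight curve at each threshold, clamp thresholds that fall outside the ensemble range to 0 or 1, and difference the resulting CDF values into bin probabilities. A condensed sketch of that logic (bin_probs is an illustrative name; the package keeps this inline in .get_probs):

# CDF value at each threshold, clamped outside the data range, then
# differenced into the probabilities of the length(thresholds) + 1 bins.
bin_probs <- function(x, w, thresholds) {
  sorter <- order(x)
  x <- x[sorter]
  w <- w[sorter] / sum(w)
  cw <- cumsum(w) - 0.5 * w
  cw <- (cw - cw[1]) / (cw[length(cw)] - cw[1])
  cdf <- sapply(thresholds, function(q) {
    if (q < min(x)) 0       # threshold below every member
    else if (q > max(x)) 1  # threshold above every member
    else approx(x, cw, q, method = "linear")$y
  })
  diff(c(0, cdf, 1))        # same as append(cdf, 1) - append(0, cdf)
}

set.seed(2)
bin_probs(rnorm(10), abs(rnorm(10)), thresholds = c(-0.43, 0.43))

Because the clamped CDF values are non-decreasing for sorted thresholds, the differenced bin probabilities are non-negative and sum to 1, which is exactly what the ERROR print statements above are guarding.
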
From 281ff94a074846ace6d65a5ab2c1c06d1b54858d Mon Sep 17 00:00:00 2001
From: jcos
Date: Wed, 8 Jun 2022 16:39:54 +0200
Subject: [PATCH 3/5] cleanup

---
 R/RPS.R | 39 ++++++---------------------------------
 1 file changed, 6 insertions(+), 33 deletions(-)

diff --git a/R/RPS.R b/R/RPS.R
index e323936..e0e19b4 100644
--- a/R/RPS.R
+++ b/R/RPS.R
@@ -122,7 +122,6 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   } else {
     target_dims_obs <- c(time_dim, memb_dim)
   }
-
   rps <- Apply(data = list(exp = exp, obs = obs),
               target_dims = list(exp = c(time_dim, memb_dim),
                                  obs = target_dims_obs),
@@ -177,24 +176,13 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   if (is.null(weights)){
     quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE)
   } else {
-    # linear approach to compute weighted distribution thresholds.
-    # browser()
-    #weights<-as.array(weights)
     weights <- s2dv::Reorder(data = weights, order = names(dim(data))) # move to check or in .rps *******
-    # is there a better way of doing this: vvvvvvvvvv
     sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], weights[indices_for_quantiles, ])
     sorted_data <- sorted_arrays$data
     cumulative_weights <- sorted_arrays$cumulative_weights
-    quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y # library(pracma)
-
-    # approach 2 (interpolate from the polynomial fit of the cumulative weights distribution)
-    # order <- 5 # should it be fixed? should it be provided? should it be computed according to the number of points available?
-    # fit_model <- lm(sorted_data ~ poly(cumsum(sorted_weights), order, raw=TRUE))
-    # quantiles <- numeric(length(prob_thresholds)) # make a list of zeros the length of prob_thresholds
-    # for (i in 1:(order+1)) {quantiles <- quantiles+fit_model$coefficients[i]*prob_thresholds^(i-1)}
+    quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y
   }
 
   # Probabilities
-  # browser()
   probs <- array(dim = c(bin = length(quantiles) + 1, dim(data)[1])) # [bin, sdate]
   for (i_time in 1:dim(data)[1]) {
     if (anyNA(data[i_time, ])) {
@@ -202,42 +190,27 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
     } else {
       if (is.null(weights)){
         probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
-        print("............probs...................")
-        print(probs[, i_time])
-        print("...............................")
       } else {
-        # function vvvv
         sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
         sorted_data <- sorted_arrays$data
         cumulative_weights <- sorted_arrays$cumulative_weights
         # find any quantiles that are outside the data range
         integrated_probs <- array(dim = c(bin=length(quantiles)))
         for (i_quant in 1:length(quantiles)) {
+          # for thresholds falling under the distribution
           if (quantiles[i_quant] < min(sorted_data)) {
-            integrated_probs[i_quant] = 0 # append
+            integrated_probs[i_quant] = 0
+          # for thresholds falling over the distribution
           } else if (max(sorted_data) < quantiles[i_quant]) {
-            integrated_probs[i_quant] = 1 # append
+            integrated_probs[i_quant] = 1
           } else {
             integrated_probs[i_quant] = approx(sorted_data, cumulative_weights, quantiles[i_quant], "linear")$y
           }
         }
         probs[, i_time] <- append(integrated_probs, 1) - append(0, integrated_probs)
-        # print("..........quantiles.........")
-        # print(quantiles)
-        # print("..........integrated probs.........")
-        # print(integrated_probs)
-        print("..........probs.........")
-        print(probs[, i_time])
-        print("..............................")
-        print("..............................")
+        # This shouldn't happen; unsure if this check is needed then
         if (min(probs[, i_time])<0) {print(paste0("ERROR negative probability in i_time=",i_time))} ######################## checks
         if (max(probs[, i_time])>1) {print(paste0("ERROR >1 probability in i_time=",i_time))} ######################## checks
-        # approach 2. might give erroneous results for quantiles that fall outside the data range. Try normal distro
-        # order <- 5
-        # fit_model <- lm(cumsum(sorted_weights) ~ poly(sorted_data, order, raw=TRUE)) # order 10, but we should think what order we want to use
-        # integrated_probs <- quantiles*0
-        # for (i in 1:(order+1)) {integrated_probs <- integrated_probs+fit_model$coefficients[i]*quantiles^(i-1)}
-        # probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
       }
     }
-- 
GitLab
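
With the debug output gone, one quick way to check the weighted path is to hand it equal weights, where it should land close to the unweighted convert2prob() frequencies (counting and interpolation are different estimators, so small differences remain). A rough consistency check, reusing bin_probs() from the sketch after patch 2:

set.seed(3)
members <- rnorm(100)
quantiles <- quantile(members, c(1/3, 2/3), type = 8)
# unweighted path, as used in .get_probs()
unweighted <- colMeans(easyVerification::convert2prob(members, threshold = quantiles))
# weighted path with uniform weights
weighted <- bin_probs(members, rep(1, 100), quantiles)
rbind(unweighted, weighted)  # the two rows should differ only slightly
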
From f4c5f17a6969c9fa40879cf4146f3ba0a02b5c4f Mon Sep 17 00:00:00 2001
From: aho
Date: Fri, 10 Jun 2022 17:28:52 +0200
Subject: [PATCH 4/5] Add 'weights' documentation

---
 R/RPS.R                    | 53 ++++++++++++++++++++++++--------------
 R/RPSS.R                   | 26 +++++++++++++++----
 man/RPS.Rd                 |  5 ++++
 man/RPSS.Rd                |  5 ++++
 tests/testthat/test-RPS.R  | 21 ++++++++++++++-
 tests/testthat/test-RPSS.R | 21 ++++++++++++++-
 6 files changed, 105 insertions(+), 26 deletions(-)

diff --git a/R/RPS.R b/R/RPS.R
index e0e19b4..00d0c10 100644
--- a/R/RPS.R
+++ b/R/RPS.R
@@ -28,6 +28,9 @@
 #'@param Fair A logical indicating whether to compute the FairRPSS (the
 #'  potential RPSS that the forecast would have with an infinite ensemble size).
 #'  The default value is FALSE.
+#'@param weights A named two-dimensional numerical array of the weights for each
+#'  member and time. The dimension names should include 'memb_dim' and
+#'  'time_dim'. The default value is NULL.
 #'@param ncores An integer indicating the number of cores to use for parallel
 #'  computation. The default value is NULL.
 #'
@@ -47,8 +50,8 @@
 #'@importFrom easyVerification convert2prob
 #'@export
 RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
-                prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, Fair = FALSE,
-                ncores = NULL, weights = NULL) {
+                prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, Fair = FALSE,
+                weights = NULL, ncores = NULL) {
 
   # Check inputs
   ## exp and obs (1)
@@ -106,6 +109,18 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   if (!is.logical(Fair) | length(Fair) > 1) {
     stop("Parameter 'Fair' must be either TRUE or FALSE.")
   }
+  ## weights
+  if (!is.null(weights)) {
+    if (!is.array(weights) | !is.numeric(weights))
+      stop('Parameter "weights" must be a two-dimensional numeric array.')
+    if (length(dim(weights)) != 2 | any(!names(dim(weights)) %in% c(memb_dim, time_dim)))
+      stop("Parameter 'weights' must have two dimensions with the names of memb_dim and time_dim.")
+    if (dim(weights)[memb_dim] != dim(exp)[memb_dim] |
+        dim(weights)[time_dim] != dim(exp)[time_dim]) {
+      stop("Parameter 'weights' must have the same dimension lengths as memb_dim and time_dim in 'exp'.")
+    }
+    weights <- Reorder(weights, c(time_dim, memb_dim))
+  }
   ## ncores
   if (!is.null(ncores)) {
     if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 |
@@ -129,7 +144,7 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
                fun = .RPS,
                prob_thresholds = prob_thresholds,
                indices_for_clim = indices_for_clim, Fair = Fair,
-               ncores = ncores, weights = weights)$output1
+               weights = weights, ncores = ncores)$output1
   # browser()
   # Return only the mean RPS
   rps <- MeanDims(rps, time_dim, na.rm = FALSE)
@@ -145,7 +160,7 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   exp_probs <- .get_probs(data = exp, indices_for_quantiles = indices_for_clim, prob_thresholds = prob_thresholds, weights = weights)
   # exp_probs: [bin, sdate]
   obs_probs <- .get_probs(data = obs, indices_for_quantiles = indices_for_clim,
-                          prob_thresholds = prob_thresholds)
+                          prob_thresholds = prob_thresholds, weights = NULL)
   # obs_probs: [bin, sdate]
 
   probs_exp_cumsum <- apply(exp_probs, 2, cumsum)
@@ -173,52 +188,52 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   if (length(dim(data)) == 1) dim(data) <- c(dim(data), 1)
 
   # Absolute thresholds
-  if (is.null(weights)){
+  if (is.null(weights)) {
     quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE)
   } else {
-    weights <- s2dv::Reorder(data = weights, order = names(dim(data))) # move to check or in .rps *******
+    # weights: [sdate, memb]
     sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], weights[indices_for_quantiles, ])
     sorted_data <- sorted_arrays$data
     cumulative_weights <- sorted_arrays$cumulative_weights
     quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y
   }
+
   # Probabilities
   probs <- array(dim = c(bin = length(quantiles) + 1, dim(data)[1])) # [bin, sdate]
   for (i_time in 1:dim(data)[1]) {
     if (anyNA(data[i_time, ])) {
-    probs[, i_time] <- rep(NA, dim = length(quantiles) + 1)
+      probs[, i_time] <- rep(NA, dim = length(quantiles) + 1)
     } else {
-      if (is.null(weights)){
-      probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
+      if (is.null(weights)) {
+        probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
       } else {
         sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
         sorted_data <- sorted_arrays$data
         cumulative_weights <- sorted_arrays$cumulative_weights
         # find any quantiles that are outside the data range
-        integrated_probs <- array(dim = c(bin=length(quantiles)))
+        integrated_probs <- array(dim = c(bin = length(quantiles)))
         for (i_quant in 1:length(quantiles)) {
           # for thresholds falling under the distribution
           if (quantiles[i_quant] < min(sorted_data)) {
-            integrated_probs[i_quant] = 0
+            integrated_probs[i_quant] <- 0
           # for thresholds falling over the distribution
           } else if (max(sorted_data) < quantiles[i_quant]) {
-            integrated_probs[i_quant] = 1
+            integrated_probs[i_quant] <- 1
           } else {
-            integrated_probs[i_quant] = approx(sorted_data, cumulative_weights, quantiles[i_quant], "linear")$y
+            integrated_probs[i_quant] <- approx(sorted_data, cumulative_weights, quantiles[i_quant], "linear")$y
           }
         }
         probs[, i_time] <- append(integrated_probs, 1) - append(0, integrated_probs)
-        # This shouldn't happen; unsure if this check is needed then
-        if (min(probs[, i_time])<0) {print(paste0("ERROR negative probability in i_time=",i_time))} ######################## checks
-        if (max(probs[, i_time])>1) {print(paste0("ERROR >1 probability in i_time=",i_time))} ######################## checks
+        if (min(probs[, i_time]) < 0 | max(probs[, i_time]) > 1) {
+          stop(paste0("Probability in i_time = ", i_time, " is out of [0, 1]."))
+        }
       }
     }
-  }
   return(probs)
 }
 
-.sorted_distributions <- function(data_vector, weights_vector){
+.sorted_distributions <- function(data_vector, weights_vector) {
   weights_vector <- as.vector(weights_vector)
   data_vector <- as.vector(data_vector)
   weights_vector <- weights_vector / sum(weights_vector) # normalize to 1
   sorter <- order(data_vector)
   sorted_weights <- weights_vector[sorter]
   cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
   cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
   cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
   return(list("data" = data_vector[sorter], "cumulative_weights" = cumulative_weights))
-}
\ No newline at end of file
+}
diff --git a/R/RPSS.R b/R/RPSS.R
index 04c3137..c4e5744 100644
--- a/R/RPSS.R
+++ b/R/RPSS.R
@@ -34,6 +34,9 @@
 #'@param Fair A logical indicating whether to compute the FairRPSS (the
 #'  potential RPSS that the forecast would have with an infinite ensemble size).
 #'  The default value is FALSE.
+#'@param weights A named two-dimensional numerical array of the weights for each
+#'  member and time. The dimension names should include 'memb_dim' and
+#'  'time_dim'. The default value is NULL.
 #'@param ncores An integer indicating the number of cores to use for parallel
 #'  computation. The default value is NULL.
 #'
@@ -62,7 +65,7 @@
 #'@export
 RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member',
                  prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, Fair = FALSE,
-                 ncores = NULL) {
+                 weights = NULL, ncores = NULL) {
 
   # Check inputs
   ## exp, obs, and ref (1)
@@ -135,6 +138,18 @@ RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member',
   if (!is.logical(Fair) | length(Fair) > 1) {
     stop("Parameter 'Fair' must be either TRUE or FALSE.")
   }
+  ## weights
+  if (!is.null(weights)) {
+    if (!is.array(weights) | !is.numeric(weights))
+      stop('Parameter "weights" must be a two-dimensional numeric array.')
+    if (length(dim(weights)) != 2 | any(!names(dim(weights)) %in% c(memb_dim, time_dim)))
+      stop("Parameter 'weights' must have two dimensions with the names of memb_dim and time_dim.")
+    if (dim(weights)[memb_dim] != dim(exp)[memb_dim] |
+        dim(weights)[time_dim] != dim(exp)[time_dim]) {
+      stop("Parameter 'weights' must have the same dimension lengths as memb_dim and time_dim in 'exp'.")
+    }
+    weights <- Reorder(weights, c(time_dim, memb_dim))
+  }
   ## ncores
   if (!is.null(ncores)) {
     if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 |
@@ -167,25 +182,26 @@ RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member',
                   fun = .RPSS,
                   prob_thresholds = prob_thresholds,
                   indices_for_clim = indices_for_clim, Fair = Fair,
+                  weights = weights,
                   ncores = ncores)
 
   return(output)
 }
 
 .RPSS <- function(exp, obs, ref = NULL, prob_thresholds = c(1/3, 2/3),
-                  indices_for_clim = NULL, Fair = FALSE) {
+                  indices_for_clim = NULL, Fair = FALSE, weights = NULL) {
   # exp: [sdate, memb]
   # obs: [sdate, (memb)]
   # ref: [sdate, memb] or NULL
 
   # RPS of the forecast
   rps_exp <- .RPS(exp = exp, obs = obs, prob_thresholds = prob_thresholds,
-                  indices_for_clim = indices_for_clim, Fair = Fair)
+                  indices_for_clim = indices_for_clim, Fair = Fair, weights = weights)
 
   # RPS of the reference forecast
   if (is.null(ref)) { ## using climatology as reference forecast
     obs_probs <- .get_probs(data = obs, indices_for_quantiles = indices_for_clim,
-                            prob_thresholds = prob_thresholds)
+                            prob_thresholds = prob_thresholds, weights = NULL)
     # obs_probs: [bin, sdate]
 
     clim_probs <- c(prob_thresholds[1], diff(prob_thresholds),
                    1 - prob_thresholds[length(prob_thresholds)])
@@ -209,7 +225,7 @@ RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member',
   } else { # use "ref" as reference forecast
     rps_ref <- .RPS(exp = ref, obs = obs, prob_thresholds = prob_thresholds,
-                    indices_for_clim = indices_for_clim, Fair = Fair)
+                    indices_for_clim = indices_for_clim, Fair = Fair, weights = weights)
   }
 
   # RPSS
diff --git a/man/RPS.Rd b/man/RPS.Rd
index d1acc02..5062671 100644
--- a/man/RPS.Rd
+++ b/man/RPS.Rd
@@ -12,6 +12,7 @@ RPS(
   prob_thresholds = c(1/3, 2/3),
   indices_for_clim = NULL,
   Fair = FALSE,
+  weights = NULL,
   ncores = NULL
 )
 }
@@ -40,6 +41,10 @@ the whole period is used. The default value is NULL.}
 potential RPSS that the forecast would have with an infinite ensemble size).
 The default value is FALSE.}
 
+\item{weights}{A named two-dimensional numerical array of the weights for each
+member and time. The dimension names should include 'memb_dim' and
+'time_dim'. The default value is NULL.}
+
 \item{ncores}{An integer indicating the number of cores to use for parallel
 computation. The default value is NULL.}
 }
diff --git a/man/RPSS.Rd b/man/RPSS.Rd
index 6324ea2..ca33c07 100644
--- a/man/RPSS.Rd
+++ b/man/RPSS.Rd
@@ -13,6 +13,7 @@ RPSS(
   prob_thresholds = c(1/3, 2/3),
   indices_for_clim = NULL,
   Fair = FALSE,
+  weights = NULL,
   ncores = NULL
 )
 }
@@ -47,6 +48,10 @@ the whole period is used. The default value is NULL.}
 potential RPSS that the forecast would have with an infinite ensemble size).
 The default value is FALSE.}
 
+\item{weights}{A named two-dimensional numerical array of the weights for each
+member and time. The dimension names should include 'memb_dim' and
+'time_dim'. The default value is NULL.}
+
 \item{ncores}{An integer indicating the number of cores to use for parallel
 computation. The default value is NULL.}
 }
diff --git a/tests/testthat/test-RPS.R b/tests/testthat/test-RPS.R
index e5a5f0c..04df1bb 100644
--- a/tests/testthat/test-RPS.R
+++ b/tests/testthat/test-RPS.R
@@ -7,7 +7,8 @@ set.seed(1)
 exp1 <- array(rnorm(60), dim = c(member = 3, sdate = 10, lat = 2))
 set.seed(2)
 obs1 <- array(rnorm(20), dim = c(sdate = 10, lat = 2))
-
+set.seed(3)
+weights1 <- array(abs(rnorm(30)), dim = c(member = 3, sdate = 10))
 # dat2
 set.seed(1)
 exp2 <- array(rnorm(20), dim = c(member = 2, sdate = 10))
@@ -76,6 +77,19 @@ test_that("1. Input checks", {
     RPS(exp1, obs1, Fair = 1),
     "Parameter 'Fair' must be either TRUE or FALSE."
   )
+  # weights
+  expect_error(
+    RPS(exp1, obs1, weights = c(0, 1)),
+    'Parameter "weights" must be a two-dimensional numeric array.'
+  )
+  expect_error(
+    RPS(exp1, obs1, weights = array(1, dim = c(member = 3, time = 10))),
+    "Parameter 'weights' must have two dimensions with the names of memb_dim and time_dim."
+  )
+  expect_error(
+    RPS(exp1, obs1, weights = array(1, dim = c(member = 3, sdate = 1))),
+    "Parameter 'weights' must have the same dimension lengths as memb_dim and time_dim in 'exp'."
+  )
   # ncores
   expect_error(
     RPS(exp2, obs2, ncores = 1.5),
@@ -111,6 +125,11 @@ test_that("2. Output checks: dat1", {
 as.vector(RPS(exp1, obs1, prob_thresholds = seq(0.1, 0.9, 0.1))),
 c(1.600000, 1.888889),
 tolerance = 0.0001
 )
+expect_equal(
+as.vector(RPS(exp1, obs1, weights = weights1)),
+c(0.3692964, 0.5346627),
+tolerance = 0.0001
+)
 
 })
diff --git a/tests/testthat/test-RPSS.R b/tests/testthat/test-RPSS.R
index b4c0b7a..c63add0 100644
--- a/tests/testthat/test-RPSS.R
+++ b/tests/testthat/test-RPSS.R
@@ -9,7 +9,8 @@ set.seed(2)
 obs1 <- array(rnorm(20), dim = c(sdate = 10, lat = 2))
 set.seed(3)
 ref1 <- array(rnorm(40), dim = c(member = 2, sdate = 10, lat = 2))
-
+set.seed(4)
+weights1 <- array(abs(rnorm(30)), dim = c(member = 3, sdate = 10))
 # dat2
 set.seed(1)
 exp2 <- array(rnorm(20), dim = c(member = 2, sdate = 10))
@@ -87,6 +88,19 @@ test_that("1. Input checks", {
     RPSS(exp1, obs1, Fair = 1),
     "Parameter 'Fair' must be either TRUE or FALSE."
   )
+  # weights
+  expect_error(
+    RPSS(exp1, obs1, weights = c(0, 1)),
+    'Parameter "weights" must be a two-dimensional numeric array.'
+  )
+  expect_error(
+    RPSS(exp1, obs1, weights = array(1, dim = c(member = 3, time = 10))),
+    "Parameter 'weights' must have two dimensions with the names of memb_dim and time_dim."
+  )
+  expect_error(
+    RPSS(exp1, obs1, weights = array(1, dim = c(member = 3, sdate = 1))),
+    "Parameter 'weights' must have the same dimension lengths as memb_dim and time_dim in 'exp'."
+  )
   # ncores
   expect_error(
     RPSS(exp2, obs2, ncores = 1.5),
@@ -154,6 +168,11 @@ test_that("2. Output checks: dat1", {
 c(0.5259259, 0.4771242),
 tolerance = 0.0001
 )
 expect_equal(
+as.vector(RPSS(exp1, obs1, ref1, weights = weights1)$rpss),
+c(0.6596424, 0.4063579),
+tolerance = 0.0001
+)
+expect_equal(
 as.vector(RPSS(exp1, obs1, ref1)$sign),
 c(FALSE, FALSE)
 )
-- 
GitLab
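
Seen from the caller's side, the interface added in this commit matches the new tests: weights is a numeric array with named member and sdate dimensions, aligned with exp. A usage sketch built from the test data (assuming the patched package is loaded, e.g. with devtools::load_all()):

set.seed(1)
exp <- array(rnorm(60), dim = c(member = 3, sdate = 10, lat = 2))
set.seed(2)
obs <- array(rnorm(20), dim = c(sdate = 10, lat = 2))
set.seed(3)
weights <- array(abs(rnorm(30)), dim = c(member = 3, sdate = 10))
RPS(exp, obs, weights = weights)  # one mean-over-sdate RPS per remaining dimension (here: lat)
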
From 7d2c324dae7639a6a996816dc298521a74182551 Mon Sep 17 00:00:00 2001
From: jcos
Date: Thu, 16 Jun 2022 17:39:41 +0200
Subject: [PATCH 5/5] docu: add thresholds to use the W method consistently

---
 R/RPS.R  | 4 +++-
 R/RPSS.R | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/R/RPS.R b/R/RPS.R
index 00d0c10..8ded53e 100644
--- a/R/RPS.R
+++ b/R/RPS.R
@@ -30,7 +30,9 @@
 #'  The default value is FALSE.
 #'@param weights A named two-dimensional numerical array of the weights for each
 #'  member and time. The dimension names should include 'memb_dim' and
-#'  'time_dim'. The default value is NULL.
+#'  'time_dim'. The default value is NULL. The ensemble should have at least 70
+#'  members or span at least 10 time steps and have more than 45 members if
+#'  consistency between the weighted and unweighted methodologies is desired.
 #'@param ncores An integer indicating the number of cores to use for parallel
 #'  computation. The default value is NULL.
 #'
diff --git a/R/RPSS.R b/R/RPSS.R
index c4e5744..3b24777 100644
--- a/R/RPSS.R
+++ b/R/RPSS.R
@@ -36,7 +36,9 @@
 #'  The default value is FALSE.
 #'@param weights A named two-dimensional numerical array of the weights for each
 #'  member and time. The dimension names should include 'memb_dim' and
-#'  'time_dim'. The default value is NULL.
+#'  'time_dim'. The default value is NULL. The ensemble should have at least 70
+#'  members or span at least 10 time steps and have more than 45 members if
+#'  consistency between the weighted and unweighted methodologies is desired.
 #'@param ncores An integer indicating the number of cores to use for parallel
 #'  computation. The default value is NULL.
 #'
-- 
GitLab
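
The same weights array flows through RPSS: it is used for exp and, when a reference forecast is supplied, passed on to the rps_ref computation as well (see the .RPS calls in patch 4). Continuing the sketch above with the reference forecast from the tests:

set.seed(3)
ref <- array(rnorm(40), dim = c(member = 2, sdate = 10, lat = 2))
res <- RPSS(exp, obs, ref, weights = weights)
res$rpss  # skill score per lat
res$sign  # significance flag per lat
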