From e90115e54c3a894a4422144b3353307d9ca6202b Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 28 Mar 2023 17:34:26 +0200 Subject: [PATCH 1/4] Create GetProbs() --- NAMESPACE | 1 + R/GetProbs.R | 251 ++++++++++++++++++++++++++++++++ R/ROCSS.R | 8 +- R/RPS.R | 4 +- R/RPSS.R | 2 +- R/Utils.R | 86 ----------- man/GetProbs.Rd | 67 +++++++++ tests/testthat/test-GetProbs.R | 259 +++++++++++++++++++++++++++++++++ 8 files changed, 585 insertions(+), 93 deletions(-) create mode 100644 R/GetProbs.R create mode 100644 man/GetProbs.Rd create mode 100644 tests/testthat/test-GetProbs.R diff --git a/NAMESPACE b/NAMESPACE index 6224a15..d74cc97 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -36,6 +36,7 @@ export(EuroAtlanticTC) export(Filter) export(GMST) export(GSAT) +export(GetProbs) export(Histo2Hindcast) export(InsertDim) export(LeapYear) diff --git a/R/GetProbs.R b/R/GetProbs.R new file mode 100644 index 0000000..1f7651d --- /dev/null +++ b/R/GetProbs.R @@ -0,0 +1,251 @@ +#'Compute forecast probability +#' +#'Compute the forecast probability based on the relative thresholds. A certain +#'period can be specified to calculate the quantiles between each probabilistic +#'category. If data has ensemble, all the members are used together to +#'calculate the probabilities. Weights of each member and time can be provided. +#'Cross-validation can be chosen when absolute threshold is calculated. +#' +#'@param data A named numerical array of the forecast or observation with at +#' least time dimension. +#'@param time_dim A character string indicating the name of the time dimension. +#' The default value is 'sdate'. +#'@param memb_dim A character string indicating the name of the member dimension +#' to compute the probabilities of the forecast. The default value is 'member'. +#'@param dat_dim A character string indicating the name of dataset dimension. +#' The default value is NULL, which means no dataset dimension. +#'@param prob_thresholds A numeric vector of the relative thresholds (from 0 to +#' 1) between the categories. The default value is c(1/3, 2/3), which +#' corresponds to tercile equiprobable categories. +#'@param indices_for_quantiles A vector of the indices to be taken along 'time_dim' +#' for computing the thresholds between the probabilistic categories. If NULL, +#' the whole period is used. The default value is NULL. +#'@param weights A named numerical array of the weights for 'data' with +#' dimensions 'time_dim' and 'memb_dim' (if 'data' has it). The default value is +#' NULL. The ensemble should have at least 70 members or span at least 10 time +#' steps and have more than 45 members if consistency between the weighted and +#' unweighted methodologies is desired. +#'@param cross.val A logical indicating whether to compute the thresholds between +#' probabilistic categories in cross-validation. The default value is FALSE. +#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. +#' +#'@return +#'A numerical array of probabilities with dimensions c(bin, the rest dimensions +#'of 'data' except 'memb_dim'). 'bin' dimension has the length of categories, +#'i.e., \code{length(prob_thresholds) + 1}. +#' +#'@examples +#'data <- array(rnorm(2000), dim = c(ensemble = 25, sdate = 20, time = 4)) +#'res <- GetProbs(data = data, time_dim = 'sdate', memb_dim = 'ensemble', indices_for_quantiles = 4:17) +#' +#'@import multiApply +#'@importFrom easyVerification convert2prob +#'@export +GetProbs <- function(data, time_dim = 'sdate', memb_dim = 'member', indices_for_quantiles = NULL, + prob_thresholds = c(1/3, 2/3), weights = NULL, cross.val = FALSE, ncores = NULL) { + + # Check inputs + ## data + if (is.null(data)) { + stop("Parameter 'data' cannot be NULL.") + } + if (!is.numeric(data)) { + stop("Parameter 'data' must be a numeric array.") + } + if (any(is.null(names(dim(data)))) | any(nchar(names(dim(data))) == 0)) { + stop("Parameter 'data' must have dimension names.") + } + ## time_dim + if (!is.character(time_dim) | length(time_dim) != 1) + stop('Parameter "time_dim" must be a character string.') + if (!time_dim %in% names(dim(data))) { + stop("Parameter 'time_dim' is not found in 'data' dimensions.") + } + ## memb_dim + if (!is.null(memb_dim)) { + if (!is.character(memb_dim) | length(memb_dim) > 1) { + stop("Parameter 'memb_dim' must be a character string.") + } + if (!memb_dim %in% names(dim(data))) { + stop("Parameter 'memb_dim' is not found in 'data' dimensions. If no member ", + "dimension exists, set it as NULL.") + } + } + ## prob_thresholds + if (!is.numeric(prob_thresholds) | !is.vector(prob_thresholds) | + any(prob_thresholds <= 0) | any(prob_thresholds >= 1)) { + stop("Parameter 'prob_thresholds' must be a numeric vector between 0 and 1.") + } + ## indices_for_quantiles + if (is.null(indices_for_quantiles)) { + indices_for_quantiles <- 1:dim(data)[time_dim] + } else { + if (!is.numeric(indices_for_quantiles) | !is.vector(indices_for_quantiles)) { + stop("Parameter 'indices_for_quantiles' must be NULL or a numeric vector.") + } else if (length(indices_for_quantiles) > dim(data)[time_dim] | + max(indices_for_quantiles) > dim(data)[time_dim] | + any(indices_for_quantiles) < 1) { + stop("Parameter 'indices_for_quantiles' should be the indices of 'time_dim'.") + } + } + ## weights + if (!is.null(weights)) { + if (!is.array(weights) | !is.numeric(weights)) + stop("Parameter 'weights' must be a named numeric array.") + +# if (is.null(dat_dim)) { + if (!is.null(memb_dim)) { + lendim_weights <- 2 + namesdim_weights <- c(time_dim, memb_dim) + } else { + lendim_weights <- 1 + namesdim_weights <- c(time_dim) + } + if (length(dim(weights)) != lendim_weights | + any(!names(dim(weights)) %in% namesdim_weights)) { + stop(paste0("Parameter 'weights' must have dimension ", + paste0(namesdim_weights, collapse = ' and '), ".")) + } + if (any(dim(weights)[namesdim_weights] != dim(data)[namesdim_weights])) { + stop(paste0("Parameter 'weights' must have the same dimension length as ", + paste0(namesdim_weights, collapse = ' and '), " dimension in 'data'.")) + } + weights <- Reorder(weights, namesdim_weights) + +# } else { +# if (length(dim(weights)) != 3 | any(!names(dim(weights)) %in% c(memb_dim, time_dim, dat_dim))) +# stop("Parameter 'weights' must have three dimensions with the names of 'memb_dim', 'time_dim' and 'dat_dim'.") +# if (dim(weights)[memb_dim] != dim(exp)[memb_dim] | +# dim(weights)[time_dim] != dim(exp)[time_dim] | +# dim(weights)[dat_dim] != dim(exp)[dat_dim]) { +# stop(paste0("Parameter 'weights' must have the same dimension lengths ", +# "as 'memb_dim', 'time_dim' and 'dat_dim' in 'exp'.")) +# } +# weights <- Reorder(weights, c(time_dim, memb_dim, dat_dim)) +# } + } + ## cross.val + if (!is.logical(cross.val) | length(cross.val) > 1) { + stop("Parameter 'cross.val' must be either TRUE or FALSE.") + } + ## ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | + length(ncores) > 1) { + stop("Parameter 'ncores' must be either NULL or a positive integer.") + } + } + + ############################### + + res <- Apply(data = list(data = data), + target_dims = c(time_dim, memb_dim), #, dat_dim), + output_dims = c("bin", time_dim), + fun = .GetProbs, +# dat_dim = dat_dim, + time_dim = time_dim, + memb_dim = memb_dim, + prob_thresholds = prob_thresholds, + indices_for_quantiles = indices_for_quantiles, + weights = weights, cross.val = cross.val, ncores = ncores)$output1 + + return(res) +} + +.GetProbs <- function(data, time_dim = 'sdate', memb_dim = 'member', indices_for_quantiles = NULL, + prob_thresholds = c(1/3, 2/3), weights = NULL, cross.val = FALSE) { + # .GetProbs() is used in RPS, RPSS, ROCSS + # data + ## if data is exp: [sdate, memb] + ## if data is obs: [sdate, (memb)] + # weights: [sdate, (memb)], same as data + + # Add dim [memb = 1] to data if it doesn't have memb_dim + if (length(dim(data)) == 1) { + dim(data) <- c(dim(data), 1) + if (!is.null(weights)) dim(weights) <- c(dim(weights), 1) + } + # Absolute thresholds + if (cross.val) { + quantiles <- array(NA, dim = c(bin = length(prob_thresholds), sdate = dim(data)[1])) + for (i_time in 1:dim(data)[1]) { + if (is.null(weights)) { + quantiles[, i_time] <- quantile(x = as.vector(data[indices_for_quantiles[which(indices_for_quantiles != i_time)], ]), + probs = prob_thresholds, type = 8, na.rm = TRUE) + } else { + # weights: [sdate, memb] + sorted_arrays <- .sorted_distributions(data[indices_for_quantiles[which(indices_for_quantiles != i_time)], ], + weights[indices_for_quantiles[which(indices_for_quantiles != i_time)], ]) + sorted_data <- sorted_arrays$data + cumulative_weights <- sorted_arrays$cumulative_weights + quantiles[, i_time] <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y + } + } + + } else { + if (is.null(weights)) { + quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), + probs = prob_thresholds, type = 8, na.rm = TRUE) + } else { + # weights: [sdate, memb] + sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], + weights[indices_for_quantiles, ]) + sorted_data <- sorted_arrays$data + cumulative_weights <- sorted_arrays$cumulative_weights + quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y + } + quantiles <- array(rep(quantiles, dim(data)[1]), dim = c(bin = length(quantiles), dim(data)[1])) + } + # quantiles: [bin-1, sdate] + + # Probabilities + probs <- array(dim = c(dim(quantiles)[1] + 1, dim(data)[1])) # [bin, sdate] + for (i_time in 1:dim(data)[1]) { + if (anyNA(data[i_time, ])) { + probs[, i_time] <- rep(NA, dim = dim(quantiles)[1] + 1) + } else { + if (is.null(weights)) { + probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], + threshold = quantiles[, i_time])) + } else { + sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ]) + sorted_data <- sorted_arrays$data + cumulative_weights <- sorted_arrays$cumulative_weights + # find any quantiles that are outside the data range + integrated_probs <- array(dim = dim(quantiles)) + for (i_quant in 1:dim(quantiles)[1]) { + # for thresholds falling under the distribution + if (quantiles[i_quant, i_time] < min(sorted_data)) { + integrated_probs[i_quant, i_time] <- 0 + # for thresholds falling over the distribution + } else if (max(sorted_data) < quantiles[i_quant, i_time]) { + integrated_probs[i_quant, i_time] <- 1 + } else { + integrated_probs[i_quant, i_time] <- approx(sorted_data, cumulative_weights, + quantiles[i_quant, i_time], "linear")$y + } + } + probs[, i_time] <- append(integrated_probs[, i_time], 1) - append(0, integrated_probs[, i_time]) + if (min(probs[, i_time]) < 0 | max(probs[, i_time]) > 1) { + stop(paste0("Probability in i_time = ", i_time, " is out of [0, 1].")) + } + } + } + } + + return(probs) +} + +.sorted_distributions <- function(data_vector, weights_vector) { + weights_vector <- as.vector(weights_vector) + data_vector <- as.vector(data_vector) + weights_vector <- weights_vector / sum(weights_vector) # normalize to 1 + sorter <- order(data_vector) + sorted_weights <- weights_vector[sorter] + cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights + cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0 + cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1 + return(list("data" = data_vector[sorter], "cumulative_weights" = cumulative_weights)) +} + diff --git a/R/ROCSS.R b/R/ROCSS.R index 7831a88..1e9f5e2 100644 --- a/R/ROCSS.R +++ b/R/ROCSS.R @@ -262,13 +262,13 @@ ROCSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member', for (exp_i in 1:nexp) { for (obs_i in 1:nobs) { - # Input dim for .get_probs + # Input dim for .GetProbs ## if exp: [sdate, memb] ## if obs: [sdate, (memb)] - exp_probs <- .get_probs(ClimProjDiags::Subset(exp, dat_dim, exp_i, drop = 'selected'), + exp_probs <- .GetProbs(ClimProjDiags::Subset(exp, dat_dim, exp_i, drop = 'selected'), indices_for_quantiles = indices_for_clim, prob_thresholds = prob_thresholds, cross.val = cross.val) - obs_probs <- .get_probs(data = ClimProjDiags::Subset(obs, dat_dim, obs_i, drop = 'selected'), + obs_probs <- .GetProbs(data = ClimProjDiags::Subset(obs, dat_dim, obs_i, drop = 'selected'), indices_for_quantiles = indices_for_clim, prob_thresholds = prob_thresholds, cross.val = cross.val) ## exp_probs and obs_probs: [bin, sdate] @@ -278,7 +278,7 @@ ROCSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member', obs = Reorder(obs_probs, c(time_dim, 'bin')))[1:ncats]) if (!is.null(ref)) { - ref_probs <- .get_probs(ClimProjDiags::Subset(ref, dat_dim, exp_i, drop = 'selected'), + ref_probs <- .GetProbs(ClimProjDiags::Subset(ref, dat_dim, exp_i, drop = 'selected'), indices_for_quantiles = indices_for_clim, prob_thresholds = prob_thresholds, cross.val = cross.val) rocs_ref[exp_i, obs_i, ] <- unlist(EnsRoca(ens = Reorder(ref_probs, c(time_dim, 'bin')), diff --git a/R/RPS.R b/R/RPS.R index 32d88a4..a12f6a5 100644 --- a/R/RPS.R +++ b/R/RPS.R @@ -239,10 +239,10 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', dat_dim = NUL weights_data <- weights } - exp_probs <- .get_probs(data = exp_data, indices_for_quantiles = indices_for_clim, + exp_probs <- .GetProbs(data = exp_data, indices_for_quantiles = indices_for_clim, prob_thresholds = prob_thresholds, weights = weights_data, cross.val = cross.val) # exp_probs: [bin, sdate] - obs_probs <- .get_probs(data = obs_data, indices_for_quantiles = indices_for_clim, + obs_probs <- .GetProbs(data = obs_data, indices_for_quantiles = indices_for_clim, prob_thresholds = prob_thresholds, weights = NULL, cross.val = cross.val) # obs_probs: [bin, sdate] probs_exp_cumsum <- apply(exp_probs, 2, cumsum) diff --git a/R/RPSS.R b/R/RPSS.R index efd7950..16d038f 100644 --- a/R/RPSS.R +++ b/R/RPSS.R @@ -351,7 +351,7 @@ RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member', obs_data <- obs[ , , j] if (is.null(dim(obs_data))) dim(obs_data) <- c(dim(obs)[1:2]) # obs_probs: [bin, sdate] - obs_probs <- .get_probs(data = obs_data, indices_for_quantiles = indices_for_clim, + obs_probs <- .GetProbs(data = obs_data, indices_for_quantiles = indices_for_clim, prob_thresholds = prob_thresholds, weights = NULL, cross.val = cross.val) # clim_probs: [bin, sdate] clim_probs <- c(prob_thresholds[1], diff(prob_thresholds), 1 - prob_thresholds[length(prob_thresholds)]) diff --git a/R/Utils.R b/R/Utils.R index 8770af9..cb6eb34 100644 --- a/R/Utils.R +++ b/R/Utils.R @@ -1809,89 +1809,3 @@ GradientCatsColorBar <- function(nmap, brks = NULL, cols = NULL, vertical = TRUE } -.get_probs <- function(data, indices_for_quantiles, prob_thresholds, weights = NULL, cross.val = FALSE) { - # if exp: [sdate, memb] - # if obs: [sdate, (memb)] - - # Add dim [memb = 1] to obs if it doesn't have memb_dim - if (length(dim(data)) == 1) dim(data) <- c(dim(data), 1) - - # Absolute thresholds - if (cross.val) { - quantiles <- array(NA, dim = c(bin = length(prob_thresholds), sdate = dim(data)[1])) - for (i in 1:dim(data)[1]) { - if (is.null(weights)) { - quantiles[,i] <- quantile(x = as.vector(data[indices_for_quantiles[which(indices_for_quantiles != i)], ]), - probs = prob_thresholds, type = 8, na.rm = TRUE) - } else { - # weights: [sdate, memb] - sorted_arrays <- .sorted_distributions(data[indices_for_quantiles[which(indices_for_quantiles != i)], ], - weights[indices_for_quantiles[which(indices_for_quantiles != i)], ]) - sorted_data <- sorted_arrays$data - cumulative_weights <- sorted_arrays$cumulative_weights - quantiles[,i] <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y - } - } - } else { - if (is.null(weights)) { - quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE) - } else { - # weights: [sdate, memb] - sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], weights[indices_for_quantiles, ]) - sorted_data <- sorted_arrays$data - cumulative_weights <- sorted_arrays$cumulative_weights - quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y - } - quantiles <- array(rep(quantiles, dim(data)[1]),dim = c(bin = length(quantiles), dim(data)[1])) - } - - # quantiles: [bin-1, sdate] - # Probabilities - probs <- array(dim = c(dim(quantiles)[1] + 1, dim(data)[1])) # [bin, sdate] - for (i_time in 1:dim(data)[1]) { - if (anyNA(data[i_time, ])) { - probs[, i_time] <- rep(NA, dim = dim(quantiles)[1] + 1) - } else { - if (is.null(weights)) { - probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], - threshold = quantiles[,i_time])) - } else { - sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ]) - sorted_data <- sorted_arrays$data - cumulative_weights <- sorted_arrays$cumulative_weights - # find any quantiles that are outside the data range - integrated_probs <- array(dim = dim(quantiles)) - for (i_quant in 1:dim(quantiles)[1]) { - # for thresholds falling under the distribution - if (quantiles[i_quant, i_time] < min(sorted_data)) { - integrated_probs[i_quant, i_time] <- 0 - # for thresholds falling over the distribution - } else if (max(sorted_data) < quantiles[i_quant, i_time]) { - integrated_probs[i_quant, i_time] <- 1 - } else { - integrated_probs[i_quant, i_time] <- approx(sorted_data, cumulative_weights, quantiles[i_quant, i_time], - "linear")$y - } - } - probs[, i_time] <- append(integrated_probs[,i_time], 1) - append(0, integrated_probs[,i_time]) - if (min(probs[, i_time]) < 0 | max(probs[, i_time]) > 1) { - stop(paste0("Probability in i_time = ", i_time, " is out of [0, 1].")) - } - } - } - } - return(probs) -} - -.sorted_distributions <- function(data_vector, weights_vector) { - weights_vector <- as.vector(weights_vector) - data_vector <- as.vector(data_vector) - weights_vector <- weights_vector / sum(weights_vector) # normalize to 1 - sorter <- order(data_vector) - sorted_weights <- weights_vector[sorter] - cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights - cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0 - cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1 - return(list("data" = data_vector[sorter], "cumulative_weights" = cumulative_weights)) -} - diff --git a/man/GetProbs.Rd b/man/GetProbs.Rd new file mode 100644 index 0000000..c967a2e --- /dev/null +++ b/man/GetProbs.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GetProbs.R +\name{GetProbs} +\alias{GetProbs} +\title{Compute forecast probability} +\usage{ +GetProbs( + data, + time_dim = "sdate", + memb_dim = "member", + indices_for_quantiles = NULL, + prob_thresholds = c(1/3, 2/3), + weights = NULL, + cross.val = FALSE, + ncores = NULL +) +} +\arguments{ +\item{data}{A named numerical array of the forecast or observation with at +least time dimension.} + +\item{time_dim}{A character string indicating the name of the time dimension. +The default value is 'sdate'.} + +\item{memb_dim}{A character string indicating the name of the member dimension +to compute the probabilities of the forecast. The default value is 'member'.} + +\item{indices_for_quantiles}{A vector of the indices to be taken along 'time_dim' +for computing the thresholds between the probabilistic categories. If NULL, +the whole period is used. The default value is NULL.} + +\item{prob_thresholds}{A numeric vector of the relative thresholds (from 0 to +1) between the categories. The default value is c(1/3, 2/3), which +corresponds to tercile equiprobable categories.} + +\item{weights}{A named numerical array of the weights for 'data' with +dimensions 'time_dim' and 'memb_dim' (if 'data' has it). The default value is +NULL. The ensemble should have at least 70 members or span at least 10 time +steps and have more than 45 members if consistency between the weighted and +unweighted methodologies is desired.} + +\item{cross.val}{A logical indicating whether to compute the thresholds between +probabilistic categories in cross-validation. The default value is FALSE.} + +\item{ncores}{An integer indicating the number of cores to use for parallel +computation. The default value is NULL.} + +\item{dat_dim}{A character string indicating the name of dataset dimension. +The default value is NULL, which means no dataset dimension.} +} +\value{ +A numerical array of probabilities with dimensions c(bin, the rest dimensions +of 'data' except 'memb_dim'). 'bin' dimension has the length of categories, +i.e., \code{length(prob_thresholds) + 1}. +} +\description{ +Compute the forecast probability based on the relative thresholds. A certain +period can be specified to calculate the quantiles between each probabilistic +category. If data has ensemble, all the members are used together to +calculate the probabilities. Weights of each member and time can be provided. +Cross-validation can be chosen when absolute threshold is calculated. +} +\examples{ +data <- array(rnorm(2000), dim = c(ensemble = 25, sdate = 20, time = 4)) +res <- GetProbs(data = data, time_dim = 'sdate', memb_dim = 'ensemble', indices_for_quantiles = 4:17) + +} diff --git a/tests/testthat/test-GetProbs.R b/tests/testthat/test-GetProbs.R new file mode 100644 index 0000000..252dd29 --- /dev/null +++ b/tests/testthat/test-GetProbs.R @@ -0,0 +1,259 @@ +context("s2dv::GetProbs tests") + +############################################## + +# dat1 +set.seed(1) +data1 <- array(rnorm(60), dim = c(member = 3, sdate = 10, time = 2)) +set.seed(2) +weights1 <- array(abs(rnorm(30)), dim = c(member = 3, sdate = 10)) + +# dat2 +set.seed(1) +data2 <- array(rnorm(20), dim = c(sdate = 10, time = 2)) +set.seed(2) +weights2 <- array(abs(rnorm(10)), dim = c(sdate = 10)) + +############################################## + +test_that("1. Input checks", { + # data + expect_error( + GetProbs(c()), + "Parameter 'data' cannot be NULL." + ) + expect_error( + GetProbs(c(data2)), + "Parameter 'data' must have dimension names." + ) + # time_dim + expect_error( + GetProbs(data1, time_dim = 1), + 'Parameter "time_dim" must be a character string.' + ) + expect_error( + GetProbs(data1, time_dim = 'ftime'), + "Parameter 'time_dim' is not found in 'data' dimensions." + ) + # memb_dim + expect_error( + GetProbs(data1, memb_dim = TRUE), + "Parameter 'memb_dim' must be a character string." + ) + expect_error( + GetProbs(data1, memb_dim = 'memb'), + "Parameter 'memb_dim' is not found in 'data' dimensions." + ) + # prob_thresholds + expect_error( + GetProbs(data1, prob_thresholds = 1), + "Parameter 'prob_thresholds' must be a numeric vector between 0 and 1." + ) + # indices_for_clim + expect_error( + GetProbs(data1, indices_for_quantiles = array(1:6, dim = c(2, 3))), + "Parameter 'indices_for_quantiles' must be NULL or a numeric vector." + ) + expect_error( + GetProbs(data1, indices_for_quantiles = 3:11), + "Parameter 'indices_for_quantiles' should be the indices of 'time_dim'." + ) + # cross.val + expect_error( + GetProbs(data1, cross.val = 1), + "Parameter 'cross.val' must be either TRUE or FALSE." + ) + # weights + expect_error( + GetProbs(data1, weights = c(0, 1)), + "Parameter 'weights' must be a named numeric array." + ) + expect_error( + GetProbs(data1, weights = array(1, dim = c(member = 3, time = 10))), + "Parameter 'weights' must have dimension sdate and member." + ) + expect_error( + GetProbs(data1, weights = array(1, dim = c(member = 3, sdate = 1))), + "Parameter 'weights' must have the same dimension length as sdate and member dimension in 'data'." + ) +# expect_error( +# GetProbs(data3, weights = array(1, dim = c(member = 3, time = 10, dataset = 3)), dat_dim = 'dataset'), +# "Parameter 'weights' must have three dimensions with the names of 'memb_dim', 'time_dim' and 'dat_dim'." +# ) + # ncores + expect_error( + GetProbs(data1, ncores = 1.5), + "Parameter 'ncores' must be either NULL or a positive integer." + ) + +}) + +############################################## +test_that("2. Output checks: dat1", { + +expect_equal( +dim(GetProbs(data1)), +c(bin = 3, sdate = 10, time = 2) +) +expect_equal( +c(GetProbs(data1)[, 10, 2]), +c(0.3333333, 0.3333333, 0.3333333), +tolerance = 0.0001 +) +expect_equal( +c(GetProbs(data1)[, 2, 2]), +c(0.6666667, 0.3333333, 0.0000000), +tolerance = 0.0001 +) + +# indices_for_quantiles +expect_equal( +dim(GetProbs(data1, indices_for_quantiles = 4:7)), +c(bin = 3, sdate = 10, time = 2) +) +expect_equal( +c(GetProbs(data1, indices_for_quantiles = 4:7)[, 10, 2]), +c(0.3333333, 0.6666667, 0.0000000), +tolerance = 0.0001 +) + +# prob_thresholds +expect_equal( +dim(GetProbs(data1, prob_thresholds = c(0.25, 0.5, 0.75))), +c(bin = 4, sdate = 10, time = 2) +) +expect_equal( +c(GetProbs(data1, prob_thresholds = c(0.25, 0.5, 0.75))[, 10, 2]), +c(0.3333333, 0.3333333, 0.3333333, 0.0000000), +tolerance = 0.0001 +) +expect_equal( +c(GetProbs(data1, prob_thresholds = c(0.25, 0.5, 0.75))[, 3, 2]), +c(0.0000000, 0.6666667, 0.0000000, 0.3333333), +tolerance = 0.0001 +) + +# weights +expect_equal( +dim(GetProbs(data1, weights = weights1)), +c(bin = 3, sdate = 10, time = 2) +) +expect_equal( +c(GetProbs(data1, weights = weights1)[, 10, 2]), +c(0.3327220, 0.5296149, 0.1376631), +tolerance = 0.0001 +) +expect_equal( +sum(c(GetProbs(data1, weights = weights1))), +20 +) + +# cross.val +expect_equal( +dim(GetProbs(data1, cross.val = T)), +c(bin = 3, sdate = 10, time = 2) +) +expect_equal( +c(GetProbs(data1, cross.val = T)[, 10, 2]), +c(0.3333333, 0.3333333, 0.3333333), +tolerance = 0.0001 +) +expect_equal( +c(GetProbs(data1, cross.val = T)[, 4, 2]), +c(0.0000000, 0.6666667, 0.3333333), +tolerance = 0.0001 +) + +# cross.val + weights +expect_equal( +dim(GetProbs(data1, cross.val = T, weights = weights1)), +c(bin = 3, sdate = 10, time = 2) +) +expect_equal( +c(GetProbs(data1, cross.val = T, weights = weights1)[, 10, 2]), +c(0.3335612, 0.5277459, 0.1386929), +tolerance = 0.0001 +) + +}) + + +############################################## +test_that("3. Output checks: dat2", { + +expect_equal( +dim(GetProbs(data2, memb_dim = NULL)), +c(bin = 3, sdate = 10, time = 2) +) +expect_equal( +c(GetProbs(data2, memb_dim = NULL)[, 10, 2]), +c(0, 1, 0) +) +expect_equal( +unique(c(GetProbs(data2, memb_dim = NULL))), +c(1, 0) +) + +# indices_for_quantiles +expect_equal( +dim(GetProbs(data2, memb_dim = NULL, indices_for_quantiles = 4:7)), +c(bin = 3, sdate = 10, time = 2) +) +expect_equal( +c(GetProbs(data2, memb_dim = NULL, indices_for_quantiles = 4:7)[, 10, 2]), +c(0, 0, 1) +) + +# prob_thresholds +expect_equal( +dim(GetProbs(data2, memb_dim = NULL, prob_thresholds = c(0.25, 0.5, 0.75))), +c(bin = 4, sdate = 10, time = 2) +) +expect_equal( +c(GetProbs(data2, memb_dim = NULL, prob_thresholds = c(0.25, 0.5, 0.75))[, 10, 2]), +c(0, 0, 1, 0) +) +expect_equal( +c(GetProbs(data2, memb_dim = NULL, prob_thresholds = c(0.25, 0.5, 0.75))[, 3, 2]), +c(1, 0, 0, 0) +) + +# weights +expect_equal( +dim(GetProbs(data2, memb_dim = NULL, weights = weights2)), +c(bin = 3, sdate = 10, time = 2) +) +expect_equal( +c(GetProbs(data2, memb_dim = NULL, weights = weights2)[, 10, 2]), +c(0, 1, 0) +) +expect_equal( +sum(c(GetProbs(data2, memb_dim = NULL, weights = weights2))), +20 +) + +# cross.val +expect_equal( +dim(GetProbs(data2, memb_dim = NULL, cross.val = T)), +c(bin = 3, sdate = 10, time = 2) +) +expect_equal( +c(GetProbs(data2, memb_dim = NULL, cross.val = T)[, 10, 2]), +c(0, 1, 0) +) +expect_equal( +c(GetProbs(data2, memb_dim = NULL, cross.val = T)[, 4, 2]), +c(1, 0, 0) +) + +# cross.val + weights +expect_equal( +dim(GetProbs(data2, memb_dim = NULL, cross.val = T, weights = weights2)), +c(bin = 3, sdate = 10, time = 2) +) +expect_equal( +c(GetProbs(data2, memb_dim = NULL, cross.val = T, weights = weights2)[, 10, 2]), +c(0, 1, 0) +) + +}) -- GitLab From f49d8a1102cb1d1609ccafd79dad9933b7c75189 Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 28 Mar 2023 17:35:01 +0200 Subject: [PATCH 2/4] Don't ignore unit tests in pipeline --- .Rbuildignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.Rbuildignore b/.Rbuildignore index 6008b57..4212858 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -10,7 +10,7 @@ README\.md$ vignettes .gitlab-ci.yml # unit tests should be ignored when building the package for CRAN -^tests$ +#^tests$ # CDO is not in windbuilder, so we can test the unit tests by winbuilder # but test-CDORemap.R and test-Load.R needs to be hidden #tests/testthat/test-CDORemap.R -- GitLab From 92d5776f89b56508a08980dd59ed0740d5bd91fc Mon Sep 17 00:00:00 2001 From: Carlos Delgado Date: Wed, 29 Mar 2023 09:24:40 +0200 Subject: [PATCH 3/4] corrected indices_for_quantiles check to avoid warning --- R/GetProbs.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/GetProbs.R b/R/GetProbs.R index 1f7651d..351654c 100644 --- a/R/GetProbs.R +++ b/R/GetProbs.R @@ -85,7 +85,7 @@ GetProbs <- function(data, time_dim = 'sdate', memb_dim = 'member', indices_for_ stop("Parameter 'indices_for_quantiles' must be NULL or a numeric vector.") } else if (length(indices_for_quantiles) > dim(data)[time_dim] | max(indices_for_quantiles) > dim(data)[time_dim] | - any(indices_for_quantiles) < 1) { + any(indices_for_quantiles < 1)) { stop("Parameter 'indices_for_quantiles' should be the indices of 'time_dim'.") } } -- GitLab From d1c4feebe53be356b3a6bf3dfa55d3d052c05b79 Mon Sep 17 00:00:00 2001 From: Carlos Delgado Date: Wed, 29 Mar 2023 11:05:08 +0200 Subject: [PATCH 4/4] documentation --- R/GetProbs.R | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/R/GetProbs.R b/R/GetProbs.R index 351654c..6a4dcfe 100644 --- a/R/GetProbs.R +++ b/R/GetProbs.R @@ -1,39 +1,45 @@ -#'Compute forecast probability +#'Compute probabilistic forecasts or the corresponding observations #' -#'Compute the forecast probability based on the relative thresholds. A certain -#'period can be specified to calculate the quantiles between each probabilistic -#'category. If data has ensemble, all the members are used together to -#'calculate the probabilities. Weights of each member and time can be provided. -#'Cross-validation can be chosen when absolute threshold is calculated. +#'Compute probabilistic forecasts from an ensemble based on the relative thresholds, +#'or the probabilistic observations (i.e., which probabilistic category was observed). +#'A reference period can be specified to calculate the absolute thresholds between +#'each probabilistic category. The absolute thresholds can be computed in cross-validation +#'mode. If data is an ensemble, the probabilities are calculated as the percentage of +#'members that fall into each category. For observations (or forecast without member +#'dimension), 1 means that the event happened, while 0 indicates that the event did +#'not happen. Weighted probabilities can be computed if the weights are provided for +#'each ensemble member and time step. #' -#'@param data A named numerical array of the forecast or observation with at -#' least time dimension. +#'@param data A named numerical array of the forecasts or observations with, at +#' least, time dimension. #'@param time_dim A character string indicating the name of the time dimension. #' The default value is 'sdate'. #'@param memb_dim A character string indicating the name of the member dimension -#' to compute the probabilities of the forecast. The default value is 'member'. +#' to compute the probabilities of the forecast, or NULL if there is no member +#' dimension (e.g., for observations, or for forecast with only one ensemble +#' member). The default value is 'member'. #'@param dat_dim A character string indicating the name of dataset dimension. #' The default value is NULL, which means no dataset dimension. #'@param prob_thresholds A numeric vector of the relative thresholds (from 0 to #' 1) between the categories. The default value is c(1/3, 2/3), which #' corresponds to tercile equiprobable categories. #'@param indices_for_quantiles A vector of the indices to be taken along 'time_dim' -#' for computing the thresholds between the probabilistic categories. If NULL, -#' the whole period is used. The default value is NULL. +#' for computing the absolute thresholds between the probabilistic categories. +#' If NULL, the whole period is used. The default value is NULL. #'@param weights A named numerical array of the weights for 'data' with -#' dimensions 'time_dim' and 'memb_dim' (if 'data' has it). The default value is +#' dimensions 'time_dim' and 'memb_dim' (if 'data' has them). The default value is #' NULL. The ensemble should have at least 70 members or span at least 10 time #' steps and have more than 45 members if consistency between the weighted and #' unweighted methodologies is desired. #'@param cross.val A logical indicating whether to compute the thresholds between -#' probabilistic categories in cross-validation. The default value is FALSE. +#' probabilistic categories in cross-validation mode. The default value is FALSE. #'@param ncores An integer indicating the number of cores to use for parallel #' computation. The default value is NULL. #' #'@return #'A numerical array of probabilities with dimensions c(bin, the rest dimensions -#'of 'data' except 'memb_dim'). 'bin' dimension has the length of categories, -#'i.e., \code{length(prob_thresholds) + 1}. +#'of 'data' except 'memb_dim'). 'bin' dimension has the length of probabilistic +#'categories, i.e., \code{length(prob_thresholds) + 1}. #' #'@examples #'data <- array(rnorm(2000), dim = c(ensemble = 25, sdate = 20, time = 4)) -- GitLab