From e90115e54c3a894a4422144b3353307d9ca6202b Mon Sep 17 00:00:00 2001
From: aho <an.ho@bsc.es>
Date: Tue, 28 Mar 2023 17:34:26 +0200
Subject: [PATCH 1/4] Create GetProbs()

---
 NAMESPACE                      |   1 +
 R/GetProbs.R                   | 251 ++++++++++++++++++++++++++++++++
 R/ROCSS.R                      |   8 +-
 R/RPS.R                        |   4 +-
 R/RPSS.R                       |   2 +-
 R/Utils.R                      |  86 -----------
 man/GetProbs.Rd                |  67 +++++++++
 tests/testthat/test-GetProbs.R | 259 +++++++++++++++++++++++++++++++++
 8 files changed, 585 insertions(+), 93 deletions(-)
 create mode 100644 R/GetProbs.R
 create mode 100644 man/GetProbs.Rd
 create mode 100644 tests/testthat/test-GetProbs.R

diff --git a/NAMESPACE b/NAMESPACE
index 6224a15..d74cc97 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -36,6 +36,7 @@ export(EuroAtlanticTC)
 export(Filter)
 export(GMST)
 export(GSAT)
+export(GetProbs)
 export(Histo2Hindcast)
 export(InsertDim)
 export(LeapYear)
diff --git a/R/GetProbs.R b/R/GetProbs.R
new file mode 100644
index 0000000..1f7651d
--- /dev/null
+++ b/R/GetProbs.R
@@ -0,0 +1,251 @@
+#'Compute forecast probability
+#'
+#'Compute the forecast probability based on the relative thresholds. A certain 
+#'period can be specified to calculate the quantiles between each probabilistic
+#'category. If data has ensemble, all the members are used together to 
+#'calculate the probabilities. Weights of each member and time can be provided.
+#'Cross-validation can be chosen when absolute threshold is calculated.
+#'
+#'@param data A named numerical array of the forecast or observation with at 
+#'  least time dimension.
+#'@param time_dim A character string indicating the name of the time dimension.
+#'  The default value is 'sdate'.
+#'@param memb_dim A character string indicating the name of the member dimension
+#'  to compute the probabilities of the forecast. The default value is 'member'.
+#'@param dat_dim A character string indicating the name of dataset dimension. 
+#'  The default value is NULL, which means no dataset dimension.
+#'@param prob_thresholds A numeric vector of the relative thresholds (from 0 to
+#'  1) between the categories. The default value is c(1/3, 2/3), which 
+#'  corresponds to tercile equiprobable categories.
+#'@param indices_for_quantiles A vector of the indices to be taken along 'time_dim' 
+#'  for computing the thresholds between the probabilistic categories. If NULL,
+#'  the whole period is used. The default value is NULL.
+#'@param weights A named numerical array of the weights for 'data' with 
+#'  dimensions 'time_dim' and 'memb_dim' (if 'data' has it). The default value is 
+#'  NULL. The ensemble should have at least 70 members or span at least 10 time 
+#'  steps and have more than 45 members if consistency between the weighted and
+#'  unweighted methodologies is desired.
+#'@param cross.val A logical indicating whether to compute the thresholds between 
+#'  probabilistic categories in cross-validation. The default value is FALSE.
+#'@param ncores An integer indicating the number of cores to use for parallel 
+#'  computation. The default value is NULL.
+#'
+#'@return 
+#'A numerical array of probabilities with dimensions c(bin, the rest dimensions
+#'of 'data' except 'memb_dim'). 'bin' dimension has the length of categories, 
+#'i.e., \code{length(prob_thresholds) + 1}.
+#'
+#'@examples
+#'data <- array(rnorm(2000), dim = c(ensemble = 25, sdate = 20, time = 4))
+#'res <- GetProbs(data = data, time_dim = 'sdate', memb_dim = 'ensemble', indices_for_quantiles = 4:17)
+#'
+#'@import multiApply
+#'@importFrom easyVerification convert2prob
+#'@export
+GetProbs <- function(data, time_dim = 'sdate', memb_dim = 'member', indices_for_quantiles = NULL, 
+                     prob_thresholds = c(1/3, 2/3), weights = NULL, cross.val = FALSE, ncores = NULL) {
+
+  # Check inputs
+  ## data
+  if (is.null(data)) {
+    stop("Parameter 'data' cannot be NULL.")
+  }
+  if (!is.numeric(data)) {
+    stop("Parameter 'data' must be a numeric array.")
+  }
+  if (any(is.null(names(dim(data)))) | any(nchar(names(dim(data))) == 0)) {
+    stop("Parameter 'data' must have dimension names.")
+  }
+  ## time_dim
+  if (!is.character(time_dim) | length(time_dim) != 1)
+    stop('Parameter "time_dim" must be a character string.')
+  if (!time_dim %in% names(dim(data))) {
+    stop("Parameter 'time_dim' is not found in 'data' dimensions.")
+  }
+  ## memb_dim
+  if (!is.null(memb_dim)) {
+    if (!is.character(memb_dim) | length(memb_dim) > 1) {
+      stop("Parameter 'memb_dim' must be a character string.")
+    }
+    if (!memb_dim %in% names(dim(data))) {
+      stop("Parameter 'memb_dim' is not found in 'data' dimensions. If no member ",
+           "dimension exists, set it as NULL.")
+    }
+  }
+  ## prob_thresholds
+  if (!is.numeric(prob_thresholds) | !is.vector(prob_thresholds) |
+      any(prob_thresholds <= 0) | any(prob_thresholds >= 1)) {
+    stop("Parameter 'prob_thresholds' must be a numeric vector between 0 and 1.")
+  }
+  ## indices_for_quantiles
+  if (is.null(indices_for_quantiles)) {
+    indices_for_quantiles <- 1:dim(data)[time_dim]
+  } else {
+    if (!is.numeric(indices_for_quantiles) | !is.vector(indices_for_quantiles)) {
+      stop("Parameter 'indices_for_quantiles' must be NULL or a numeric vector.")
+    } else if (length(indices_for_quantiles) > dim(data)[time_dim] |
+               max(indices_for_quantiles) > dim(data)[time_dim] |
+               any(indices_for_quantiles) < 1) {
+      stop("Parameter 'indices_for_quantiles' should be the indices of 'time_dim'.")
+    }
+  }
+  ## weights
+  if (!is.null(weights)) {
+    if (!is.array(weights) | !is.numeric(weights))
+      stop("Parameter 'weights' must be a named numeric array.")
+
+#    if (is.null(dat_dim)) {
+      if (!is.null(memb_dim)) {
+        lendim_weights <- 2
+        namesdim_weights <- c(time_dim, memb_dim)
+      } else {
+        lendim_weights <- 1
+        namesdim_weights <- c(time_dim)
+      }
+      if (length(dim(weights)) != lendim_weights | 
+          any(!names(dim(weights)) %in% namesdim_weights)) {
+        stop(paste0("Parameter 'weights' must have dimension ", 
+                    paste0(namesdim_weights, collapse = ' and '), "."))
+      }
+      if (any(dim(weights)[namesdim_weights] != dim(data)[namesdim_weights])) {
+        stop(paste0("Parameter 'weights' must have the same dimension length as ", 
+                    paste0(namesdim_weights, collapse = ' and '), " dimension in 'data'."))
+      }
+      weights <- Reorder(weights, namesdim_weights)
+
+#    } else {
+#      if (length(dim(weights)) != 3 | any(!names(dim(weights)) %in% c(memb_dim, time_dim, dat_dim)))
+#        stop("Parameter 'weights' must have three dimensions with the names of 'memb_dim', 'time_dim' and 'dat_dim'.")
+#      if (dim(weights)[memb_dim] != dim(exp)[memb_dim] |
+#          dim(weights)[time_dim] != dim(exp)[time_dim] |
+#          dim(weights)[dat_dim] != dim(exp)[dat_dim]) {
+#        stop(paste0("Parameter 'weights' must have the same dimension lengths ", 
+#                    "as 'memb_dim', 'time_dim' and 'dat_dim' in 'exp'."))
+#      }
+#      weights <- Reorder(weights, c(time_dim, memb_dim, dat_dim))
+#    }
+  }
+  ## cross.val
+  if (!is.logical(cross.val)  | length(cross.val) > 1) {
+    stop("Parameter 'cross.val' must be either TRUE or FALSE.")
+  }
+  ## ncores
+  if (!is.null(ncores)) {
+    if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 |
+      length(ncores) > 1) {
+      stop("Parameter 'ncores' must be either NULL or a positive integer.")
+    }
+  }
+
+  ###############################
+
+  res <- Apply(data = list(data = data), 
+               target_dims = c(time_dim, memb_dim), #, dat_dim), 
+               output_dims = c("bin", time_dim),
+               fun = .GetProbs,
+#               dat_dim = dat_dim, 
+               time_dim = time_dim,
+               memb_dim = memb_dim,
+               prob_thresholds = prob_thresholds, 
+               indices_for_quantiles = indices_for_quantiles, 
+               weights = weights, cross.val = cross.val, ncores = ncores)$output1
+ 
+  return(res)
+}
+
+.GetProbs <- function(data, time_dim = 'sdate', memb_dim = 'member', indices_for_quantiles = NULL,
+                      prob_thresholds = c(1/3, 2/3), weights = NULL, cross.val = FALSE) {
+  # .GetProbs() is used in RPS, RPSS, ROCSS
+  # data
+  ## if data is exp: [sdate, memb]
+  ## if data is obs: [sdate, (memb)]
+  # weights: [sdate, (memb)], same as data
+
+  # Add dim [memb = 1] to data if it doesn't have memb_dim
+  if (length(dim(data)) == 1) {
+    dim(data) <- c(dim(data), 1)
+    if (!is.null(weights)) dim(weights) <- c(dim(weights), 1)
+  }
+  # Absolute thresholds
+  if (cross.val) {
+    quantiles <- array(NA, dim = c(bin = length(prob_thresholds), sdate = dim(data)[1]))
+    for (i_time in 1:dim(data)[1]) {
+      if (is.null(weights)) {
+        quantiles[, i_time] <- quantile(x = as.vector(data[indices_for_quantiles[which(indices_for_quantiles != i_time)], ]),
+                                        probs = prob_thresholds, type = 8, na.rm = TRUE)
+      } else {
+        # weights: [sdate, memb]
+        sorted_arrays <- .sorted_distributions(data[indices_for_quantiles[which(indices_for_quantiles != i_time)], ], 
+                                               weights[indices_for_quantiles[which(indices_for_quantiles != i_time)], ])
+        sorted_data <- sorted_arrays$data
+        cumulative_weights <- sorted_arrays$cumulative_weights
+        quantiles[, i_time] <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y
+      }
+    }
+
+  } else {
+    if (is.null(weights)) {
+      quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), 
+                            probs = prob_thresholds, type = 8, na.rm = TRUE)
+    } else {
+      # weights: [sdate, memb]
+      sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], 
+                                             weights[indices_for_quantiles, ])
+      sorted_data <- sorted_arrays$data
+      cumulative_weights <- sorted_arrays$cumulative_weights
+      quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y
+    }
+    quantiles <- array(rep(quantiles, dim(data)[1]), dim = c(bin = length(quantiles), dim(data)[1]))
+  }
+  # quantiles: [bin-1, sdate]
+
+  # Probabilities
+  probs <- array(dim = c(dim(quantiles)[1] + 1, dim(data)[1])) # [bin, sdate]
+  for (i_time in 1:dim(data)[1]) {
+    if (anyNA(data[i_time, ])) {
+      probs[, i_time] <- rep(NA, dim = dim(quantiles)[1] + 1)
+    } else {
+      if (is.null(weights)) {
+        probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], 
+                                                                   threshold = quantiles[, i_time]))
+      } else {
+        sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
+        sorted_data <- sorted_arrays$data
+        cumulative_weights <- sorted_arrays$cumulative_weights
+        # find any quantiles that are outside the data range
+        integrated_probs <- array(dim = dim(quantiles))
+        for (i_quant in 1:dim(quantiles)[1]) {
+          # for thresholds falling under the distribution
+          if (quantiles[i_quant, i_time] < min(sorted_data)) {
+            integrated_probs[i_quant, i_time] <- 0 
+          # for thresholds falling over the distribution
+          } else if (max(sorted_data) < quantiles[i_quant, i_time]) {
+            integrated_probs[i_quant, i_time] <- 1
+          } else {
+            integrated_probs[i_quant, i_time] <- approx(sorted_data, cumulative_weights, 
+                                                        quantiles[i_quant, i_time], "linear")$y
+          }
+        }
+        probs[, i_time] <- append(integrated_probs[, i_time], 1) - append(0, integrated_probs[, i_time])
+        if (min(probs[, i_time]) < 0 | max(probs[, i_time]) > 1) {
+          stop(paste0("Probability in i_time = ", i_time, " is out of [0, 1]."))
+        } 
+      }
+    }
+  }
+
+  return(probs)
+}
+
+.sorted_distributions <- function(data_vector, weights_vector) {
+  weights_vector <- as.vector(weights_vector)
+  data_vector <- as.vector(data_vector)
+  weights_vector <- weights_vector / sum(weights_vector) # normalize to 1
+  sorter <- order(data_vector)
+  sorted_weights <- weights_vector[sorter]
+  cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
+  cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
+  cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
+  return(list("data" = data_vector[sorter], "cumulative_weights" = cumulative_weights))
+}
+
diff --git a/R/ROCSS.R b/R/ROCSS.R
index 7831a88..1e9f5e2 100644
--- a/R/ROCSS.R
+++ b/R/ROCSS.R
@@ -262,13 +262,13 @@ ROCSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member',
   for (exp_i in 1:nexp) {
     for (obs_i in 1:nobs) {
  
-      # Input dim for .get_probs
+      # Input dim for .GetProbs
       ## if exp: [sdate, memb]
       ## if obs: [sdate, (memb)]
-      exp_probs <- .get_probs(ClimProjDiags::Subset(exp, dat_dim, exp_i, drop = 'selected'), 
+      exp_probs <- .GetProbs(ClimProjDiags::Subset(exp, dat_dim, exp_i, drop = 'selected'), 
                               indices_for_quantiles = indices_for_clim, 
                               prob_thresholds = prob_thresholds, cross.val = cross.val)
-      obs_probs <- .get_probs(data = ClimProjDiags::Subset(obs, dat_dim, obs_i, drop = 'selected'), 
+      obs_probs <- .GetProbs(data = ClimProjDiags::Subset(obs, dat_dim, obs_i, drop = 'selected'), 
                               indices_for_quantiles = indices_for_clim, 
                               prob_thresholds = prob_thresholds, cross.val = cross.val)
       ## exp_probs and obs_probs: [bin, sdate]
@@ -278,7 +278,7 @@ ROCSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member',
                                                  obs = Reorder(obs_probs, c(time_dim, 'bin')))[1:ncats])
     
       if (!is.null(ref)) {
-        ref_probs <- .get_probs(ClimProjDiags::Subset(ref, dat_dim, exp_i, drop = 'selected'),
+        ref_probs <- .GetProbs(ClimProjDiags::Subset(ref, dat_dim, exp_i, drop = 'selected'),
                                 indices_for_quantiles = indices_for_clim, 
                                 prob_thresholds = prob_thresholds, cross.val = cross.val)
         rocs_ref[exp_i, obs_i, ] <- unlist(EnsRoca(ens = Reorder(ref_probs, c(time_dim, 'bin')),
diff --git a/R/RPS.R b/R/RPS.R
index 32d88a4..a12f6a5 100644
--- a/R/RPS.R
+++ b/R/RPS.R
@@ -239,10 +239,10 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', dat_dim = NUL
         weights_data <- weights
       }
 
-      exp_probs <- .get_probs(data = exp_data, indices_for_quantiles = indices_for_clim, 
+      exp_probs <- .GetProbs(data = exp_data, indices_for_quantiles = indices_for_clim, 
                               prob_thresholds = prob_thresholds, weights = weights_data, cross.val = cross.val)
       # exp_probs: [bin, sdate]
-      obs_probs <- .get_probs(data = obs_data, indices_for_quantiles = indices_for_clim, 
+      obs_probs <- .GetProbs(data = obs_data, indices_for_quantiles = indices_for_clim, 
                               prob_thresholds = prob_thresholds, weights = NULL, cross.val = cross.val)
       # obs_probs: [bin, sdate]
       probs_exp_cumsum <- apply(exp_probs, 2, cumsum)
diff --git a/R/RPSS.R b/R/RPSS.R
index efd7950..16d038f 100644
--- a/R/RPSS.R
+++ b/R/RPSS.R
@@ -351,7 +351,7 @@ RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member',
       obs_data <- obs[ , , j]
       if (is.null(dim(obs_data))) dim(obs_data) <- c(dim(obs)[1:2])
       # obs_probs: [bin, sdate] 
-      obs_probs <- .get_probs(data = obs_data, indices_for_quantiles = indices_for_clim, 
+      obs_probs <- .GetProbs(data = obs_data, indices_for_quantiles = indices_for_clim, 
                               prob_thresholds = prob_thresholds, weights = NULL, cross.val = cross.val)
       # clim_probs: [bin, sdate]
       clim_probs <- c(prob_thresholds[1], diff(prob_thresholds), 1 - prob_thresholds[length(prob_thresholds)])
diff --git a/R/Utils.R b/R/Utils.R
index 8770af9..cb6eb34 100644
--- a/R/Utils.R
+++ b/R/Utils.R
@@ -1809,89 +1809,3 @@ GradientCatsColorBar <- function(nmap, brks = NULL, cols = NULL, vertical = TRUE
 
 }
 
-.get_probs <- function(data, indices_for_quantiles, prob_thresholds, weights = NULL, cross.val = FALSE) {
-  # if exp: [sdate, memb]
-  # if obs: [sdate, (memb)]
-
-  # Add dim [memb = 1] to obs if it doesn't have memb_dim
-  if (length(dim(data)) == 1) dim(data) <- c(dim(data), 1)
-  
-  # Absolute thresholds
-  if (cross.val) {
-    quantiles <- array(NA, dim = c(bin = length(prob_thresholds), sdate = dim(data)[1]))
-    for (i in 1:dim(data)[1]) {
-      if (is.null(weights)) {
-        quantiles[,i] <- quantile(x = as.vector(data[indices_for_quantiles[which(indices_for_quantiles != i)], ]),
-                              probs = prob_thresholds, type = 8, na.rm = TRUE)
-      } else {
-        # weights: [sdate, memb]
-        sorted_arrays <- .sorted_distributions(data[indices_for_quantiles[which(indices_for_quantiles != i)], ], 
-                                               weights[indices_for_quantiles[which(indices_for_quantiles != i)], ])
-        sorted_data <- sorted_arrays$data
-        cumulative_weights <- sorted_arrays$cumulative_weights
-        quantiles[,i] <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y
-      }
-    }
-  } else {
-    if (is.null(weights)) {
-      quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE)
-    } else {
-      # weights: [sdate, memb]
-      sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], weights[indices_for_quantiles, ])
-      sorted_data <- sorted_arrays$data
-      cumulative_weights <- sorted_arrays$cumulative_weights
-      quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y
-    }
-    quantiles <- array(rep(quantiles, dim(data)[1]),dim = c(bin = length(quantiles), dim(data)[1]))
-  }
-  
-  # quantiles: [bin-1, sdate]
-  # Probabilities
-  probs <- array(dim = c(dim(quantiles)[1] + 1, dim(data)[1])) # [bin, sdate]
-  for (i_time in 1:dim(data)[1]) {
-    if (anyNA(data[i_time, ])) {
-      probs[, i_time] <- rep(NA, dim = dim(quantiles)[1] + 1)
-    } else {
-      if (is.null(weights)) {
-        probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], 
-                                                                   threshold = quantiles[,i_time]))
-      } else {
-        sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
-        sorted_data <- sorted_arrays$data
-        cumulative_weights <- sorted_arrays$cumulative_weights
-        # find any quantiles that are outside the data range
-        integrated_probs <- array(dim = dim(quantiles))
-        for (i_quant in 1:dim(quantiles)[1]) {
-          # for thresholds falling under the distribution
-          if (quantiles[i_quant, i_time] < min(sorted_data)) {
-            integrated_probs[i_quant, i_time] <- 0 
-          # for thresholds falling over the distribution
-          } else if (max(sorted_data) < quantiles[i_quant, i_time]) {
-            integrated_probs[i_quant, i_time] <- 1
-          } else {
-            integrated_probs[i_quant, i_time] <- approx(sorted_data, cumulative_weights, quantiles[i_quant, i_time], 
-                                                "linear")$y
-          }
-        }
-        probs[, i_time] <- append(integrated_probs[,i_time], 1) - append(0, integrated_probs[,i_time])
-        if (min(probs[, i_time]) < 0 | max(probs[, i_time]) > 1) {
-          stop(paste0("Probability in i_time = ", i_time, " is out of [0, 1]."))
-        } 
-      }
-    }
-  }
-  return(probs)
-}
-
-.sorted_distributions <- function(data_vector, weights_vector) {
-  weights_vector <- as.vector(weights_vector)
-  data_vector <- as.vector(data_vector)
-  weights_vector <- weights_vector / sum(weights_vector) # normalize to 1
-  sorter <- order(data_vector)
-  sorted_weights <- weights_vector[sorter]
-  cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
-  cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
-  cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
-  return(list("data" = data_vector[sorter], "cumulative_weights" = cumulative_weights))
-}
-
diff --git a/man/GetProbs.Rd b/man/GetProbs.Rd
new file mode 100644
index 0000000..c967a2e
--- /dev/null
+++ b/man/GetProbs.Rd
@@ -0,0 +1,67 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/GetProbs.R
+\name{GetProbs}
+\alias{GetProbs}
+\title{Compute forecast probability}
+\usage{
+GetProbs(
+  data,
+  time_dim = "sdate",
+  memb_dim = "member",
+  indices_for_quantiles = NULL,
+  prob_thresholds = c(1/3, 2/3),
+  weights = NULL,
+  cross.val = FALSE,
+  ncores = NULL
+)
+}
+\arguments{
+\item{data}{A named numerical array of the forecast or observation with at 
+least time dimension.}
+
+\item{time_dim}{A character string indicating the name of the time dimension.
+The default value is 'sdate'.}
+
+\item{memb_dim}{A character string indicating the name of the member dimension
+to compute the probabilities of the forecast. The default value is 'member'.}
+
+\item{indices_for_quantiles}{A vector of the indices to be taken along 'time_dim' 
+for computing the thresholds between the probabilistic categories. If NULL,
+the whole period is used. The default value is NULL.}
+
+\item{prob_thresholds}{A numeric vector of the relative thresholds (from 0 to
+1) between the categories. The default value is c(1/3, 2/3), which 
+corresponds to tercile equiprobable categories.}
+
+\item{weights}{A named numerical array of the weights for 'data' with 
+dimensions 'time_dim' and 'memb_dim' (if 'data' has it). The default value is 
+NULL. The ensemble should have at least 70 members or span at least 10 time 
+steps and have more than 45 members if consistency between the weighted and
+unweighted methodologies is desired.}
+
+\item{cross.val}{A logical indicating whether to compute the thresholds between 
+probabilistic categories in cross-validation. The default value is FALSE.}
+
+\item{ncores}{An integer indicating the number of cores to use for parallel 
+computation. The default value is NULL.}
+
+\item{dat_dim}{A character string indicating the name of dataset dimension. 
+The default value is NULL, which means no dataset dimension.}
+}
+\value{
+A numerical array of probabilities with dimensions c(bin, the rest dimensions
+of 'data' except 'memb_dim'). 'bin' dimension has the length of categories, 
+i.e., \code{length(prob_thresholds) + 1}.
+}
+\description{
+Compute the forecast probability based on the relative thresholds. A certain 
+period can be specified to calculate the quantiles between each probabilistic
+category. If data has ensemble, all the members are used together to 
+calculate the probabilities. Weights of each member and time can be provided.
+Cross-validation can be chosen when absolute threshold is calculated.
+}
+\examples{
+data <- array(rnorm(2000), dim = c(ensemble = 25, sdate = 20, time = 4))
+res <- GetProbs(data = data, time_dim = 'sdate', memb_dim = 'ensemble', indices_for_quantiles = 4:17)
+
+}
diff --git a/tests/testthat/test-GetProbs.R b/tests/testthat/test-GetProbs.R
new file mode 100644
index 0000000..252dd29
--- /dev/null
+++ b/tests/testthat/test-GetProbs.R
@@ -0,0 +1,259 @@
+context("s2dv::GetProbs tests")
+
+##############################################
+
+# dat1
+set.seed(1)
+data1 <- array(rnorm(60), dim = c(member = 3, sdate = 10, time = 2))
+set.seed(2)
+weights1 <- array(abs(rnorm(30)), dim = c(member = 3, sdate = 10))
+
+# dat2
+set.seed(1)
+data2 <- array(rnorm(20), dim = c(sdate = 10, time = 2))
+set.seed(2)
+weights2 <- array(abs(rnorm(10)), dim = c(sdate = 10))
+
+##############################################
+
+test_that("1. Input checks", {
+  # data
+  expect_error(
+  GetProbs(c()),
+  "Parameter 'data' cannot be NULL."
+  )
+  expect_error(
+  GetProbs(c(data2)),
+  "Parameter 'data' must have dimension names."
+  )
+  # time_dim
+  expect_error(
+  GetProbs(data1, time_dim = 1),
+  'Parameter "time_dim" must be a character string.'
+  )
+  expect_error(
+  GetProbs(data1, time_dim = 'ftime'),
+  "Parameter 'time_dim' is not found in 'data' dimensions."
+  )
+  # memb_dim
+  expect_error(
+  GetProbs(data1, memb_dim = TRUE),
+  "Parameter 'memb_dim' must be a character string."
+  )
+  expect_error(
+  GetProbs(data1, memb_dim = 'memb'),
+  "Parameter 'memb_dim' is not found in 'data' dimensions."
+  )
+  # prob_thresholds
+  expect_error(
+  GetProbs(data1, prob_thresholds = 1),
+  "Parameter 'prob_thresholds' must be a numeric vector between 0 and 1."
+  )
+  # indices_for_clim
+  expect_error(
+  GetProbs(data1, indices_for_quantiles = array(1:6, dim = c(2, 3))),
+  "Parameter 'indices_for_quantiles' must be NULL or a numeric vector."
+  )
+  expect_error(
+  GetProbs(data1, indices_for_quantiles = 3:11),
+  "Parameter 'indices_for_quantiles' should be the indices of 'time_dim'."
+  )
+  # cross.val
+  expect_error(
+  GetProbs(data1, cross.val = 1),
+  "Parameter 'cross.val' must be either TRUE or FALSE."
+  )
+  # weights
+  expect_error(
+  GetProbs(data1, weights = c(0, 1)),
+  "Parameter 'weights' must be a named numeric array."
+  )
+  expect_error(
+  GetProbs(data1, weights = array(1, dim = c(member = 3, time = 10))),
+  "Parameter 'weights' must have dimension sdate and member."
+  )
+  expect_error(
+  GetProbs(data1, weights = array(1, dim = c(member = 3, sdate = 1))),
+  "Parameter 'weights' must have the same dimension length as sdate and member dimension in 'data'."
+  )
+#  expect_error(
+#  GetProbs(data3, weights = array(1, dim = c(member = 3, time = 10, dataset = 3)), dat_dim = 'dataset'),
+#  "Parameter 'weights' must have three dimensions with the names of 'memb_dim', 'time_dim' and 'dat_dim'."
+#  )
+  # ncores
+  expect_error(
+  GetProbs(data1, ncores = 1.5),
+  "Parameter 'ncores' must be either NULL or a positive integer."
+  )
+
+})
+
+##############################################
+test_that("2. Output checks: dat1", {
+
+expect_equal(
+dim(GetProbs(data1)),
+c(bin = 3, sdate = 10, time = 2)
+)
+expect_equal(
+c(GetProbs(data1)[, 10, 2]),
+c(0.3333333, 0.3333333, 0.3333333),
+tolerance = 0.0001
+)
+expect_equal(
+c(GetProbs(data1)[, 2, 2]),
+c(0.6666667, 0.3333333, 0.0000000),
+tolerance = 0.0001
+)
+
+# indices_for_quantiles
+expect_equal(
+dim(GetProbs(data1, indices_for_quantiles = 4:7)),
+c(bin = 3, sdate = 10, time = 2)
+)
+expect_equal(
+c(GetProbs(data1, indices_for_quantiles = 4:7)[, 10, 2]),
+c(0.3333333, 0.6666667, 0.0000000),
+tolerance = 0.0001
+)
+
+# prob_thresholds
+expect_equal(
+dim(GetProbs(data1, prob_thresholds = c(0.25, 0.5, 0.75))), 
+c(bin = 4, sdate = 10, time = 2)
+)
+expect_equal(
+c(GetProbs(data1, prob_thresholds = c(0.25, 0.5, 0.75))[, 10, 2]),
+c(0.3333333, 0.3333333, 0.3333333, 0.0000000),
+tolerance = 0.0001
+)
+expect_equal(
+c(GetProbs(data1, prob_thresholds = c(0.25, 0.5, 0.75))[, 3, 2]),
+c(0.0000000, 0.6666667, 0.0000000, 0.3333333),
+tolerance = 0.0001
+)
+
+# weights
+expect_equal(
+dim(GetProbs(data1, weights = weights1)),
+c(bin = 3, sdate = 10, time = 2)
+)
+expect_equal(
+c(GetProbs(data1, weights = weights1)[, 10, 2]),
+c(0.3327220, 0.5296149, 0.1376631),
+tolerance = 0.0001
+)
+expect_equal(
+sum(c(GetProbs(data1, weights = weights1))),
+20
+)
+
+# cross.val
+expect_equal(
+dim(GetProbs(data1, cross.val = T)),
+c(bin = 3, sdate = 10, time = 2)
+)
+expect_equal(
+c(GetProbs(data1, cross.val = T)[, 10, 2]),
+c(0.3333333, 0.3333333, 0.3333333),
+tolerance = 0.0001
+)
+expect_equal(
+c(GetProbs(data1, cross.val = T)[, 4, 2]),
+c(0.0000000, 0.6666667, 0.3333333),
+tolerance = 0.0001
+)
+
+# cross.val + weights
+expect_equal(
+dim(GetProbs(data1, cross.val = T, weights = weights1)),
+c(bin = 3, sdate = 10, time = 2)
+)
+expect_equal(
+c(GetProbs(data1, cross.val = T, weights = weights1)[, 10, 2]),
+c(0.3335612, 0.5277459, 0.1386929),
+tolerance = 0.0001
+)
+
+})
+
+
+##############################################
+test_that("3. Output checks: dat2", {
+
+expect_equal(
+dim(GetProbs(data2, memb_dim = NULL)),
+c(bin = 3, sdate = 10, time = 2)
+)
+expect_equal(
+c(GetProbs(data2, memb_dim = NULL)[, 10, 2]),
+c(0, 1, 0)
+)
+expect_equal(
+unique(c(GetProbs(data2, memb_dim = NULL))),
+c(1, 0)
+)
+
+# indices_for_quantiles
+expect_equal(
+dim(GetProbs(data2, memb_dim = NULL, indices_for_quantiles = 4:7)),
+c(bin = 3, sdate = 10, time = 2)
+)
+expect_equal(
+c(GetProbs(data2, memb_dim = NULL, indices_for_quantiles = 4:7)[, 10, 2]),
+c(0, 0, 1)
+)
+
+# prob_thresholds
+expect_equal(
+dim(GetProbs(data2, memb_dim = NULL, prob_thresholds = c(0.25, 0.5, 0.75))),
+c(bin = 4, sdate = 10, time = 2)
+)
+expect_equal(
+c(GetProbs(data2, memb_dim = NULL, prob_thresholds = c(0.25, 0.5, 0.75))[, 10, 2]),
+c(0, 0, 1, 0)
+)
+expect_equal(
+c(GetProbs(data2, memb_dim = NULL, prob_thresholds = c(0.25, 0.5, 0.75))[, 3, 2]),
+c(1, 0, 0, 0)
+)
+
+# weights
+expect_equal(
+dim(GetProbs(data2, memb_dim = NULL, weights = weights2)),
+c(bin = 3, sdate = 10, time = 2)
+)
+expect_equal(
+c(GetProbs(data2, memb_dim = NULL, weights = weights2)[, 10, 2]),
+c(0, 1, 0)
+)
+expect_equal(
+sum(c(GetProbs(data2, memb_dim = NULL, weights = weights2))),
+20
+)
+
+# cross.val
+expect_equal(
+dim(GetProbs(data2, memb_dim = NULL, cross.val = T)),
+c(bin = 3, sdate = 10, time = 2)
+)
+expect_equal(
+c(GetProbs(data2, memb_dim = NULL, cross.val = T)[, 10, 2]),
+c(0, 1, 0)
+)
+expect_equal(
+c(GetProbs(data2, memb_dim = NULL, cross.val = T)[, 4, 2]),
+c(1, 0, 0)
+)
+
+# cross.val + weights
+expect_equal(
+dim(GetProbs(data2, memb_dim = NULL, cross.val = T, weights = weights2)),
+c(bin = 3, sdate = 10, time = 2)
+)
+expect_equal(
+c(GetProbs(data2, memb_dim = NULL, cross.val = T, weights = weights2)[, 10, 2]),
+c(0, 1, 0)
+)
+
+})
-- 
GitLab


From f49d8a1102cb1d1609ccafd79dad9933b7c75189 Mon Sep 17 00:00:00 2001
From: aho <an.ho@bsc.es>
Date: Tue, 28 Mar 2023 17:35:01 +0200
Subject: [PATCH 2/4] Don't ignore unit tests in pipeline

---
 .Rbuildignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.Rbuildignore b/.Rbuildignore
index 6008b57..4212858 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -10,7 +10,7 @@ README\.md$
 vignettes
 .gitlab-ci.yml
 # unit tests should be ignored when building the package for CRAN
-^tests$
+#^tests$
 # CDO is not in windbuilder, so we can test the unit tests by winbuilder
 # but test-CDORemap.R and test-Load.R needs to be hidden
 #tests/testthat/test-CDORemap.R
-- 
GitLab


From 92d5776f89b56508a08980dd59ed0740d5bd91fc Mon Sep 17 00:00:00 2001
From: Carlos Delgado <carlos.delgado@bsc.es>
Date: Wed, 29 Mar 2023 09:24:40 +0200
Subject: [PATCH 3/4] corrected indices_for_quantiles check to avoid warning

---
 R/GetProbs.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/GetProbs.R b/R/GetProbs.R
index 1f7651d..351654c 100644
--- a/R/GetProbs.R
+++ b/R/GetProbs.R
@@ -85,7 +85,7 @@ GetProbs <- function(data, time_dim = 'sdate', memb_dim = 'member', indices_for_
       stop("Parameter 'indices_for_quantiles' must be NULL or a numeric vector.")
     } else if (length(indices_for_quantiles) > dim(data)[time_dim] |
                max(indices_for_quantiles) > dim(data)[time_dim] |
-               any(indices_for_quantiles) < 1) {
+               any(indices_for_quantiles < 1)) {
       stop("Parameter 'indices_for_quantiles' should be the indices of 'time_dim'.")
     }
   }
-- 
GitLab


From d1c4feebe53be356b3a6bf3dfa55d3d052c05b79 Mon Sep 17 00:00:00 2001
From: Carlos Delgado <carlos.delgado@bsc.es>
Date: Wed, 29 Mar 2023 11:05:08 +0200
Subject: [PATCH 4/4] documentation

---
 R/GetProbs.R | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/R/GetProbs.R b/R/GetProbs.R
index 351654c..6a4dcfe 100644
--- a/R/GetProbs.R
+++ b/R/GetProbs.R
@@ -1,39 +1,45 @@
-#'Compute forecast probability
+#'Compute probabilistic forecasts or the corresponding observations
 #'
-#'Compute the forecast probability based on the relative thresholds. A certain 
-#'period can be specified to calculate the quantiles between each probabilistic
-#'category. If data has ensemble, all the members are used together to 
-#'calculate the probabilities. Weights of each member and time can be provided.
-#'Cross-validation can be chosen when absolute threshold is calculated.
+#'Compute probabilistic forecasts from an ensemble based on the relative thresholds, 
+#'or the probabilistic observations (i.e., which probabilistic category was observed). 
+#'A reference period can be specified to calculate the absolute thresholds between 
+#'each probabilistic category. The absolute thresholds can be computed in cross-validation 
+#'mode. If data is an ensemble, the probabilities are calculated as the percentage of 
+#'members that fall into each category. For observations (or forecast without member 
+#'dimension), 1 means that the event happened, while 0 indicates that the event did 
+#'not happen. Weighted probabilities can be computed if the weights are provided for 
+#'each ensemble member and time step.
 #'
-#'@param data A named numerical array of the forecast or observation with at 
-#'  least time dimension.
+#'@param data A named numerical array of the forecasts or observations with, at 
+#'  least, time dimension.
 #'@param time_dim A character string indicating the name of the time dimension.
 #'  The default value is 'sdate'.
 #'@param memb_dim A character string indicating the name of the member dimension
-#'  to compute the probabilities of the forecast. The default value is 'member'.
+#'  to compute the probabilities of the forecast, or NULL if there is no member
+#'  dimension (e.g., for observations, or for forecast with only one ensemble
+#'  member). The default value is 'member'.
 #'@param dat_dim A character string indicating the name of dataset dimension. 
 #'  The default value is NULL, which means no dataset dimension.
 #'@param prob_thresholds A numeric vector of the relative thresholds (from 0 to
 #'  1) between the categories. The default value is c(1/3, 2/3), which 
 #'  corresponds to tercile equiprobable categories.
 #'@param indices_for_quantiles A vector of the indices to be taken along 'time_dim' 
-#'  for computing the thresholds between the probabilistic categories. If NULL,
-#'  the whole period is used. The default value is NULL.
+#'  for computing the absolute thresholds between the probabilistic categories. 
+#'  If NULL, the whole period is used. The default value is NULL.
 #'@param weights A named numerical array of the weights for 'data' with 
-#'  dimensions 'time_dim' and 'memb_dim' (if 'data' has it). The default value is 
+#'  dimensions 'time_dim' and 'memb_dim' (if 'data' has them). The default value is 
 #'  NULL. The ensemble should have at least 70 members or span at least 10 time 
 #'  steps and have more than 45 members if consistency between the weighted and
 #'  unweighted methodologies is desired.
 #'@param cross.val A logical indicating whether to compute the thresholds between 
-#'  probabilistic categories in cross-validation. The default value is FALSE.
+#'  probabilistic categories in cross-validation mode. The default value is FALSE.
 #'@param ncores An integer indicating the number of cores to use for parallel 
 #'  computation. The default value is NULL.
 #'
 #'@return 
 #'A numerical array of probabilities with dimensions c(bin, the rest dimensions
-#'of 'data' except 'memb_dim'). 'bin' dimension has the length of categories, 
-#'i.e., \code{length(prob_thresholds) + 1}.
+#'of 'data' except 'memb_dim'). 'bin' dimension has the length of probabilistic 
+#'categories, i.e., \code{length(prob_thresholds) + 1}.
 #'
 #'@examples
 #'data <- array(rnorm(2000), dim = c(ensemble = 25, sdate = 20, time = 4))
-- 
GitLab