From 2c9ca69f6859d7cc77fb5035f346733451b3c1c2 Mon Sep 17 00:00:00 2001
From: jcos
Date: Fri, 20 May 2022 15:59:28 +0200
Subject: [PATCH 1/5] basic workflow

---
 R/RPS.R | 84 ++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 71 insertions(+), 13 deletions(-)

diff --git a/R/RPS.R b/R/RPS.R
index 84956ac..a5d54d7 100644
--- a/R/RPS.R
+++ b/R/RPS.R
@@ -48,7 +48,7 @@
 #'@export
 RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
                 prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, Fair = FALSE,
-                ncores = NULL) {
+                ncores = NULL, weights = NULL) {
 
   # Check inputs
   ## exp and obs (1)
@@ -130,8 +130,8 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
                fun = .RPS,
                prob_thresholds = prob_thresholds,
                indices_for_clim = indices_for_clim, Fair = Fair,
-               ncores = ncores)$output1
-
+               ncores = ncores, weights = weights)$output1
+  browser()
   # Return only the mean RPS
   rps <- MeanDims(rps, time_dim, na.rm = FALSE)
 
@@ -140,12 +140,10 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
 
 .RPS <- function(exp, obs, prob_thresholds = c(1/3, 2/3),
-                 indices_for_clim = NULL, Fair = FALSE) {
+                 indices_for_clim = NULL, Fair = FALSE, weights = NULL) {
   # exp: [sdate, memb]
   # obs: [sdate, (memb)]
-
-  exp_probs <- .get_probs(data = exp, indices_for_quantiles = indices_for_clim,
-                          prob_thresholds = prob_thresholds)
+  exp_probs <- .get_probs(data = exp, indices_for_quantiles = indices_for_clim, prob_thresholds = prob_thresholds, weights = weights)
   # exp_probs: [bin, sdate]
   obs_probs <- .get_probs(data = obs, indices_for_quantiles = indices_for_clim,
                           prob_thresholds = prob_thresholds)
@@ -168,7 +166,7 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   return(rps)
 }
 
-.get_probs <- function(data, indices_for_quantiles, prob_thresholds) {
+.get_probs <- function(data, indices_for_quantiles, prob_thresholds, weights = NULL) {
   # if exp: [sdate, memb]
   # if obs: [sdate, (memb)]
 
@@ -176,18 +174,78 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   if (length(dim(data)) == 1) dim(data) <- c(dim(data), 1)
 
   # Absolute thresholds
-  quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE)
-
+  if (is.null(weights)){
+    quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE)
+  } else {
+    # linear approach to compute weighted distribution thresholds.
+    weights <- s2dv::Reorder(data = weights, order = names(dim(data))) # move to check or in .rps *******
+    sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], weights[indices_for_quantiles, ])
+    sorted_data <- sorted_arrays$data
+    sorted_weights <- sorted_arrays$weights
+    # approach 1 (interpolate from discrete cumulative weights distribution. NOT type 8)
+    # function vvvvvvvvvvvvvvvvvvv
+    # cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
+    # cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
+    # cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
+    # # function ^^^^^^^^^^^^^^^^^^^
+    # quantiles <- interp1(cumulative_weights, sorted_data, probs, "cubic") # library(pracma)
+    # approach 2 (interpolate from the polynomial fit of the cumulative weights distribution)
+    order <- 5
+    fit_model <- lm(sorted_data ~ poly(cumsum(sorted_weights), order, raw=TRUE)) # order 10, but we should think what order we want to use
+    quantiles <- prob_thresholds*0 # make a list of zeros the length of prob_thresholds
+    for (i in 1:(order+1)) {quantiles <- quantiles+fit_model$coefficients[i]*prob_thresholds^(i-1)}
+  }
 
   # Probabilities
   probs <- array(dim = c(bin = length(quantiles) + 1, dim(data)[1])) # [bin, sdate]
   for (i_time in 1:dim(data)[1]) {
-    if (anyNA(data[i_time, ])) {
-      probs[, i_time] <- rep(NA, dim = length(quantiles) + 1)
+    if (is.null(weights)){
+      if (anyNA(data[i_time, ])) {
+        probs[, i_time] <- rep(NA, dim = length(quantiles) + 1)
+      } else {
+        probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
+      }
     } else {
-      probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
+      sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
+      sorted_data <- sorted_arrays$data
+      sorted_weights <- sorted_arrays$weights
+      # approach 1. interpolate
+      cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
+      cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
+      cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
+      # Solve issues for quantiles outside the sorted_data range:
+      xtra_length <- 10
+      if (min(quantiles) < min(sorted_data)){
+        xtra <- seq(min(quantiles), min(sorted_data), length.out=xtra_length)
+        sorted_data <- c(xtra, sorted_data)
+        cumulative_weights <- c(numeric(xtra_length), cumulative_weights)
+      }
+      if (max(quantiles) > max(sorted_data)){
+        xtra <- seq(max(sorted_data), max(quantiles), length.out=xtra_length)
+        sorted_data <- c(sorted_data,xtra)
+        cumulative_weights <- c(cumulative_weights,numeric(xtra_length))
+      }
+      integrated_probs <- interp1(sorted_data, cumulative_weights, quantiles, "cubic") # library(pracma)
+      probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
+      if (min(probs[, i_time])<0) {print(paste0("ERROR negative probability in i_time=",i_time))} ######################## checks
+      if (max(probs[, i_time])>1) {print(paste0("ERROR >1 probability in i_time=",i_time))} ######################## checks
+
+      # approach 2. might give erroneous results for quantiles that fall outside the data range. Try normal distro
+      # order <- 5
+      # fit_model <- lm(cumsum(sorted_weights) ~ poly(sorted_data, order, raw=TRUE)) # order 10, but we should think what order we want to use
+      # integrated_probs <- quantiles*0
+      # for (i in 1:(order+1)) {integrated_probs <- integrated_probs+fit_model$coefficients[i]*quantiles^(i-1)}
+      # probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
     }
+
   }
   return(probs)
 }
+
+.sorted_distributions <- function(data_vector, weights_vector){
+  weights_vector <- as.vector(weights_vector)
+  data_vector <- as.vector(data_vector)
+  weights_vector <- weights_vector / sum(weights_vector) # normalize to 1
+  sorter <- order(data_vector)
+  return(list("data" = data_vector[sorter], "weights" = weights_vector[sorter]))
+}
\ No newline at end of file
-- 
GitLab
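
Stepping back from the diff: the weighted branch above turns the ensemble into an empirical CDF by sorting the members, assigning each one a midpoint cumulative weight, and pinning that curve to [0, 1]; the thresholds are then read off the curve. A minimal standalone sketch of the construction (the name weighted_quantiles and the toy data are illustrative, not package code; plain linear interpolation stands in for the interp1() and polynomial variants tried in the commit):

# Midpoint cumulative weights, rescaled to [0, 1], then inverted at the
# requested probabilities to obtain the weighted thresholds.
weighted_quantiles <- function(x, w, probs) {
  sorter <- order(x)
  x <- x[sorter]
  w <- w[sorter] / sum(w)                        # normalize weights to 1
  cw <- cumsum(w) - 0.5 * w                      # midpoint cumulative weights
  cw <- (cw - cw[1]) / (cw[length(cw)] - cw[1])  # pin the curve to 0 and 1
  approx(cw, x, probs, method = "linear")$y      # invert the weighted CDF
}

set.seed(1)
members <- rnorm(50)
w <- abs(rnorm(50))
weighted_quantiles(members, w, probs = c(1/3, 2/3))

With equal weights the midpoint spacing reduces to ordinary plotting positions, so the result approaches quantile(members, c(1/3, 2/3)) as the ensemble grows.
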
From 7536e289fd004435716c600611bafa05273ae37b Mon Sep 17 00:00:00 2001
From: jcos
Date: Thu, 26 May 2022 12:34:59 +0200
Subject: [PATCH 2/5] switch to approx() and use it in the quantiles and probs

---
 R/RPS.R | 105 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 56 insertions(+), 49 deletions(-)

diff --git a/R/RPS.R b/R/RPS.R
index a5d54d7..e323936 100644
--- a/R/RPS.R
+++ b/R/RPS.R
@@ -131,7 +131,7 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
                prob_thresholds = prob_thresholds,
                indices_for_clim = indices_for_clim, Fair = Fair,
                ncores = ncores, weights = weights)$output1
-  browser()
+  # browser()
   # Return only the mean RPS
   rps <- MeanDims(rps, time_dim, na.rm = FALSE)
 
@@ -178,67 +178,70 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
     quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE)
   } else {
     # linear approach to compute weighted distribution thresholds.
+    # browser()
+    #weights<-as.array(weights)
     weights <- s2dv::Reorder(data = weights, order = names(dim(data))) # move to check or in .rps *******
+    # is there a better way of doing this: vvvvvvvvvv
     sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], weights[indices_for_quantiles, ])
     sorted_data <- sorted_arrays$data
-    sorted_weights <- sorted_arrays$weights
-    # approach 1 (interpolate from discrete cumulative weights distribution. NOT type 8)
-    # function vvvvvvvvvvvvvvvvvvv
-    # cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
-    # cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
-    # cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
-    # # function ^^^^^^^^^^^^^^^^^^^
-    # quantiles <- interp1(cumulative_weights, sorted_data, probs, "cubic") # library(pracma)
+    cumulative_weights <- sorted_arrays$cumulative_weights
+    quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y # library(pracma)
+
     # approach 2 (interpolate from the polynomial fit of the cumulative weights distribution)
-    order <- 5
-    fit_model <- lm(sorted_data ~ poly(cumsum(sorted_weights), order, raw=TRUE)) # order 10, but we should think what order we want to use
-    quantiles <- prob_thresholds*0 # make a list of zeros the length of prob_thresholds
-    for (i in 1:(order+1)) {quantiles <- quantiles+fit_model$coefficients[i]*prob_thresholds^(i-1)}
+    # order <- 5 # should it be fixed? should it be provided? should it be computed according to the number of points available?
+    # fit_model <- lm(sorted_data ~ poly(cumsum(sorted_weights), order, raw=TRUE))
+    # quantiles <- numeric(length(prob_thresholds)) # make a list of zeros the length of prob_thresholds
+    # for (i in 1:(order+1)) {quantiles <- quantiles+fit_model$coefficients[i]*prob_thresholds^(i-1)}
   }
 
   # Probabilities
+  # browser()
   probs <- array(dim = c(bin = length(quantiles) + 1, dim(data)[1])) # [bin, sdate]
   for (i_time in 1:dim(data)[1]) {
-    if (is.null(weights)){
-      if (anyNA(data[i_time, ])) {
+    if (anyNA(data[i_time, ])) {
       probs[, i_time] <- rep(NA, dim = length(quantiles) + 1)
-      } else {
-        probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
-      }
     } else {
-      sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
-      sorted_data <- sorted_arrays$data
-      sorted_weights <- sorted_arrays$weights
-      # approach 1. interpolate
-      cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
-      cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
-      cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
-      # Solve issues for quantiles outside the sorted_data range:
-      xtra_length <- 10
-      if (min(quantiles) < min(sorted_data)){
-        xtra <- seq(min(quantiles), min(sorted_data), length.out=xtra_length)
-        sorted_data <- c(xtra, sorted_data)
-        cumulative_weights <- c(numeric(xtra_length), cumulative_weights)
-      }
-      if (max(quantiles) > max(sorted_data)){
-        xtra <- seq(max(sorted_data), max(quantiles), length.out=xtra_length)
-        sorted_data <- c(sorted_data,xtra)
-        cumulative_weights <- c(cumulative_weights,numeric(xtra_length))
+      if (is.null(weights)){
+        probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
+        print("............probs...................")
+        print(probs[, i_time])
+        print("...............................")
+      } else {
+        # function vvvv
+        sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
+        sorted_data <- sorted_arrays$data
+        cumulative_weights <- sorted_arrays$cumulative_weights
+        # find any quantiles that are outside the data range
+        integrated_probs <- array(dim = c(bin=length(quantiles)))
+        for (i_quant in 1:length(quantiles)) {
+          if (quantiles[i_quant] < min(sorted_data)) {
+            integrated_probs[i_quant] = 0 # append
+          } else if (max(sorted_data) < quantiles[i_quant]) {
+            integrated_probs[i_quant] = 1 # append
+          } else {
+            integrated_probs[i_quant] = approx(sorted_data, cumulative_weights, quantiles[i_quant], "linear")$y
+          }
+        }
+        probs[, i_time] <- append(integrated_probs, 1) - append(0, integrated_probs)
+        # print("..........quantiles.........")
+        # print(quantiles)
+        # print("..........integrated probs.........")
+        # print(integrated_probs)
+        print("..........probs.........")
+        print(probs[, i_time])
+        print("..............................")
+        print("..............................")
+        if (min(probs[, i_time])<0) {print(paste0("ERROR negative probability in i_time=",i_time))} ######################## checks
+        if (max(probs[, i_time])>1) {print(paste0("ERROR >1 probability in i_time=",i_time))} ######################## checks
+        # approach 2. might give erroneous results for quantiles that fall outside the data range. Try normal distro
+        # order <- 5
+        # fit_model <- lm(cumsum(sorted_weights) ~ poly(sorted_data, order, raw=TRUE)) # order 10, but we should think what order we want to use
+        # integrated_probs <- quantiles*0
+        # for (i in 1:(order+1)) {integrated_probs <- integrated_probs+fit_model$coefficients[i]*quantiles^(i-1)}
+        # probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
       }
-      integrated_probs <- interp1(sorted_data, cumulative_weights, quantiles, "cubic")
-      probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
-      if (min(probs[, i_time])<0) {print(paste0("ERROR negative probability in i_time=",i_time))} ######################## checks
-      if (max(probs[, i_time])>1) {print(paste0("ERROR >1 probability in i_time=",i_time))} ######################## checks
-
-      # approach 2. might give erroneous results for quantiles that fall outside the data range. Try normal distro
-      # order <- 5
-      # fit_model <- lm(cumsum(sorted_weights) ~ poly(sorted_data, order, raw=TRUE)) # order 10, but we should think what order we want to use
-      # integrated_probs <- quantiles*0
-      # for (i in 1:(order+1)) {integrated_probs <- integrated_probs+fit_model$coefficients[i]*quantiles^(i-1)}
-      # probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
     }
   }
-
   return(probs)
 }
 
@@ -247,5 +250,9 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   data_vector <- as.vector(data_vector)
   weights_vector <- weights_vector / sum(weights_vector) # normalize to 1
   sorter <- order(data_vector)
-  return(list("data" = data_vector[sorter], "weights" = weights_vector[sorter]))
+  sorted_weights <- weights_vector[sorter]
+  cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
+  cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
+  cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
+  return(list("data" = data_vector[sorter], "cumulative_weights" = cumulative_weights))
 }
\ No newline at end of file
-- 
GitLab
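
The reworked loop now does three things per start date: evaluate the cumulative-weight curve at each threshold, clamp thresholds that fall outside the ensemble range to 0 or 1, and difference the resulting CDF values into bin probabilities. A condensed sketch of that logic (bin_probs is an illustrative name; the package keeps this inline in .get_probs):

# CDF value at each threshold, clamped outside the data range, then
# differenced into the probabilities of the length(thresholds) + 1 bins.
bin_probs <- function(x, w, thresholds) {
  sorter <- order(x)
  x <- x[sorter]
  w <- w[sorter] / sum(w)
  cw <- cumsum(w) - 0.5 * w
  cw <- (cw - cw[1]) / (cw[length(cw)] - cw[1])
  cdf <- sapply(thresholds, function(q) {
    if (q < min(x)) 0       # threshold below every member
    else if (q > max(x)) 1  # threshold above every member
    else approx(x, cw, q, method = "linear")$y
  })
  diff(c(0, cdf, 1))        # same as append(cdf, 1) - append(0, cdf)
}

set.seed(2)
bin_probs(rnorm(10), abs(rnorm(10)), thresholds = c(-0.43, 0.43))

Because the clamped CDF values are non-decreasing for sorted thresholds, the differenced bin probabilities are non-negative and sum to 1, which is exactly what the ERROR print statements above are guarding.
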
From 281ff94a074846ace6d65a5ab2c1c06d1b54858d Mon Sep 17 00:00:00 2001
From: jcos
Date: Wed, 8 Jun 2022 16:39:54 +0200
Subject: [PATCH 3/5] cleanup

---
 R/RPS.R | 39 ++++++---------------------------------
 1 file changed, 6 insertions(+), 33 deletions(-)

diff --git a/R/RPS.R b/R/RPS.R
index e323936..e0e19b4 100644
--- a/R/RPS.R
+++ b/R/RPS.R
@@ -122,7 +122,6 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   } else {
     target_dims_obs <- c(time_dim, memb_dim)
   }
-
   rps <- Apply(data = list(exp = exp, obs = obs),
               target_dims = list(exp = c(time_dim, memb_dim),
                                  obs = target_dims_obs),
@@ -177,24 +176,13 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   if (is.null(weights)){
     quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE)
   } else {
-    # linear approach to compute weighted distribution thresholds.
-    # browser()
-    #weights<-as.array(weights)
     weights <- s2dv::Reorder(data = weights, order = names(dim(data))) # move to check or in .rps *******
-    # is there a better way of doing this: vvvvvvvvvv
     sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], weights[indices_for_quantiles, ])
     sorted_data <- sorted_arrays$data
     cumulative_weights <- sorted_arrays$cumulative_weights
-    quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y # library(pracma)
-
-    # approach 2 (interpolate from the polynomial fit of the cumulative weights distribution)
-    # order <- 5 # should it be fixed? should it be provided? should it be computed according to the number of points available?
-    # fit_model <- lm(sorted_data ~ poly(cumsum(sorted_weights), order, raw=TRUE))
-    # quantiles <- numeric(length(prob_thresholds)) # make a list of zeros the length of prob_thresholds
-    # for (i in 1:(order+1)) {quantiles <- quantiles+fit_model$coefficients[i]*prob_thresholds^(i-1)}
+    quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y
   }
 
   # Probabilities
-  # browser()
   probs <- array(dim = c(bin = length(quantiles) + 1, dim(data)[1])) # [bin, sdate]
   for (i_time in 1:dim(data)[1]) {
     if (anyNA(data[i_time, ])) {
@@ -202,42 +190,27 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
     } else {
       if (is.null(weights)){
         probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
-        print("............probs...................")
-        print(probs[, i_time])
-        print("...............................")
       } else {
-        # function vvvv
         sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
         sorted_data <- sorted_arrays$data
         cumulative_weights <- sorted_arrays$cumulative_weights
         # find any quantiles that are outside the data range
         integrated_probs <- array(dim = c(bin=length(quantiles)))
         for (i_quant in 1:length(quantiles)) {
+          # for thresholds falling under the distribution
           if (quantiles[i_quant] < min(sorted_data)) {
-            integrated_probs[i_quant] = 0 # append
+            integrated_probs[i_quant] = 0
+          # for thresholds falling over the distribution
           } else if (max(sorted_data) < quantiles[i_quant]) {
-            integrated_probs[i_quant] = 1 # append
+            integrated_probs[i_quant] = 1
           } else {
             integrated_probs[i_quant] = approx(sorted_data, cumulative_weights, quantiles[i_quant], "linear")$y
           }
         }
         probs[, i_time] <- append(integrated_probs, 1) - append(0, integrated_probs)
-        # print("..........quantiles.........")
-        # print(quantiles)
-        # print("..........integrated probs.........")
-        # print(integrated_probs)
-        print("..........probs.........")
-        print(probs[, i_time])
-        print("..............................")
-        print("..............................")
+        # This shouldn't happen; unsure if this check is needed then
         if (min(probs[, i_time])<0) {print(paste0("ERROR negative probability in i_time=",i_time))} ######################## checks
         if (max(probs[, i_time])>1) {print(paste0("ERROR >1 probability in i_time=",i_time))} ######################## checks
-        # approach 2. might give erroneous results for quantiles that fall outside the data range. Try normal distro
-        # order <- 5
-        # fit_model <- lm(cumsum(sorted_weights) ~ poly(sorted_data, order, raw=TRUE)) # order 10, but we should think what order we want to use
-        # integrated_probs <- quantiles*0
-        # for (i in 1:(order+1)) {integrated_probs <- integrated_probs+fit_model$coefficients[i]*quantiles^(i-1)}
-        # probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
       }
     }
-- 
GitLab
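
With the debug output gone, one quick way to check the weighted path is to hand it equal weights, where it should land close to the unweighted convert2prob() frequencies (counting and interpolation are different estimators, so small differences remain). A rough consistency check, reusing bin_probs() from the sketch after patch 2:

set.seed(3)
members <- rnorm(100)
quantiles <- quantile(members, c(1/3, 2/3), type = 8)
# unweighted path, as used in .get_probs()
unweighted <- colMeans(easyVerification::convert2prob(members, threshold = quantiles))
# weighted path with uniform weights
weighted <- bin_probs(members, rep(1, 100), quantiles)
rbind(unweighted, weighted)  # the two rows should differ only slightly
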
From f4c5f17a6969c9fa40879cf4146f3ba0a02b5c4f Mon Sep 17 00:00:00 2001
From: aho
Date: Fri, 10 Jun 2022 17:28:52 +0200
Subject: [PATCH 4/5] Add 'weights' documentation

---
 R/RPS.R                    | 53 ++++++++++++++++++++++++--------------
 R/RPSS.R                   | 26 +++++++++++++++----
 man/RPS.Rd                 |  5 ++++
 man/RPSS.Rd                |  5 ++++
 tests/testthat/test-RPS.R  | 21 ++++++++++++++-
 tests/testthat/test-RPSS.R | 21 ++++++++++++++-
 6 files changed, 105 insertions(+), 26 deletions(-)

diff --git a/R/RPS.R b/R/RPS.R
index e0e19b4..00d0c10 100644
--- a/R/RPS.R
+++ b/R/RPS.R
@@ -28,6 +28,9 @@
 #'@param Fair A logical indicating whether to compute the FairRPSS (the
 #'  potential RPSS that the forecast would have with an infinite ensemble size).
 #'  The default value is FALSE.
+#'@param weights A named two-dimensional numerical array of the weights for each
+#'  member and time. The dimension names should include 'memb_dim' and
+#'  'time_dim'. The default value is NULL.
 #'@param ncores An integer indicating the number of cores to use for parallel
 #'  computation. The default value is NULL.
 #'
@@ -47,8 +50,8 @@
 #'@importFrom easyVerification convert2prob
 #'@export
 RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
-                prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, Fair = FALSE,
-                ncores = NULL, weights = NULL) {
+                prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, Fair = FALSE,
+                weights = NULL, ncores = NULL) {
 
   # Check inputs
   ## exp and obs (1)
@@ -106,6 +109,18 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   if (!is.logical(Fair) | length(Fair) > 1) {
     stop("Parameter 'Fair' must be either TRUE or FALSE.")
   }
+  ## weights
+  if (!is.null(weights)) {
+    if (!is.array(weights) | !is.numeric(weights))
+      stop('Parameter "weights" must be a two-dimensional numeric array.')
+    if (length(dim(weights)) != 2 | any(!names(dim(weights)) %in% c(memb_dim, time_dim)))
+      stop("Parameter 'weights' must have two dimensions with the names of memb_dim and time_dim.")
+    if (dim(weights)[memb_dim] != dim(exp)[memb_dim] |
+        dim(weights)[time_dim] != dim(exp)[time_dim]) {
+      stop("Parameter 'weights' must have the same dimension lengths as memb_dim and time_dim in 'exp'.")
+    }
+    weights <- Reorder(weights, c(time_dim, memb_dim))
+  }
   ## ncores
   if (!is.null(ncores)) {
     if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 |
@@ -129,7 +144,7 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
                fun = .RPS,
                prob_thresholds = prob_thresholds,
                indices_for_clim = indices_for_clim, Fair = Fair,
-               ncores = ncores, weights = weights)$output1
+               weights = weights, ncores = ncores)$output1
   # browser()
   # Return only the mean RPS
   rps <- MeanDims(rps, time_dim, na.rm = FALSE)
@@ -145,7 +160,7 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   exp_probs <- .get_probs(data = exp, indices_for_quantiles = indices_for_clim, prob_thresholds = prob_thresholds, weights = weights)
   # exp_probs: [bin, sdate]
   obs_probs <- .get_probs(data = obs, indices_for_quantiles = indices_for_clim,
-                          prob_thresholds = prob_thresholds)
+                          prob_thresholds = prob_thresholds, weights = NULL)
   # obs_probs: [bin, sdate]
 
   probs_exp_cumsum <- apply(exp_probs, 2, cumsum)
@@ -173,52 +188,52 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   if (length(dim(data)) == 1) dim(data) <- c(dim(data), 1)
 
   # Absolute thresholds
-  if (is.null(weights)){
+  if (is.null(weights)) {
     quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE)
   } else {
-    weights <- s2dv::Reorder(data = weights, order = names(dim(data))) # move to check or in .rps *******
+    # weights: [sdate, memb]
     sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], weights[indices_for_quantiles, ])
     sorted_data <- sorted_arrays$data
     cumulative_weights <- sorted_arrays$cumulative_weights
     quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y
   }
+
   # Probabilities
   probs <- array(dim = c(bin = length(quantiles) + 1, dim(data)[1])) # [bin, sdate]
   for (i_time in 1:dim(data)[1]) {
     if (anyNA(data[i_time, ])) {
-    probs[, i_time] <- rep(NA, dim = length(quantiles) + 1)
+      probs[, i_time] <- rep(NA, dim = length(quantiles) + 1)
     } else {
-      if (is.null(weights)){
-      probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
+      if (is.null(weights)) {
+        probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
       } else {
         sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
         sorted_data <- sorted_arrays$data
         cumulative_weights <- sorted_arrays$cumulative_weights
         # find any quantiles that are outside the data range
-        integrated_probs <- array(dim = c(bin=length(quantiles)))
+        integrated_probs <- array(dim = c(bin = length(quantiles)))
         for (i_quant in 1:length(quantiles)) {
           # for thresholds falling under the distribution
           if (quantiles[i_quant] < min(sorted_data)) {
-            integrated_probs[i_quant] = 0
+            integrated_probs[i_quant] <- 0
           # for thresholds falling over the distribution
           } else if (max(sorted_data) < quantiles[i_quant]) {
-            integrated_probs[i_quant] = 1
+            integrated_probs[i_quant] <- 1
           } else {
-            integrated_probs[i_quant] = approx(sorted_data, cumulative_weights, quantiles[i_quant], "linear")$y
+            integrated_probs[i_quant] <- approx(sorted_data, cumulative_weights, quantiles[i_quant], "linear")$y
           }
         }
         probs[, i_time] <- append(integrated_probs, 1) - append(0, integrated_probs)
-        # This shouldn't happen; unsure if this check is needed then
-        if (min(probs[, i_time])<0) {print(paste0("ERROR negative probability in i_time=",i_time))} ######################## checks
-        if (max(probs[, i_time])>1) {print(paste0("ERROR >1 probability in i_time=",i_time))} ######################## checks
+        if (min(probs[, i_time]) < 0 | max(probs[, i_time]) > 1) {
+          stop(paste0("Probability in i_time = ", i_time, " is out of [0, 1]."))
+        }
       }
     }
-  }
   return(probs)
 }
 
-.sorted_distributions <- function(data_vector, weights_vector){
+.sorted_distributions <- function(data_vector, weights_vector) {
   weights_vector <- as.vector(weights_vector)
   data_vector <- as.vector(data_vector)
   weights_vector <- weights_vector / sum(weights_vector) # normalize to 1
   sorter <- order(data_vector)
   sorted_weights <- weights_vector[sorter]
   cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
   cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
   cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
   return(list("data" = data_vector[sorter], "cumulative_weights" = cumulative_weights))
-}
\ No newline at end of file
+}
diff --git a/R/RPSS.R b/R/RPSS.R
index 04c3137..c4e5744 100644
--- a/R/RPSS.R
+++ b/R/RPSS.R
@@ -34,6 +34,9 @@
 #'@param Fair A logical indicating whether to compute the FairRPSS (the
 #'  potential RPSS that the forecast would have with an infinite ensemble size).
 #'  The default value is FALSE.
+#'@param weights A named two-dimensional numerical array of the weights for each
+#'  member and time. The dimension names should include 'memb_dim' and
+#'  'time_dim'. The default value is NULL.
 #'@param ncores An integer indicating the number of cores to use for parallel
 #'  computation. The default value is NULL.
 #'
@@ -62,7 +65,7 @@
 #'@export
 RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member',
                  prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, Fair = FALSE,
-                 ncores = NULL) {
+                 weights = NULL, ncores = NULL) {
 
   # Check inputs
   ## exp, obs, and ref (1)
@@ -135,6 +138,18 @@ RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member',
   if (!is.logical(Fair) | length(Fair) > 1) {
     stop("Parameter 'Fair' must be either TRUE or FALSE.")
   }
+  ## weights
+  if (!is.null(weights)) {
+    if (!is.array(weights) | !is.numeric(weights))
+      stop('Parameter "weights" must be a two-dimensional numeric array.')
+    if (length(dim(weights)) != 2 | any(!names(dim(weights)) %in% c(memb_dim, time_dim)))
+      stop("Parameter 'weights' must have two dimensions with the names of memb_dim and time_dim.")
+    if (dim(weights)[memb_dim] != dim(exp)[memb_dim] |
+        dim(weights)[time_dim] != dim(exp)[time_dim]) {
+      stop("Parameter 'weights' must have the same dimension lengths as memb_dim and time_dim in 'exp'.")
+    }
+    weights <- Reorder(weights, c(time_dim, memb_dim))
+  }
   ## ncores
   if (!is.null(ncores)) {
     if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 |
@@ -167,25 +182,26 @@ RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member',
                   fun = .RPSS,
                   prob_thresholds = prob_thresholds,
                   indices_for_clim = indices_for_clim, Fair = Fair,
+                  weights = weights,
                   ncores = ncores)
 
   return(output)
 }
 
 .RPSS <- function(exp, obs, ref = NULL, prob_thresholds = c(1/3, 2/3),
-                  indices_for_clim = NULL, Fair = FALSE) {
+                  indices_for_clim = NULL, Fair = FALSE, weights = NULL) {
   # exp: [sdate, memb]
   # obs: [sdate, (memb)]
   # ref: [sdate, memb] or NULL
 
   # RPS of the forecast
   rps_exp <- .RPS(exp = exp, obs = obs, prob_thresholds = prob_thresholds,
-                  indices_for_clim = indices_for_clim, Fair = Fair)
+                  indices_for_clim = indices_for_clim, Fair = Fair, weights = weights)
 
   # RPS of the reference forecast
   if (is.null(ref)) { ## using climatology as reference forecast
     obs_probs <- .get_probs(data = obs, indices_for_quantiles = indices_for_clim,
-                            prob_thresholds = prob_thresholds)
+                            prob_thresholds = prob_thresholds, weights = NULL)
     # obs_probs: [bin, sdate]
 
     clim_probs <- c(prob_thresholds[1], diff(prob_thresholds),
                    1 - prob_thresholds[length(prob_thresholds)])
@@ -209,7 +225,7 @@ RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member',
   } else { # use "ref" as reference forecast
     rps_ref <- .RPS(exp = ref, obs = obs, prob_thresholds = prob_thresholds,
-                    indices_for_clim = indices_for_clim, Fair = Fair)
+                    indices_for_clim = indices_for_clim, Fair = Fair, weights = weights)
   }
 
   # RPSS
diff --git a/man/RPS.Rd b/man/RPS.Rd
index d1acc02..5062671 100644
--- a/man/RPS.Rd
+++ b/man/RPS.Rd
@@ -12,6 +12,7 @@ RPS(
   prob_thresholds = c(1/3, 2/3),
   indices_for_clim = NULL,
   Fair = FALSE,
+  weights = NULL,
   ncores = NULL
 )
 }
@@ -40,6 +41,10 @@ the whole period is used. The default value is NULL.}
 potential RPSS that the forecast would have with an infinite ensemble size).
 The default value is FALSE.}
 
+\item{weights}{A named two-dimensional numerical array of the weights for each
+member and time. The dimension names should include 'memb_dim' and
+'time_dim'. The default value is NULL.}
+
 \item{ncores}{An integer indicating the number of cores to use for parallel
 computation. The default value is NULL.}
 }
diff --git a/man/RPSS.Rd b/man/RPSS.Rd
index 6324ea2..ca33c07 100644
--- a/man/RPSS.Rd
+++ b/man/RPSS.Rd
@@ -13,6 +13,7 @@ RPSS(
   prob_thresholds = c(1/3, 2/3),
   indices_for_clim = NULL,
   Fair = FALSE,
+  weights = NULL,
   ncores = NULL
 )
 }
@@ -47,6 +48,10 @@ the whole period is used. The default value is NULL.}
 potential RPSS that the forecast would have with an infinite ensemble size).
 The default value is FALSE.}
 
+\item{weights}{A named two-dimensional numerical array of the weights for each
+member and time. The dimension names should include 'memb_dim' and
+'time_dim'. The default value is NULL.}
+
 \item{ncores}{An integer indicating the number of cores to use for parallel
 computation. The default value is NULL.}
 }
diff --git a/tests/testthat/test-RPS.R b/tests/testthat/test-RPS.R
index e5a5f0c..04df1bb 100644
--- a/tests/testthat/test-RPS.R
+++ b/tests/testthat/test-RPS.R
@@ -7,7 +7,8 @@ set.seed(1)
 exp1 <- array(rnorm(60), dim = c(member = 3, sdate = 10, lat = 2))
 set.seed(2)
 obs1 <- array(rnorm(20), dim = c(sdate = 10, lat = 2))
-
+set.seed(3)
+weights1 <- array(abs(rnorm(30)), dim = c(member = 3, sdate = 10))
 # dat2
 set.seed(1)
 exp2 <- array(rnorm(20), dim = c(member = 2, sdate = 10))
@@ -76,6 +77,19 @@ test_that("1. Input checks", {
     RPS(exp1, obs1, Fair = 1),
     "Parameter 'Fair' must be either TRUE or FALSE."
   )
+  # weights
+  expect_error(
+    RPS(exp1, obs1, weights = c(0, 1)),
+    'Parameter "weights" must be a two-dimensional numeric array.'
+  )
+  expect_error(
+    RPS(exp1, obs1, weights = array(1, dim = c(member = 3, time = 10))),
+    "Parameter 'weights' must have two dimensions with the names of memb_dim and time_dim."
+  )
+  expect_error(
+    RPS(exp1, obs1, weights = array(1, dim = c(member = 3, sdate = 1))),
+    "Parameter 'weights' must have the same dimension lengths as memb_dim and time_dim in 'exp'."
+  )
   # ncores
   expect_error(
     RPS(exp2, obs2, ncores = 1.5),
@@ -111,6 +125,11 @@ test_that("2. Output checks: dat1", {
 as.vector(RPS(exp1, obs1, prob_thresholds = seq(0.1, 0.9, 0.1))),
 c(1.600000, 1.888889),
 tolerance = 0.0001
 )
+expect_equal(
+as.vector(RPS(exp1, obs1, weights = weights1)),
+c(0.3692964, 0.5346627),
+tolerance = 0.0001
+)
 
 })
diff --git a/tests/testthat/test-RPSS.R b/tests/testthat/test-RPSS.R
index b4c0b7a..c63add0 100644
--- a/tests/testthat/test-RPSS.R
+++ b/tests/testthat/test-RPSS.R
@@ -9,7 +9,8 @@ set.seed(2)
 obs1 <- array(rnorm(20), dim = c(sdate = 10, lat = 2))
 set.seed(3)
 ref1 <- array(rnorm(40), dim = c(member = 2, sdate = 10, lat = 2))
-
+set.seed(4)
+weights1 <- array(abs(rnorm(30)), dim = c(member = 3, sdate = 10))
 # dat2
 set.seed(1)
 exp2 <- array(rnorm(20), dim = c(member = 2, sdate = 10))
@@ -87,6 +88,19 @@ test_that("1. Input checks", {
     RPSS(exp1, obs1, Fair = 1),
     "Parameter 'Fair' must be either TRUE or FALSE."
   )
+  # weights
+  expect_error(
+    RPSS(exp1, obs1, weights = c(0, 1)),
+    'Parameter "weights" must be a two-dimensional numeric array.'
+  )
+  expect_error(
+    RPSS(exp1, obs1, weights = array(1, dim = c(member = 3, time = 10))),
+    "Parameter 'weights' must have two dimensions with the names of memb_dim and time_dim."
+  )
+  expect_error(
+    RPSS(exp1, obs1, weights = array(1, dim = c(member = 3, sdate = 1))),
+    "Parameter 'weights' must have the same dimension lengths as memb_dim and time_dim in 'exp'."
+  )
   # ncores
   expect_error(
     RPSS(exp2, obs2, ncores = 1.5),
@@ -154,6 +168,11 @@ test_that("2. Output checks: dat1", {
 c(0.5259259, 0.4771242),
 tolerance = 0.0001
 )
 expect_equal(
+as.vector(RPSS(exp1, obs1, ref1, weights = weights1)$rpss),
+c(0.6596424, 0.4063579),
+tolerance = 0.0001
+)
+expect_equal(
 as.vector(RPSS(exp1, obs1, ref1)$sign),
 c(FALSE, FALSE)
 )
-- 
GitLab
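
Seen from the caller's side, the interface added in this commit matches the new tests: weights is a numeric array with named member and sdate dimensions, aligned with exp. A usage sketch built from the test data (assuming the patched package is loaded, e.g. with devtools::load_all()):

set.seed(1)
exp <- array(rnorm(60), dim = c(member = 3, sdate = 10, lat = 2))
set.seed(2)
obs <- array(rnorm(20), dim = c(sdate = 10, lat = 2))
set.seed(3)
weights <- array(abs(rnorm(30)), dim = c(member = 3, sdate = 10))
RPS(exp, obs, weights = weights)  # one mean-over-sdate RPS per remaining dimension (here: lat)
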
From 7d2c324dae7639a6a996816dc298521a74182551 Mon Sep 17 00:00:00 2001
From: jcos
Date: Thu, 16 Jun 2022 17:39:41 +0200
Subject: [PATCH 5/5] docu: add thresholds to use the W method consistently

---
 R/RPS.R  | 4 +++-
 R/RPSS.R | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/R/RPS.R b/R/RPS.R
index 00d0c10..8ded53e 100644
--- a/R/RPS.R
+++ b/R/RPS.R
@@ -30,7 +30,9 @@
 #'  The default value is FALSE.
 #'@param weights A named two-dimensional numerical array of the weights for each
 #'  member and time. The dimension names should include 'memb_dim' and
-#'  'time_dim'. The default value is NULL.
+#'  'time_dim'. The default value is NULL. The ensemble should have at least 70
+#'  members or span at least 10 time steps and have more than 45 members if
+#'  consistency between the weighted and unweighted methodologies is desired.
 #'@param ncores An integer indicating the number of cores to use for parallel
 #'  computation. The default value is NULL.
 #'
diff --git a/R/RPSS.R b/R/RPSS.R
index c4e5744..3b24777 100644
--- a/R/RPSS.R
+++ b/R/RPSS.R
@@ -36,7 +36,9 @@
 #'  The default value is FALSE.
 #'@param weights A named two-dimensional numerical array of the weights for each
 #'  member and time. The dimension names should include 'memb_dim' and
-#'  'time_dim'. The default value is NULL.
+#'  'time_dim'. The default value is NULL. The ensemble should have at least 70
+#'  members or span at least 10 time steps and have more than 45 members if
+#'  consistency between the weighted and unweighted methodologies is desired.
 #'@param ncores An integer indicating the number of cores to use for parallel
 #'  computation. The default value is NULL.
 #'
-- 
GitLab
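
The same weights array flows through RPSS: it is used for exp and, when a reference forecast is supplied, passed on to the rps_ref computation as well (see the .RPS calls in patch 4). Continuing the sketch above with the reference forecast from the tests:

set.seed(3)
ref <- array(rnorm(40), dim = c(member = 2, sdate = 10, lat = 2))
res <- RPSS(exp, obs, ref, weights = weights)
res$rpss  # skill score per lat
res$sign  # significance flag per lat
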