From 2c9ca69f6859d7cc77fb5035f346733451b3c1c2 Mon Sep 17 00:00:00 2001
From: jcos
Date: Fri, 20 May 2022 15:59:28 +0200
Subject: [PATCH 1/3] basic workflow

---
 R/RPS.R | 84 ++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 71 insertions(+), 13 deletions(-)

diff --git a/R/RPS.R b/R/RPS.R
index 84956ac..a5d54d7 100644
--- a/R/RPS.R
+++ b/R/RPS.R
@@ -48,7 +48,7 @@
 #'@export
 RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
                 prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, Fair = FALSE,
-                ncores = NULL) {
+                ncores = NULL, weights = NULL) {
 
   # Check inputs
   ## exp and obs (1)
@@ -130,8 +130,8 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
                fun = .RPS,
                prob_thresholds = prob_thresholds,
                indices_for_clim = indices_for_clim, Fair = Fair,
-               ncores = ncores)$output1
-
+               ncores = ncores, weights = weights)$output1
+  browser()
   # Return only the mean RPS
   rps <- MeanDims(rps, time_dim, na.rm = FALSE)
 
@@ -140,12 +140,10 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
 
 .RPS <- function(exp, obs, prob_thresholds = c(1/3, 2/3),
-                 indices_for_clim = NULL, Fair = FALSE) {
+                 indices_for_clim = NULL, Fair = FALSE, weights = NULL) {
   # exp: [sdate, memb]
   # obs: [sdate, (memb)]
-
-  exp_probs <- .get_probs(data = exp, indices_for_quantiles = indices_for_clim,
-                          prob_thresholds = prob_thresholds)
+  exp_probs <- .get_probs(data = exp, indices_for_quantiles = indices_for_clim, prob_thresholds = prob_thresholds, weights = weights)
   # exp_probs: [bin, sdate]
   obs_probs <- .get_probs(data = obs, indices_for_quantiles = indices_for_clim,
                           prob_thresholds = prob_thresholds)
@@ -168,7 +166,7 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   return(rps)
 }
 
-.get_probs <- function(data, indices_for_quantiles, prob_thresholds) {
+.get_probs <- function(data, indices_for_quantiles, prob_thresholds, weights = NULL) {
   # if exp: [sdate, memb]
   # if obs: [sdate, (memb)]
 
@@ -176,18 +174,78 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   if (length(dim(data)) == 1) dim(data) <- c(dim(data), 1)
 
   # Absolute thresholds
-  quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE)
-
+  if (is.null(weights)){
+    quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE)
+  } else {
+    # linear approach to compute weighted distribution thresholds.
+    weights <- s2dv::Reorder(data = weights, order = names(dim(data))) # move to check or in .rps *******
+    sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], weights[indices_for_quantiles, ])
+    sorted_data <- sorted_arrays$data
+    sorted_weights <- sorted_arrays$weights
+    # approach 1 (interpolate from discrete cumulative weights distribution. NOT type 8)
+    # function vvvvvvvvvvvvvvvvvvv
+    # cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
+    # cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
+    # cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
+    # # function ^^^^^^^^^^^^^^^^^^^
+    # quantiles <- interp1(cumulative_weights, sorted_data, probs, "cubic") # library(pracma)
+    # approach 2 (interpolate from the polynomial fit of the cumulative weights distribution)
+    order <- 5
+    fit_model <- lm(sorted_data ~ poly(cumsum(sorted_weights), order, raw=TRUE)) # order 10, but we should think what order we want to use
+    quantiles <- prob_thresholds*0 # make a list of zeros the length of prob_thresholds
+    for (i in 1:(order+1)) {quantiles <- quantiles+fit_model$coefficients[i]*prob_thresholds^(i-1)}
+  }
   # Probabilities
   probs <- array(dim = c(bin = length(quantiles) + 1, dim(data)[1]))  # [bin, sdate]
   for (i_time in 1:dim(data)[1]) {
-    if (anyNA(data[i_time, ])) {
-      probs[, i_time] <- rep(NA, dim = length(quantiles) + 1)
+    if (is.null(weights)){
+      if (anyNA(data[i_time, ])) {
+        probs[, i_time] <- rep(NA, dim = length(quantiles) + 1)
+      } else {
+        probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
+      }
     } else {
-      probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
+      sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
+      sorted_data <- sorted_arrays$data
+      sorted_weights <- sorted_arrays$weights
+      # approach 1. interpolate
+      cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
+      cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
+      cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
+      # Solve issues for quantiles outside the sorted_data range:
+      xtra_length <- 10
+      if (min(quantiles) < min(sorted_data)){
+        xtra <- seq(min(quantiles), min(sorted_data), length.out=xtra_length)
+        sorted_data <- c(xtra, sorted_data)
+        cumulative_weights <- c(numeric(xtra_length), cumulative_weights)
+      }
+      if (max(quantiles) > max(sorted_data)){
+        xtra <- seq(max(sorted_data), max(quantiles), length.out=xtra_length)
+        sorted_data <- c(sorted_data, xtra)
+        cumulative_weights <- c(cumulative_weights, numeric(xtra_length))
+      }
+      integrated_probs <- interp1(sorted_data, cumulative_weights, quantiles, "cubic")
+      probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
+      if (min(probs[, i_time])<0) {print(paste0("ERROR negative probability in i_time=",i_time))} ######################## checks
+      if (max(probs[, i_time])>1) {print(paste0("ERROR >1 probability in i_time=",i_time))} ######################## checks
+
+      # approach 2. might give erroneous results for quantiles that fall outside the data range. Try normal distro
+      # order <- 5
+      # fit_model <- lm(cumsum(sorted_weights) ~ poly(sorted_data, order, raw=TRUE)) # order 10, but we should think what order we want to use
+      # integrated_probs <- quantiles*0
+      # for (i in 1:(order+1)) {integrated_probs <- integrated_probs+fit_model$coefficients[i]*quantiles^(i-1)}
+      # probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
     }
   }
 
   return(probs)
 }
+
+.sorted_distributions <- function(data_vector, weights_vector){
+  weights_vector <- as.vector(weights_vector)
+  data_vector <- as.vector(data_vector)
+  weights_vector <- weights_vector / sum(weights_vector) # normalize to 1
+  sorter <- order(data_vector)
+  return(list("data" = data_vector[sorter], "weights" = weights_vector[sorter]))
+}
\ No newline at end of file
--
GitLab
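The weighted branch added in patch 1 replaces quantile(type = 8) with an empirical weighted CDF: the members are sorted, their normalized weights are accumulated at plotting positions (cumsum(w) - w/2, rescaled to span [0, 1]), and the thresholds are read off by interpolation, which is what the "NOT type 8" note records. A minimal standalone sketch of that idea, with illustrative names and linear rather than cubic interpolation (both are assumptions, not part of the patch):

# Sketch only: weighted quantiles via interpolation of the
# plotting-position cumulative weights.
weighted_quantile <- function(x, w, probs) {
  sorter <- order(x)
  x <- x[sorter]
  w <- w[sorter] / sum(w)                        # normalize weights to 1
  cw <- cumsum(w) - 0.5 * w                      # plotting positions
  cw <- (cw - cw[1]) / (cw[length(cw)] - cw[1])  # rescale so cw runs from 0 to 1
  approx(x = cw, y = x, xout = probs, method = "linear")$y
}

set.seed(1)
x <- rnorm(20)   # one start date's ensemble members
w <- runif(20)   # hypothetical member weights
weighted_quantile(x, w, c(1/3, 2/3))

With equal weights the rescaled positions reduce to (i - 1)/(n - 1), the positions used by quantile(..., type = 7), so the weighted scheme matches type 7 rather than the type 8 used in the unweighted branch.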
From 7536e289fd004435716c600611bafa05273ae37b Mon Sep 17 00:00:00 2001
From: jcos
Date: Thu, 26 May 2022 12:34:59 +0200
Subject: [PATCH 2/3] switch to approx() and use it in the quantiles and probs

---
 R/RPS.R | 105 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 56 insertions(+), 49 deletions(-)

diff --git a/R/RPS.R b/R/RPS.R
index a5d54d7..e323936 100644
--- a/R/RPS.R
+++ b/R/RPS.R
@@ -131,7 +131,7 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
                prob_thresholds = prob_thresholds,
                indices_for_clim = indices_for_clim, Fair = Fair,
                ncores = ncores, weights = weights)$output1
-  browser()
+  # browser()
   # Return only the mean RPS
   rps <- MeanDims(rps, time_dim, na.rm = FALSE)
 
@@ -178,67 +178,70 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
     quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE)
   } else {
     # linear approach to compute weighted distribution thresholds.
+    # browser()
+    #weights<-as.array(weights)
     weights <- s2dv::Reorder(data = weights, order = names(dim(data))) # move to check or in .rps *******
+    # is there a better way of doing this: vvvvvvvvvv
     sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], weights[indices_for_quantiles, ])
     sorted_data <- sorted_arrays$data
-    sorted_weights <- sorted_arrays$weights
-    # approach 1 (interpolate from discrete cumulative weights distribution. NOT type 8)
-    # function vvvvvvvvvvvvvvvvvvv
-    # cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
-    # cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
-    # cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
-    # # function ^^^^^^^^^^^^^^^^^^^
-    # quantiles <- interp1(cumulative_weights, sorted_data, probs, "cubic") # library(pracma)
+    cumulative_weights <- sorted_arrays$cumulative_weights
+    quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y # library(pracma)
+
     # approach 2 (interpolate from the polynomial fit of the cumulative weights distribution)
-    order <- 5
-    fit_model <- lm(sorted_data ~ poly(cumsum(sorted_weights), order, raw=TRUE)) # order 10, but we should think what order we want to use
-    quantiles <- prob_thresholds*0 # make a list of zeros the length of prob_thresholds
-    for (i in 1:(order+1)) {quantiles <- quantiles+fit_model$coefficients[i]*prob_thresholds^(i-1)}
+    # order <- 5 # should it be fixed? should it be provided? should it be computed according to the number of points available?
+    # fit_model <- lm(sorted_data ~ poly(cumsum(sorted_weights), order, raw=TRUE))
+    # quantiles <- numeric(length(prob_thresholds)) # make a list of zeros the length of prob_thresholds
+    # for (i in 1:(order+1)) {quantiles <- quantiles+fit_model$coefficients[i]*prob_thresholds^(i-1)}
   }
   # Probabilities
+  # browser()
   probs <- array(dim = c(bin = length(quantiles) + 1, dim(data)[1]))  # [bin, sdate]
   for (i_time in 1:dim(data)[1]) {
-    if (is.null(weights)){
-      if (anyNA(data[i_time, ])) {
+    if (anyNA(data[i_time, ])) {
       probs[, i_time] <- rep(NA, dim = length(quantiles) + 1)
-      } else {
-        probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
-      }
     } else {
-      sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
-      sorted_data <- sorted_arrays$data
-      sorted_weights <- sorted_arrays$weights
-      # approach 1. interpolate
-      cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
-      cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
-      cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
-      # Solve issues for quantiles outside the sorted_data range:
-      xtra_length <- 10
-      if (min(quantiles) < min(sorted_data)){
-        xtra <- seq(min(quantiles), min(sorted_data), length.out=xtra_length)
-        sorted_data <- c(xtra, sorted_data)
-        cumulative_weights <- c(numeric(xtra_length), cumulative_weights)
-      }
-      if (max(quantiles) > max(sorted_data)){
-        xtra <- seq(max(sorted_data), max(quantiles), length.out=xtra_length)
-        sorted_data <- c(sorted_data, xtra)
-        cumulative_weights <- c(cumulative_weights, numeric(xtra_length))
-      }
+      if (is.null(weights)){
+        probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
+        print("............probs...................")
+        print(probs[, i_time])
+        print("...............................")
+      } else {
+        # function vvvv
+        sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
+        sorted_data <- sorted_arrays$data
+        cumulative_weights <- sorted_arrays$cumulative_weights
+        # find any quantiles that are outside the data range
+        integrated_probs <- array(dim = c(bin=length(quantiles)))
+        for (i_quant in 1:length(quantiles)) {
+          if (quantiles[i_quant] < min(sorted_data)) {
+            integrated_probs[i_quant] = 0 # append
+          } else if (max(sorted_data) < quantiles[i_quant]) {
+            integrated_probs[i_quant] = 1 # append
+          } else {
+            integrated_probs[i_quant] = approx(sorted_data, cumulative_weights, quantiles[i_quant], "linear")$y
+          }
+        }
+        probs[, i_time] <- append(integrated_probs, 1) - append(0, integrated_probs)
+        # print("..........quantiles.........")
+        # print(quantiles)
+        # print("..........integrated probs.........")
+        # print(integrated_probs)
+        print("..........probs.........")
+        print(probs[, i_time])
+        print("..............................")
+        print("..............................")
+        if (min(probs[, i_time])<0) {print(paste0("ERROR negative probability in i_time=",i_time))} ######################## checks
+        if (max(probs[, i_time])>1) {print(paste0("ERROR >1 probability in i_time=",i_time))} ######################## checks
+        # approach 2. might give erroneous results for quantiles that fall outside the data range. Try normal distro
+        # order <- 5
+        # fit_model <- lm(cumsum(sorted_weights) ~ poly(sorted_data, order, raw=TRUE)) # order 10, but we should think what order we want to use
+        # integrated_probs <- quantiles*0
+        # for (i in 1:(order+1)) {integrated_probs <- integrated_probs+fit_model$coefficients[i]*quantiles^(i-1)}
+        # probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
+      }
-      integrated_probs <- interp1(sorted_data, cumulative_weights, quantiles, "cubic")
-      probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
-      if (min(probs[, i_time])<0) {print(paste0("ERROR negative probability in i_time=",i_time))} ######################## checks
-      if (max(probs[, i_time])>1) {print(paste0("ERROR >1 probability in i_time=",i_time))} ######################## checks
-
-      # approach 2. might give erroneous results for quantiles that fall outside the data range. Try normal distro
-      # order <- 5
-      # fit_model <- lm(cumsum(sorted_weights) ~ poly(sorted_data, order, raw=TRUE)) # order 10, but we should think what order we want to use
-      # integrated_probs <- quantiles*0
-      # for (i in 1:(order+1)) {integrated_probs <- integrated_probs+fit_model$coefficients[i]*quantiles^(i-1)}
-      # probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
     }
   }
-
   return(probs)
 }
@@ -247,5 +250,9 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   data_vector <- as.vector(data_vector)
   weights_vector <- weights_vector / sum(weights_vector) # normalize to 1
   sorter <- order(data_vector)
-  return(list("data" = data_vector[sorter], "weights" = weights_vector[sorter]))
+  sorted_weights <- weights_vector[sorter]
+  cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
+  cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
+  cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
+  return(list("data" = data_vector[sorter], "cumulative_weights" = cumulative_weights))
 }
\ No newline at end of file
--
GitLab
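With patch 2 the probability step evaluates the weighted CDF at each threshold via approx() (clamping to 0 below and 1 above the data range) and converts the CDF values into the three bin probabilities by differencing. A small worked example of that last step, with made-up CDF values:

# Worked example of: probs <- append(integrated_probs, 1) - append(0, integrated_probs)
F_at_thresholds <- c(0.28, 0.71)  # hypothetical weighted CDF at the 1/3 and 2/3 thresholds
bin_probs <- append(F_at_thresholds, 1) - append(0, F_at_thresholds)
bin_probs       # 0.28 0.43 0.29 -> below / normal / above category
sum(bin_probs)  # exactly 1: the sum telescopes to 1 - 0

Because the sum always telescopes to 1 and linear interpolation of a monotone cumulative_weights vector stays within [0, 1], the two ERROR print checks should never fire; they only guard against a non-monotone CDF.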
From 281ff94a074846ace6d65a5ab2c1c06d1b54858d Mon Sep 17 00:00:00 2001
From: jcos
Date: Wed, 8 Jun 2022 16:39:54 +0200
Subject: [PATCH 3/3] cleanup

---
 R/RPS.R | 39 ++++++---------------------------------
 1 file changed, 6 insertions(+), 33 deletions(-)

diff --git a/R/RPS.R b/R/RPS.R
index e323936..e0e19b4 100644
--- a/R/RPS.R
+++ b/R/RPS.R
@@ -122,7 +122,6 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   } else {
     target_dims_obs <- c(time_dim, memb_dim)
   }
-
   rps <- Apply(data = list(exp = exp, obs = obs),
                target_dims = list(exp = c(time_dim, memb_dim),
                                   obs = target_dims_obs),
@@ -177,24 +176,13 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   if (is.null(weights)){
     quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE)
   } else {
-    # linear approach to compute weighted distribution thresholds.
-    # browser()
-    #weights<-as.array(weights)
     weights <- s2dv::Reorder(data = weights, order = names(dim(data))) # move to check or in .rps *******
-    # is there a better way of doing this: vvvvvvvvvv
     sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], weights[indices_for_quantiles, ])
     sorted_data <- sorted_arrays$data
     cumulative_weights <- sorted_arrays$cumulative_weights
-    quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y # library(pracma)
-
-    # approach 2 (interpolate from the polynomial fit of the cumulative weights distribution)
-    # order <- 5 # should it be fixed? should it be provided? should it be computed according to the number of points available?
-    # fit_model <- lm(sorted_data ~ poly(cumsum(sorted_weights), order, raw=TRUE))
-    # quantiles <- numeric(length(prob_thresholds)) # make a list of zeros the length of prob_thresholds
-    # for (i in 1:(order+1)) {quantiles <- quantiles+fit_model$coefficients[i]*prob_thresholds^(i-1)}
+    quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y
   }
   # Probabilities
-  # browser()
   probs <- array(dim = c(bin = length(quantiles) + 1, dim(data)[1]))  # [bin, sdate]
   for (i_time in 1:dim(data)[1]) {
     if (anyNA(data[i_time, ])) {
@@ -202,42 +190,27 @@
       probs[, i_time] <- rep(NA, dim = length(quantiles) + 1)
     } else {
       if (is.null(weights)){
         probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
-        print("............probs...................")
-        print(probs[, i_time])
-        print("...............................")
       } else {
-        # function vvvv
         sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
         sorted_data <- sorted_arrays$data
         cumulative_weights <- sorted_arrays$cumulative_weights
         # find any quantiles that are outside the data range
         integrated_probs <- array(dim = c(bin=length(quantiles)))
         for (i_quant in 1:length(quantiles)) {
+          # for thresholds falling under the distribution
           if (quantiles[i_quant] < min(sorted_data)) {
-            integrated_probs[i_quant] = 0 # append
+            integrated_probs[i_quant] = 0
+          # for thresholds falling over the distribution
           } else if (max(sorted_data) < quantiles[i_quant]) {
-            integrated_probs[i_quant] = 1 # append
+            integrated_probs[i_quant] = 1
           } else {
             integrated_probs[i_quant] = approx(sorted_data, cumulative_weights, quantiles[i_quant], "linear")$y
          }
        }
        probs[, i_time] <- append(integrated_probs, 1) - append(0, integrated_probs)
-        # print("..........quantiles.........")
-        # print(quantiles)
-        # print("..........integrated probs.........")
-        # print(integrated_probs)
-        print("..........probs.........")
-        print(probs[, i_time])
-        print("..............................")
-        print("..............................")
+        # This should not happen, so this check may no longer be needed
         if (min(probs[, i_time])<0) {print(paste0("ERROR negative probability in i_time=",i_time))} ######################## checks
         if (max(probs[, i_time])>1) {print(paste0("ERROR >1 probability in i_time=",i_time))} ######################## checks
-        # approach 2. might give erroneous results for quantiles that fall outside the data range. Try normal distro
-        # order <- 5
-        # fit_model <- lm(cumsum(sorted_weights) ~ poly(sorted_data, order, raw=TRUE)) # order 10, but we should think what order we want to use
-        # integrated_probs <- quantiles*0
-        # for (i in 1:(order+1)) {integrated_probs <- integrated_probs+fit_model$coefficients[i]*quantiles^(i-1)}
-        # probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
       }
     }
--
GitLab
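Taken together, the series leaves RPS() with one new optional argument: per-member weights applied to the experiment's probabilities, while the observations keep the unweighted path. A hypothetical call, assuming the patched R/RPS.R is sourced and its dependencies (s2dv, multiApply, easyVerification) are installed; dimension names follow the function defaults:

# Hypothetical usage sketch, not part of the patch series.
set.seed(123)
exp <- array(rnorm(500), dim = c(sdate = 50, member = 10))
obs <- array(rnorm(50), dim = c(sdate = 50))
wts <- array(runif(500), dim = c(sdate = 50, member = 10))  # member weights
RPS(exp, obs)                 # unweighted: quantile(type = 8) + convert2prob()
RPS(exp, obs, weights = wts)  # weighted: .sorted_distributions() + approx()

As the remaining "# move to check or in .rps" note says, weights is still unvalidated: it is only Reorder()ed to match the data's dimension names, so an array with wrong names or shape fails inside .get_probs() rather than at the input checks.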