From 2c9ca69f6859d7cc77fb5035f346733451b3c1c2 Mon Sep 17 00:00:00 2001
From: jcos
Date: Fri, 20 May 2022 15:59:28 +0200
Subject: [PATCH 1/3] basic workflow

---
 R/RPS.R | 84 ++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 71 insertions(+), 13 deletions(-)

diff --git a/R/RPS.R b/R/RPS.R
index 84956ac..a5d54d7 100644
--- a/R/RPS.R
+++ b/R/RPS.R
@@ -48,7 +48,7 @@
 #'@export
 RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
                 prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, Fair = FALSE,
-                ncores = NULL) {
+                ncores = NULL, weights = NULL) {
 
   # Check inputs
   ## exp and obs (1)
@@ -130,8 +130,8 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
                fun = .RPS,
                prob_thresholds = prob_thresholds,
                indices_for_clim = indices_for_clim, Fair = Fair,
-               ncores = ncores)$output1
-
+               ncores = ncores, weights = weights)$output1
+  browser()
   # Return only the mean RPS
   rps <- MeanDims(rps, time_dim, na.rm = FALSE)
 
@@ -140,12 +140,10 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
 
 .RPS <- function(exp, obs, prob_thresholds = c(1/3, 2/3),
-                 indices_for_clim = NULL, Fair = FALSE) {
+                 indices_for_clim = NULL, Fair = FALSE, weights = NULL) {
   # exp: [sdate, memb]
   # obs: [sdate, (memb)]
-
-  exp_probs <- .get_probs(data = exp, indices_for_quantiles = indices_for_clim,
-                          prob_thresholds = prob_thresholds)
+  exp_probs <- .get_probs(data = exp, indices_for_quantiles = indices_for_clim, prob_thresholds = prob_thresholds, weights = weights)
   # exp_probs: [bin, sdate]
   obs_probs <- .get_probs(data = obs, indices_for_quantiles = indices_for_clim,
                           prob_thresholds = prob_thresholds)
@@ -168,7 +166,7 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   return(rps)
 }
 
-.get_probs <- function(data, indices_for_quantiles, prob_thresholds) {
+.get_probs <- function(data, indices_for_quantiles, prob_thresholds, weights = NULL) {
   # if exp: [sdate, memb]
   # if obs: [sdate, (memb)]
 
@@ -176,18 +174,78 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   if (length(dim(data)) == 1) dim(data) <- c(dim(data), 1)
 
   # Absolute thresholds
-  quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE)
-
+  if (is.null(weights)){
+    quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE)
+  } else {
+    # linear approach to compute weighted distribution thresholds.
+    weights <- s2dv::Reorder(data = weights, order = names(dim(data))) # move to check or in .rps *******
+    sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], weights[indices_for_quantiles, ])
+    sorted_data <- sorted_arrays$data
+    sorted_weights <- sorted_arrays$weights
+    # approach 1 (interpolate from discrete cumulative weights distribution. NOT type 8)
+    # function vvvvvvvvvvvvvvvvvvv
+    # cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
+    # cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
+    # cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
+    # # function ^^^^^^^^^^^^^^^^^^^
+    # quantiles <- interp1(cumulative_weights, sorted_data, probs, "cubic") # library(pracma)
+    # approach 2 (interpolate from the polynomial fit of the cumulative weights distribution)
+    order <- 5
+    fit_model <- lm(sorted_data ~ poly(cumsum(sorted_weights), order, raw=TRUE)) # order 10, but we should think what order we want to use
+    quantiles <- prob_thresholds*0 # make a list of zeros the length of prob_thresholds
+    for (i in 1:(order+1)) {quantiles <- quantiles+fit_model$coefficients[i]*prob_thresholds^(i-1)}
+  }
   # Probabilities
   probs <- array(dim = c(bin = length(quantiles) + 1, dim(data)[1]))  # [bin, sdate]
   for (i_time in 1:dim(data)[1]) {
-    if (anyNA(data[i_time, ])) {
-      probs[, i_time] <- rep(NA, dim = length(quantiles) + 1)
+    if (is.null(weights)){
+      if (anyNA(data[i_time, ])) {
+        probs[, i_time] <- rep(NA, dim = length(quantiles) + 1)
+      } else {
+        probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
+      }
     } else {
-      probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
+      sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
+      sorted_data <- sorted_arrays$data
+      sorted_weights <- sorted_arrays$weights
+      # approach 1. interpolate
+      cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
+      cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
+      cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
+      # Solve issues for quantiles outside the sorted_data range:
+      xtra_length <- 10
+      if (min(quantiles) < min(sorted_data)){
+        xtra <- seq(min(quantiles), min(sorted_data), length.out=xtra_length)
+        sorted_data <- c(xtra, sorted_data)
+        cumulative_weights <- c(numeric(xtra_length), cumulative_weights)
+      }
+      if (max(quantiles) > max(sorted_data)){
+        xtra <- seq(max(sorted_data), max(quantiles), length.out=xtra_length)
+        sorted_data <- c(sorted_data, xtra)
+        cumulative_weights <- c(cumulative_weights, numeric(xtra_length))
+      }
+      integrated_probs <- interp1(sorted_data, cumulative_weights, quantiles, "cubic")
+      probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
+      if (min(probs[, i_time])<0) {print(paste0("ERROR negative probability in i_time=",i_time))} ######################## checks
+      if (max(probs[, i_time])>1) {print(paste0("ERROR >1 probability in i_time=",i_time))} ######################## checks
+
+      # approach 2. might give erroneous results for quantiles that fall outside the data range. Try normal distro
+      # order <- 5
+      # fit_model <- lm(cumsum(sorted_weights) ~ poly(sorted_data, order, raw=TRUE)) # order 10, but we should think what order we want to use
+      # integrated_probs <- quantiles*0
+      # for (i in 1:(order+1)) {integrated_probs <- integrated_probs+fit_model$coefficients[i]*quantiles^(i-1)}
+      # probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
     }
   }
 
   return(probs)
 }
+
+.sorted_distributions <- function(data_vector, weights_vector){
+  weights_vector <- as.vector(weights_vector)
+  data_vector <- as.vector(data_vector)
+  weights_vector <- weights_vector / sum(weights_vector) # normalize to 1
+  sorter <- order(data_vector)
+  return(list("data" = data_vector[sorter], "weights" = weights_vector[sorter]))
+}
\ No newline at end of file
--
GitLab
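The weighted branch added in patch 1 replaces quantile(type = 8) with an empirical weighted CDF: the members are sorted, their normalized weights are accumulated at plotting positions (cumsum(w) - w/2, rescaled to span [0, 1]), and the thresholds are read off by interpolation, which is what the "NOT type 8" note records. A minimal standalone sketch of that idea, with illustrative names and linear rather than cubic interpolation (both are assumptions, not part of the patch):

# Sketch only: weighted quantiles via interpolation of the
# plotting-position cumulative weights.
weighted_quantile <- function(x, w, probs) {
  sorter <- order(x)
  x <- x[sorter]
  w <- w[sorter] / sum(w)                        # normalize weights to 1
  cw <- cumsum(w) - 0.5 * w                      # plotting positions
  cw <- (cw - cw[1]) / (cw[length(cw)] - cw[1])  # rescale so cw runs from 0 to 1
  approx(x = cw, y = x, xout = probs, method = "linear")$y
}

set.seed(1)
x <- rnorm(20)   # one start date's ensemble members
w <- runif(20)   # hypothetical member weights
weighted_quantile(x, w, c(1/3, 2/3))

With equal weights the rescaled positions reduce to (i - 1)/(n - 1), the positions used by quantile(..., type = 7), so the weighted scheme matches type 7 rather than the type 8 used in the unweighted branch.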
From 7536e289fd004435716c600611bafa05273ae37b Mon Sep 17 00:00:00 2001
From: jcos
Date: Thu, 26 May 2022 12:34:59 +0200
Subject: [PATCH 2/3] switch to approx() and use it in the quantiles and probs

---
 R/RPS.R | 105 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 56 insertions(+), 49 deletions(-)

diff --git a/R/RPS.R b/R/RPS.R
index a5d54d7..e323936 100644
--- a/R/RPS.R
+++ b/R/RPS.R
@@ -131,7 +131,7 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
                prob_thresholds = prob_thresholds,
                indices_for_clim = indices_for_clim, Fair = Fair,
                ncores = ncores, weights = weights)$output1
-  browser()
+  # browser()
   # Return only the mean RPS
   rps <- MeanDims(rps, time_dim, na.rm = FALSE)
 
@@ -178,67 +178,70 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
     quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE)
   } else {
     # linear approach to compute weighted distribution thresholds.
+    # browser()
+    #weights<-as.array(weights)
     weights <- s2dv::Reorder(data = weights, order = names(dim(data))) # move to check or in .rps *******
+    # is there a better way of doing this: vvvvvvvvvv
     sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], weights[indices_for_quantiles, ])
     sorted_data <- sorted_arrays$data
-    sorted_weights <- sorted_arrays$weights
-    # approach 1 (interpolate from discrete cumulative weights distribution. NOT type 8)
-    # function vvvvvvvvvvvvvvvvvvv
-    # cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
-    # cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
-    # cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
-    # # function ^^^^^^^^^^^^^^^^^^^
-    # quantiles <- interp1(cumulative_weights, sorted_data, probs, "cubic") # library(pracma)
+    cumulative_weights <- sorted_arrays$cumulative_weights
+    quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y # library(pracma)
+
     # approach 2 (interpolate from the polynomial fit of the cumulative weights distribution)
-    order <- 5
-    fit_model <- lm(sorted_data ~ poly(cumsum(sorted_weights), order, raw=TRUE)) # order 10, but we should think what order we want to use
-    quantiles <- prob_thresholds*0 # make a list of zeros the length of prob_thresholds
-    for (i in 1:(order+1)) {quantiles <- quantiles+fit_model$coefficients[i]*prob_thresholds^(i-1)}
+    # order <- 5 # should it be fixed? should it be provided? should it be computed according to the number of points available?
+    # fit_model <- lm(sorted_data ~ poly(cumsum(sorted_weights), order, raw=TRUE))
+    # quantiles <- numeric(length(prob_thresholds)) # make a list of zeros the length of prob_thresholds
+    # for (i in 1:(order+1)) {quantiles <- quantiles+fit_model$coefficients[i]*prob_thresholds^(i-1)}
   }
   # Probabilities
+  # browser()
   probs <- array(dim = c(bin = length(quantiles) + 1, dim(data)[1]))  # [bin, sdate]
   for (i_time in 1:dim(data)[1]) {
-    if (is.null(weights)){
-      if (anyNA(data[i_time, ])) {
+    if (anyNA(data[i_time, ])) {
       probs[, i_time] <- rep(NA, dim = length(quantiles) + 1)
-      } else {
-        probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
-      }
     } else {
-      sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
-      sorted_data <- sorted_arrays$data
-      sorted_weights <- sorted_arrays$weights
-      # approach 1. interpolate
-      cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
-      cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
-      cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
-      # Solve issues for quantiles outside the sorted_data range:
-      xtra_length <- 10
-      if (min(quantiles) < min(sorted_data)){
-        xtra <- seq(min(quantiles), min(sorted_data), length.out=xtra_length)
-        sorted_data <- c(xtra, sorted_data)
-        cumulative_weights <- c(numeric(xtra_length), cumulative_weights)
-      }
-      if (max(quantiles) > max(sorted_data)){
-        xtra <- seq(max(sorted_data), max(quantiles), length.out=xtra_length)
-        sorted_data <- c(sorted_data, xtra)
-        cumulative_weights <- c(cumulative_weights, numeric(xtra_length))
-      }
+      if (is.null(weights)){
+        probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
+        print("............probs...................")
+        print(probs[, i_time])
+        print("...............................")
+      } else {
+        # function vvvv
+        sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
+        sorted_data <- sorted_arrays$data
+        cumulative_weights <- sorted_arrays$cumulative_weights
+        # find any quantiles that are outside the data range
+        integrated_probs <- array(dim = c(bin=length(quantiles)))
+        for (i_quant in 1:length(quantiles)) {
+          if (quantiles[i_quant] < min(sorted_data)) {
+            integrated_probs[i_quant] = 0 # append
+          } else if (max(sorted_data) < quantiles[i_quant]) {
+            integrated_probs[i_quant] = 1 # append
+          } else {
+            integrated_probs[i_quant] = approx(sorted_data, cumulative_weights, quantiles[i_quant], "linear")$y
+          }
+        }
+        probs[, i_time] <- append(integrated_probs, 1) - append(0, integrated_probs)
+        # print("..........quantiles.........")
+        # print(quantiles)
+        # print("..........integrated probs.........")
+        # print(integrated_probs)
+        print("..........probs.........")
+        print(probs[, i_time])
+        print("..............................")
+        print("..............................")
+        if (min(probs[, i_time])<0) {print(paste0("ERROR negative probability in i_time=",i_time))} ######################## checks
+        if (max(probs[, i_time])>1) {print(paste0("ERROR >1 probability in i_time=",i_time))} ######################## checks
+        # approach 2. might give erroneous results for quantiles that fall outside the data range. Try normal distro
+        # order <- 5
+        # fit_model <- lm(cumsum(sorted_weights) ~ poly(sorted_data, order, raw=TRUE)) # order 10, but we should think what order we want to use
+        # integrated_probs <- quantiles*0
+        # for (i in 1:(order+1)) {integrated_probs <- integrated_probs+fit_model$coefficients[i]*quantiles^(i-1)}
+        # probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
+      }
-      integrated_probs <- interp1(sorted_data, cumulative_weights, quantiles, "cubic")
-      probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
-      if (min(probs[, i_time])<0) {print(paste0("ERROR negative probability in i_time=",i_time))} ######################## checks
-      if (max(probs[, i_time])>1) {print(paste0("ERROR >1 probability in i_time=",i_time))} ######################## checks
-
-      # approach 2. might give erroneous results for quantiles that fall outside the data range. Try normal distro
-      # order <- 5
-      # fit_model <- lm(cumsum(sorted_weights) ~ poly(sorted_data, order, raw=TRUE)) # order 10, but we should think what order we want to use
-      # integrated_probs <- quantiles*0
-      # for (i in 1:(order+1)) {integrated_probs <- integrated_probs+fit_model$coefficients[i]*quantiles^(i-1)}
-      # probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
     }
   }
-
   return(probs)
 }
@@ -247,5 +250,9 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   data_vector <- as.vector(data_vector)
   weights_vector <- weights_vector / sum(weights_vector) # normalize to 1
   sorter <- order(data_vector)
-  return(list("data" = data_vector[sorter], "weights" = weights_vector[sorter]))
+  sorted_weights <- weights_vector[sorter]
+  cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights
+  cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0
+  cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1
+  return(list("data" = data_vector[sorter], "cumulative_weights" = cumulative_weights))
 }
\ No newline at end of file
--
GitLab
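With patch 2 the probability step evaluates the weighted CDF at each threshold via approx() (clamping to 0 below and 1 above the data range) and converts the CDF values into the three bin probabilities by differencing. A small worked example of that last step, with made-up CDF values:

# Worked example of: probs <- append(integrated_probs, 1) - append(0, integrated_probs)
F_at_thresholds <- c(0.28, 0.71)  # hypothetical weighted CDF at the 1/3 and 2/3 thresholds
bin_probs <- append(F_at_thresholds, 1) - append(0, F_at_thresholds)
bin_probs       # 0.28 0.43 0.29 -> below / normal / above category
sum(bin_probs)  # exactly 1: the sum telescopes to 1 - 0

Because the sum always telescopes to 1 and linear interpolation of a monotone cumulative_weights vector stays within [0, 1], the two ERROR print checks should never fire; they only guard against a non-monotone CDF.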
From 281ff94a074846ace6d65a5ab2c1c06d1b54858d Mon Sep 17 00:00:00 2001
From: jcos
Date: Wed, 8 Jun 2022 16:39:54 +0200
Subject: [PATCH 3/3] cleanup

---
 R/RPS.R | 39 ++++++---------------------------------
 1 file changed, 6 insertions(+), 33 deletions(-)

diff --git a/R/RPS.R b/R/RPS.R
index e323936..e0e19b4 100644
--- a/R/RPS.R
+++ b/R/RPS.R
@@ -122,7 +122,6 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   } else {
     target_dims_obs <- c(time_dim, memb_dim)
   }
-
   rps <- Apply(data = list(exp = exp, obs = obs),
                target_dims = list(exp = c(time_dim, memb_dim),
                                   obs = target_dims_obs),
@@ -177,24 +176,13 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member',
   if (is.null(weights)){
     quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE)
   } else {
-    # linear approach to compute weighted distribution thresholds.
-    # browser()
-    #weights<-as.array(weights)
     weights <- s2dv::Reorder(data = weights, order = names(dim(data))) # move to check or in .rps *******
-    # is there a better way of doing this: vvvvvvvvvv
     sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], weights[indices_for_quantiles, ])
     sorted_data <- sorted_arrays$data
     cumulative_weights <- sorted_arrays$cumulative_weights
-    quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y # library(pracma)
-
-    # approach 2 (interpolate from the polynomial fit of the cumulative weights distribution)
-    # order <- 5 # should it be fixed? should it be provided? should it be computed according to the number of points available?
-    # fit_model <- lm(sorted_data ~ poly(cumsum(sorted_weights), order, raw=TRUE))
-    # quantiles <- numeric(length(prob_thresholds)) # make a list of zeros the length of prob_thresholds
-    # for (i in 1:(order+1)) {quantiles <- quantiles+fit_model$coefficients[i]*prob_thresholds^(i-1)}
+    quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y
   }
   # Probabilities
-  # browser()
   probs <- array(dim = c(bin = length(quantiles) + 1, dim(data)[1]))  # [bin, sdate]
   for (i_time in 1:dim(data)[1]) {
     if (anyNA(data[i_time, ])) {
@@ -202,42 +190,27 @@
       probs[, i_time] <- rep(NA, dim = length(quantiles) + 1)
     } else {
       if (is.null(weights)){
         probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles))
-        print("............probs...................")
-        print(probs[, i_time])
-        print("...............................")
       } else {
-        # function vvvv
         sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ])
         sorted_data <- sorted_arrays$data
         cumulative_weights <- sorted_arrays$cumulative_weights
         # find any quantiles that are outside the data range
         integrated_probs <- array(dim = c(bin=length(quantiles)))
         for (i_quant in 1:length(quantiles)) {
+          # for thresholds falling under the distribution
           if (quantiles[i_quant] < min(sorted_data)) {
-            integrated_probs[i_quant] = 0 # append
+            integrated_probs[i_quant] = 0
+          # for thresholds falling over the distribution
           } else if (max(sorted_data) < quantiles[i_quant]) {
-            integrated_probs[i_quant] = 1 # append
+            integrated_probs[i_quant] = 1
           } else {
             integrated_probs[i_quant] = approx(sorted_data, cumulative_weights, quantiles[i_quant], "linear")$y
          }
        }
        probs[, i_time] <- append(integrated_probs, 1) - append(0, integrated_probs)
-        # print("..........quantiles.........")
-        # print(quantiles)
-        # print("..........integrated probs.........")
-        # print(integrated_probs)
-        print("..........probs.........")
-        print(probs[, i_time])
-        print("..............................")
-        print("..............................")
+        # This should not happen, so this check may no longer be needed
         if (min(probs[, i_time])<0) {print(paste0("ERROR negative probability in i_time=",i_time))} ######################## checks
         if (max(probs[, i_time])>1) {print(paste0("ERROR >1 probability in i_time=",i_time))} ######################## checks
-        # approach 2. might give erroneous results for quantiles that fall outside the data range. Try normal distro
-        # order <- 5
-        # fit_model <- lm(cumsum(sorted_weights) ~ poly(sorted_data, order, raw=TRUE)) # order 10, but we should think what order we want to use
-        # integrated_probs <- quantiles*0
-        # for (i in 1:(order+1)) {integrated_probs <- integrated_probs+fit_model$coefficients[i]*quantiles^(i-1)}
-        # probs[, i_time] <- c(integrated_probs[1], integrated_probs[2]-integrated_probs[1], 1-integrated_probs[2])
       }
     }
--
GitLab
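Taken together, the series leaves RPS() with one new optional argument: per-member weights applied to the experiment's probabilities, while the observations keep the unweighted path. A hypothetical call, assuming the patched R/RPS.R is sourced and its dependencies (s2dv, multiApply, easyVerification) are installed; dimension names follow the function defaults:

# Hypothetical usage sketch, not part of the patch series.
set.seed(123)
exp <- array(rnorm(500), dim = c(sdate = 50, member = 10))
obs <- array(rnorm(50), dim = c(sdate = 50))
wts <- array(runif(500), dim = c(sdate = 50, member = 10))  # member weights
RPS(exp, obs)                 # unweighted: quantile(type = 8) + convert2prob()
RPS(exp, obs, weights = wts)  # weighted: .sorted_distributions() + approx()

As the remaining "# move to check or in .rps" note says, weights is still unvalidated: it is only Reorder()ed to match the data's dimension names, so an array with wrong names or shape fails inside .get_probs() rather than at the input checks.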