From bb69ee75cba4a5dc747ae76bddae74cc0415ddc9 Mon Sep 17 00:00:00 2001 From: Victoria Agudetse Roures Date: Fri, 30 Dec 2022 12:48:22 +0100 Subject: [PATCH 1/3] Add checks for anomalies --- modules/Anomalies/Anomalies.R | 7 +++++++ modules/Saving/Saving.R | 4 +++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/modules/Anomalies/Anomalies.R b/modules/Anomalies/Anomalies.R index 2fe1da0e..f8b49da8 100644 --- a/modules/Anomalies/Anomalies.R +++ b/modules/Anomalies/Anomalies.R @@ -5,6 +5,13 @@ source("modules/Anomalies/tmp/CST_Anomaly.R") compute_anomalies <- function(recipe, data) { + if (is.null(recipe$Analysis$Workflow$Anomalies$compute)) { + error(recipe$Run$logger, + paste("The anomaly module has been called, but the element", + "'Workflow:Anomalies:compute' is missing from the recipe.")) + stop() + } + if (recipe$Analysis$Workflow$Anomalies$compute) { if (recipe$Analysis$Workflow$Anomalies$cross_validation) { cross <- TRUE diff --git a/modules/Saving/Saving.R b/modules/Saving/Saving.R index 124f0468..1fc035e9 100644 --- a/modules/Saving/Saving.R +++ b/modules/Saving/Saving.R @@ -463,7 +463,9 @@ save_metrics <- function(skill, # Add global and variable attributes global_attributes <- get_global_attributes(recipe, archive) - if (recipe$Analysis$Workflow$Anomalies$compute) { + ## TODO: Sort out the logic once default behavior is decided + if ((!is.null(recipe$Analysis$Workflow$Anomalies$compute)) && + (recipe$Analysis$Workflow$Anomalies$compute)) { global_attributes <- c(list(from_anomalies = "Yes"), global_attributes) } else { -- GitLab From ef4221c71e8a3a581f3905f66b0278cfad7fdf05 Mon Sep 17 00:00:00 2001 From: Victoria Agudetse Roures Date: Fri, 30 Dec 2022 13:24:07 +0100 Subject: [PATCH 2/3] Add anomalies in recipe check to all functions --- modules/Saving/Saving.R | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/modules/Saving/Saving.R b/modules/Saving/Saving.R index 1fc035e9..28b5e552 100644 --- a/modules/Saving/Saving.R +++ b/modules/Saving/Saving.R @@ -579,7 +579,9 @@ save_corr <- function(skill, # Add global and variable attributes global_attributes <- get_global_attributes(recipe, archive) - if (recipe$Analysis$Workflow$Anomalies$compute) { + ## TODO: Sort out the logic once default behavior is decided + if ((!is.null(recipe$Analysis$Workflow$Anomalies$compute)) && + (recipe$Analysis$Workflow$Anomalies$compute)) { global_attributes <- c(global_attributes, list(from_anomalies = "Yes")) } else { @@ -692,7 +694,9 @@ save_percentiles <- function(percentiles, # Add global and variable attributes global_attributes <- get_global_attributes(recipe, archive) - if (recipe$Analysis$Workflow$Anomalies$compute) { + ## TODO: Sort out the logic once default behavior is decided + if ((!is.null(recipe$Analysis$Workflow$Anomalies$compute)) && + (recipe$Analysis$Workflow$Anomalies$compute)) { global_attributes <- c(list(from_anomalies = "Yes"), global_attributes) } else { @@ -801,7 +805,9 @@ save_probabilities <- function(probs, var.longname <- attr(data_cube$Variable, 'variable')$long_name global_attributes <- get_global_attributes(recipe, archive) # Add anomaly computation to global attributes - if (recipe$Analysis$Workflow$Anomalies$compute) { + ## TODO: Sort out the logic once default behavior is decided + if ((!is.null(recipe$Analysis$Workflow$Anomalies$compute)) && + (recipe$Analysis$Workflow$Anomalies$compute)) { global_attributes <- c(list(from_anomalies = "Yes"), global_attributes) } else { -- GitLab From 621ccda5a6c090526a5961d731a2a2c366d75f1b Mon Sep 17 00:00:00 2001 From: Victoria Agudetse Roures Date: Fri, 30 Dec 2022 13:24:26 +0100 Subject: [PATCH 3/3] Update call to RMSSS --- modules/Skill/Skill.R | 7 +- modules/Skill/tmp/RMSSS.R | 290 ++++++++++++++++++++++++++++++-------- 2 files changed, 237 insertions(+), 60 deletions(-) diff --git a/modules/Skill/Skill.R b/modules/Skill/Skill.R index 9a59be40..9892ba52 100644 --- a/modules/Skill/Skill.R +++ b/modules/Skill/Skill.R @@ -224,16 +224,15 @@ compute_skill_metrics <- function(recipe, data) { skill_metrics[[ paste0(metric, "_conf.low") ]] <- skill$conf.lower skill_metrics[[ paste0(metric, "_conf.up") ]] <- skill$conf.upper } else if (metric == 'rmsss') { - # Compute hcst ensemble mean - exp_ensmean <- MeanDims(exp$data, dims = memb_dim, drop = FALSE) # Compute RMSS - skill <- RMSSS(exp_ensmean, obs$data, + skill <- RMSSS(data$hcst$data, data$obs$data, dat_dim = 'dat', time_dim = time_dim, + memb_dim = memb_dim, pval = FALSE, sign = TRUE, + sig_method = 'Random Walk', ncores = ncores) - rm(exp_ensmean) # Compute ensemble mean and modify dimensions skill <- lapply(skill, function(x) { .drop_dims(x)}) diff --git a/modules/Skill/tmp/RMSSS.R b/modules/Skill/tmp/RMSSS.R index 9c47da45..d2ff4861 100644 --- a/modules/Skill/tmp/RMSSS.R +++ b/modules/Skill/tmp/RMSSS.R @@ -11,7 +11,7 @@ #'The RMSSS are computed along the time_dim dimension which should correspond #'to the start date dimension.\cr #'The p-value and significance test are optionally provided by an one-sided -#'Fisher test.\cr +#'Fisher test or Random Walk test.\cr #' #'@param exp A named numeric array of experimental data which contains at least #' two dimensions for dat_dim and time_dim. It can also be a vector with the @@ -23,10 +23,22 @@ #' dimension can be different. It can also be a vector with the same length as #' 'exp', then the vector will automatically be 'time_dim' and 'dat_dim' will #' be 1. +#'@param ref A named numerical array of the reference forecast data with at +#' least time dimension, or 0 (typical climatological forecast) or 1 +#' (normalized climatological forecast). If it is an array, the dimensions must +#' be the same as 'exp' except 'memb_dim' and 'dat_dim'. If there is only one +#' reference dataset, it should not have dataset dimension. If there is +#' corresponding reference for each experiment, the dataset dimension must +#' have the same length as in 'exp'. If 'ref' is NULL, the typical +#' climatological forecast is used as reference forecast (equivelant to 0.) +#' The default value is NULL. #'@param dat_dim A character string indicating the name of dataset (nobs/nexp) #' dimension. The default value is 'dataset'. #'@param time_dim A character string indicating the name of dimension along #' which the RMSSS are computed. The default value is 'sdate'. +#'@param memb_dim A character string indicating the name of the member dimension +#' to compute the ensemble mean; it should be set to NULL if the parameter 'exp' +#' and 'ref' are already the ensemble mean. The default value is NULL. #'@param pval A logical value indicating whether to compute or not the p-value #' of the test Ho: RMSSS = 0. The default value is TRUE. #'@param sign A logical value indicating whether to compute or not the @@ -34,6 +46,8 @@ #' FALSE. #'@param alpha A numeric of the significance level to be used in the #' statistical significance test. The default value is 0.05. +#'@param sig_method A character string indicating the significance method. The +#' options are "one-sided Fisher" (default) and "Random Walk". #'@param ncores An integer indicating the number of cores to use for parallel #' computation. The default value is NULL. #' @@ -66,11 +80,12 @@ #'@import multiApply #'@importFrom stats pf #'@export -RMSSS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', - pval = TRUE, sign = FALSE, alpha = 0.05, ncores = NULL) { +RMSSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', dat_dim = 'dataset', + memb_dim = NULL, pval = TRUE, sign = FALSE, alpha = 0.05, + sig_method = 'one-sided Fisher', ncores = NULL) { # Check inputs - ## exp and obs (1) + ## exp, obs, and ref (1) if (is.null(exp) | is.null(obs)) { stop("Parameter 'exp' and 'obs' cannot be NULL.") } @@ -99,6 +114,19 @@ RMSSS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', !all(names(dim(obs)) %in% names(dim(exp)))) { stop("Parameter 'exp' and 'obs' must have same dimension name.") } + if (!is.null(ref)) { + if (!is.numeric(ref)) { + stop("Parameter 'ref' must be numeric.") + } + if (is.array(ref)) { + if (any(is.null(names(dim(ref))))| any(nchar(names(dim(ref))) == 0)) { + stop("Parameter 'ref' must have dimension names.") + } + } else if (length(ref) != 1 | any(!ref %in% c(0, 1))) { + stop("Parameter 'ref' must be a numeric array or number 0 or 1.") + } + } + ## time_dim if (!is.character(time_dim) | length(time_dim) > 1) { stop("Parameter 'time_dim' must be a character string.") @@ -116,6 +144,23 @@ RMSSS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', " Set it as NULL if there is no dataset dimension.") } } + ## memb_dim + if (!is.null(memb_dim)) { + if (!is.character(memb_dim) | length(memb_dim) > 1) { + stop("Parameter 'memb_dim' must be a character string.") + } + if (!memb_dim %in% names(dim(exp))) { + stop("Parameter 'memb_dim' is not found in 'exp' dimension.") + } + if (memb_dim %in% names(dim(obs))) { + if (identical(as.numeric(dim(obs)[memb_dim]), 1)) { + obs <- ClimProjDiags::Subset(x = obs, along = memb_dim, indices = 1, drop = 'selected') + } else { + stop("Not implemented for observations with members ('obs' can have 'memb_dim', ", + "but it should be of length = 1).") + } + } + } ## pval if (!is.logical(pval) | length(pval) > 1) { stop("Parameter 'pval' must be one logical value.") @@ -128,6 +173,14 @@ RMSSS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', if (!is.numeric(alpha) | length(alpha) > 1) { stop("Parameter 'alpha' must be one numeric value.") } + ## sig_method + if (length(sig_method) != 1 | !any(sig_method %in% c('one-sided Fisher', 'Random Walk'))) { + stop("Parameter 'sig_method' must be one of 'one-sided Fisher' or 'Random Walk'.") + } + if (sig_method == "Random Walk" & pval == T) { + warning("p-value cannot be calculated by significance method 'Random Walk'.") + pval <- FALSE + } ## ncores if (!is.null(ncores)) { if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | @@ -138,66 +191,185 @@ RMSSS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', ## exp and obs (2) name_exp <- sort(names(dim(exp))) name_obs <- sort(names(dim(obs))) + if (!is.null(memb_dim)) { + name_exp <- name_exp[-which(name_exp == memb_dim)] + } if (!is.null(dat_dim)) { name_exp <- name_exp[-which(name_exp == dat_dim)] name_obs <- name_obs[-which(name_obs == dat_dim)] } if(!all(dim(exp)[name_exp] == dim(obs)[name_obs])) { stop(paste0("Parameter 'exp' and 'obs' must have same length of ", - "all dimension except 'dat_dim'.")) + "all dimension except 'memb_dim' and 'dat_dim'.")) + } + if (!is.null(ref)) { + name_ref <- sort(names(dim(ref))) + if (!is.null(memb_dim) && memb_dim %in% name_ref) { + name_ref <- name_ref[-which(name_ref == memb_dim)] + } + if (!is.null(dat_dim)) { + if (dat_dim %in% name_ref) { + if (!identical(dim(exp)[dat_dim], dim(ref)[dat_dim])) { + stop(paste0("If parameter 'ref' has dataset dimension, it must be ", + "equal to dataset dimension of 'exp'.")) + } + name_ref <- name_ref[-which(name_ref == dat_dim)] + } + } + if (!identical(length(name_exp), length(name_ref)) | + !identical(dim(exp)[name_exp], dim(ref)[name_ref])) { + stop(paste0("Parameter 'exp' and 'ref' must have the same length of ", + "all dimensions except 'memb_dim' and 'dat_dim' if there is ", + "only one reference dataset.")) + } } + if (dim(exp)[time_dim] <= 2) { stop("The length of time_dim must be more than 2 to compute RMSSS.") } ############################### - # Sort dimension - name_exp <- names(dim(exp)) - name_obs <- names(dim(obs)) - order_obs <- match(name_exp, name_obs) - obs <- Reorder(obs, order_obs) +# # Sort dimension +# name_exp <- names(dim(exp)) +# name_obs <- names(dim(obs)) +# order_obs <- match(name_exp, name_obs) +# obs <- Reorder(obs, order_obs) + + + ############################### + # Create ref array if needed + if (is.null(ref)) ref <- 0 + if (!is.array(ref)) { + ref <- array(data = ref, dim = dim(exp)) + } + ############################### + ## Ensemble mean + if (!is.null(memb_dim)) { + exp <- MeanDims(exp, memb_dim, na.rm = T) + if (!is.null(ref) & memb_dim %in% names(dim(ref))) { + ref <- MeanDims(ref, memb_dim, na.rm = T) + } + } ############################### # Calculate RMSSS - - res <- Apply(list(exp, obs), - target_dims = list(c(time_dim, dat_dim), - c(time_dim, dat_dim)), + +# if (!is.null(ref)) { # use "ref" as reference forecast +# if (!is.null(dat_dim) && dat_dim %in% names(dim(ref))) { +# target_dims_ref <- c(time_dim, dat_dim) +# } else { +# target_dims_ref <- c(time_dim) +# } +# data <- list(exp = exp, obs = obs, ref = ref) +# target_dims = list(exp = c(time_dim, dat_dim), +# obs = c(time_dim, dat_dim), +# ref = target_dims_ref) +# } else { +# data <- list(exp = exp, obs = obs) +# target_dims = list(exp = c(time_dim, dat_dim), +# obs = c(time_dim, dat_dim)) +# } + data <- list(exp = exp, obs = obs, ref = ref) + if (!is.null(dat_dim)) { + if (dat_dim %in% names(dim(ref))) { + target_dims <- list(exp = c(time_dim, dat_dim), + obs = c(time_dim, dat_dim), + ref = c(time_dim, dat_dim)) + } else { + target_dims <- list(exp = c(time_dim, dat_dim), + obs = c(time_dim, dat_dim), + ref = c(time_dim)) + } + } else { + target_dims <- list(exp = time_dim, obs = time_dim, ref = time_dim) + } + + res <- Apply(data, + target_dims = target_dims, fun = .RMSSS, time_dim = time_dim, dat_dim = dat_dim, pval = pval, sign = sign, alpha = alpha, + sig_method = sig_method, ncores = ncores) return(res) } -.RMSSS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', pval = TRUE, - sign = FALSE, alpha = 0.05) { +.RMSSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', dat_dim = 'dataset', pval = TRUE, + sign = FALSE, alpha = 0.05, sig_method = 'one-sided Fisher') { # exp: [sdate, (dat)] # obs: [sdate, (dat)] + # ref: [sdate, (dat)] or NULL + + if (is.null(ref)) { + ref <- array(data = 0, dim = dim(obs)) + } else if (identical(ref, 0) | identical(ref, 1)) { + ref <- array(ref, dim = dim(exp)) + } if (is.null(dat_dim)) { # exp: [sdate] # obs: [sdate] nexp <- 1 nobs <- 1 - dim(exp) <- c(dim(exp), nexp = nexp) - dim(obs) <- c(dim(obs), nobs = nobs) + nref <- 1 + # Add dat dim back temporarily + dim(exp) <- c(dim(exp), dat = 1) + dim(obs) <- c(dim(obs), dat = 1) + dim(ref) <- c(dim(ref), dat = 1) + } else { # exp: [sdate, dat_exp] # obs: [sdate, dat_obs] nexp <- as.numeric(dim(exp)[2]) nobs <- as.numeric(dim(obs)[2]) + if (dat_dim %in% names(dim(ref))) { + nref <- as.numeric(dim(ref)[2]) + } else { + dim(ref) <- c(dim(ref), dat = 1) + nref <- 1 + } } nsdate <- as.numeric(dim(exp)[1]) - - p_val <- array(dim = c(nexp = nexp, nobs = nobs)) + + # RMS of forecast dif1 <- array(dim = c(nsdate, nexp, nobs)) names(dim(dif1)) <- c(time_dim, 'nexp', 'nobs') + for (i in 1:nobs) { + dif1[, , i] <- sapply(1:nexp, function(x) {exp[, x] - obs[, i]}) + } + + rms_exp <- apply(dif1^2, c(2, 3), mean, na.rm = TRUE)^0.5 #array(dim = c(nexp, nobs)) + + # RMS of reference +# if (!is.null(ref)) { + dif2 <- array(dim = c(nsdate, nref, nobs)) + names(dim(dif2)) <- c(time_dim, 'nexp', 'nobs') + for (i in 1:nobs) { + dif2[, , i] <- sapply(1:nref, function(x) {ref[, x] - obs[, i]}) + } + rms_ref <- apply(dif2^2, c(2, 3), mean, na.rm = TRUE)^0.5 #array(dim = c(nref, nobs)) + if (nexp != nref) { + # expand rms_ref to nexp (nref is 1) + rms_ref <- array(rms_ref, dim = c(nobs = nobs, nexp = nexp)) + rms_ref <- Reorder(rms_ref, c(2, 1)) + } +# } else { +# rms_ref <- array(colMeans(obs^2, na.rm = TRUE)^0.5, dim = c(nobs = nobs, nexp = nexp)) +## rms_ref[which(abs(rms_ref) <= (max(abs(rms_ref), na.rm = TRUE) / 1000))] <- max(abs( +## rms_ref), na.rm = TRUE) / 1000 +# rms_ref <- Reorder(rms_ref, c(2, 1)) +# #rms_ref above: [nexp, nobs] +# } + + rmsss <- 1 - rms_exp / rms_ref + +################################################# + # if (conf) { # conflow <- (1 - conf.lev) / 2 # confhigh <- 1 - conflow @@ -205,45 +377,51 @@ RMSSS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', # conf_high <- array(dim = c(nexp = nexp, nobs = nobs)) # } - # dif1 - for (i in 1:nobs) { - dif1[, , i] <- sapply(1:nexp, function(x) {exp[, x] - obs[, i]}) - } + if (sig_method == 'one-sided Fisher') { + p_val <- array(dim = c(nexp = nexp, nobs = nobs)) + ## pval and sign + if (pval || sign) { + eno1 <- Eno(dif1, time_dim) + if (is.null(ref)) { + eno2 <- Eno(obs, time_dim) + eno2 <- array(eno2, dim = c(nobs = nobs, nexp = nexp)) + eno2 <- Reorder(eno2, c(2, 1)) + } else { + eno2 <- Eno(dif2, time_dim) + if (nref != nexp) { + eno2 <- array(eno2, dim = c(nobs = nobs, nexp = nexp)) + eno2 <- Reorder(eno2, c(2, 1)) + } + } - rms1 <- apply(dif1^2, c(2, 3), mean, na.rm = TRUE)^0.5 #array(dim = c(nexp, nobs)) - rms2 <- array(colMeans(obs^2, na.rm = TRUE)^0.5, dim = c(nobs = nobs)) - rms2[which(abs(rms2) <= (max(abs(rms2), na.rm = TRUE) / 1000))] <- max(abs( - rms2), na.rm = TRUE) / 1000 - #rms2 above: [nobs] - rms2 <- array(rms2, dim = c(nobs = nobs, nexp = nexp)) - #rms2 above: [nobs, nexp] - rms2 <- Reorder(rms2, c(2, 1)) - #rms2 above: [nexp, nobs] - - # use rms1 and rms2 to calculate rmsss - rmsss <- 1 - rms1/rms2 - - ## pval and sign - if (pval || sign) { - eno1 <- Eno(dif1, time_dim) - eno2 <- Eno(obs, time_dim) - eno2 <- array(eno2, dim = c(nobs = nobs, nexp = nexp)) - eno2 <- Reorder(eno2, c(2, 1)) - } + F.stat <- (eno2 * rms_ref^2 / (eno2 - 1)) / ((eno1 * rms_exp^2 / (eno1- 1))) + tmp <- !is.na(eno1) & !is.na(eno2) & eno1 > 2 & eno2 > 2 + p_val <- 1 - pf(F.stat, eno1 - 1, eno2 - 1) + if (sign) signif <- p_val <= alpha + # If there isn't enough valid data, return NA + p_val[which(!tmp)] <- NA + if (sign) signif[which(!tmp)] <- NA + + # change not enough valid data rmsss to NA + rmsss[which(!tmp)] <- NA + } - # pval - if (pval || sign) { + } else if (sig_method == "Random Walk") { + signif <- array(dim = c(nexp = nexp, nobs = nobs)) + for (i in 1:nexp) { + for (j in 1:nobs) { - F.stat <- (eno2 * rms2^2 / (eno2- 1)) / ((eno1 * rms1^2 / (eno1- 1))) - tmp <- !is.na(eno1) & !is.na(eno2) & eno1 > 2 & eno2 > 2 - p_val <- 1 - pf(F.stat, eno1 - 1, eno2 - 1) - if (sign) signif <- p_val <= alpha - # If there isn't enough valid data, return NA - p_val[which(!tmp)] <- NA - if (sign) signif[which(!tmp)] <- NA - - # change not significant rmsss to NA - rmsss[which(!tmp)] <- NA + # Error + error_exp <- array(data = abs(exp[, i] - obs[, j]), dim = c(time = nsdate)) + if (nref == nexp) { + error_ref <- array(data = abs(ref[, i] - obs[, j]), dim = c(time = nsdate)) + } else { + # nref = 1 + error_ref <- array(data = abs(ref - obs[, j]), dim = c(time = nsdate)) + } + signif[i, j] <- .RandomWalkTest(skill_A = error_exp, skill_B = error_ref)$signif + } + } } ################################### -- GitLab