diff --git a/conf/variable-dictionary.yml b/conf/variable-dictionary.yml index f0dfcb5676f4e855d9f61692691cacbf9cf6fc95..994256ced8cabf3b36f779b48de8f5f72e24d424 100644 --- a/conf/variable-dictionary.yml +++ b/conf/variable-dictionary.yml @@ -264,3 +264,5 @@ metrics: long_name: "Mean Bias Skill Score Statistical Significance" enssprerr: long_name: "Ensemble Spread-To-Error Ratio" + rmsss: + long_name: "Root Mean Square Skill Score" diff --git a/modules/Loading/testing_recipes/recipe_test-new-metrics.yml b/modules/Loading/testing_recipes/recipe_test-new-metrics.yml new file mode 100644 index 0000000000000000000000000000000000000000..df84138d71bcd3d9e660b85b2d3fb4b4d13c8080 --- /dev/null +++ b/modules/Loading/testing_recipes/recipe_test-new-metrics.yml @@ -0,0 +1,46 @@ +Description: + Author: V. Agudetse + +Analysis: + Horizon: Seasonal + Variables: + name: tas + freq: monthly_mean + Datasets: + System: + name: system7c3s + Multimodel: False + Reference: + name: era5 + Time: + sdate: '1101' + fcst_year: '2020' + hcst_start: '1998' + hcst_end: '2010' + ftime_min: 1 + ftime_max: 2 + Region: + latmin: -10 + latmax: 10 + lonmin: 0 + lonmax: 20 + Regrid: + method: bilinear + type: to_system + Workflow: + Calibration: + method: mse_min + Skill: + metric: RMSSS + Probabilities: + percentiles: [[1/3, 2/3], [1/10, 9/10]] + Indicators: + index: no + ncores: 7 + remove_NAs: yes + Output_format: S2S4E +Run: + Loglevel: INFO + Terminal: yes + output_dir: /esarchive/scratch/vagudets/repos/auto-s2s/out-logs/ + code_dir: /esarchive/scratch/vagudets/repos/auto-s2s/ diff --git a/modules/Skill/Skill.R b/modules/Skill/Skill.R index 99f12346c21e98e87c4e78efcaf688c0bb37ee41..76b08942c805cf2a0e5d889cb3f84761076d7d72 100644 --- a/modules/Skill/Skill.R +++ b/modules/Skill/Skill.R @@ -14,6 +14,7 @@ source("modules/Skill/s2s.metrics.R") source("modules/Skill/tmp/RandomWalkTest.R") source("modules/Skill/tmp/Bias.R") source("modules/Skill/tmp/AbsBiasSS.R") +source("modules/Skill/tmp/RMSSS.R") ## TODO: Implement this in the future ## Which parameter are required? @@ -147,6 +148,7 @@ compute_skill_metrics <- function(recipe, exp, obs) { skill_metrics[[ paste0(metric, "_significance") ]] <- skill$sign # Ensemble mean correlation } else if (metric %in% c('enscorr', 'corr')) { + ## TODO: Return significance ## TODO: Implement option for Kendall and Spearman methods? skill <- s2dv::Corr(exp$data, obs$data, dat_dim = 'dat', @@ -161,6 +163,22 @@ compute_skill_metrics <- function(recipe, exp, obs) { skill_metrics[[ paste0(metric, "_p.value") ]] <- skill$p.val skill_metrics[[ paste0(metric, "_conf.low") ]] <- skill$conf.lower skill_metrics[[ paste0(metric, "_conf.up") ]] <- skill$conf.upper + } else if (metric == 'rmsss') { + # Compute hcst ensemble mean + exp_ensmean <- MeanDims(exp$data, dims = memb_dim, drop = FALSE) + # Compute RMSS + skill <- RMSSS(exp_ensmean, obs$data, + dat_dim = 'dat', + time_dim = time_dim, + pval = FALSE, + sign = TRUE, + ncores = ncores) + rm(exp_ensmean) + # Compute ensemble mean and modify dimensions + skill <- lapply(skill, function(x) { + .drop_dims(x)}) + skill_metrics[[ metric ]] <- skill$rmsss + skill_metrics[[ paste0(metric, "_significance") ]] <- skill$sign } else if (metric == 'enssprerr') { # Remove ensemble dim from obs to avoid veriApply warning obs_noensdim <- ClimProjDiags::Subset(obs$data, "ensemble", 1, diff --git a/modules/Skill/tmp/RMSSS.R b/modules/Skill/tmp/RMSSS.R new file mode 100644 index 0000000000000000000000000000000000000000..9c47da4554aebd21fa8c17e963b0121557779d0c --- /dev/null +++ b/modules/Skill/tmp/RMSSS.R @@ -0,0 +1,270 @@ +#'Compute root mean square error skill score +#' +#'Compute the root mean square error skill score (RMSSS) between an array of +#'forecast 'exp' and an array of observation 'obs'. The two arrays should +#'have the same dimensions except along dat_dim, where the length can be +#'different, with the number of experiments/models (nexp) and the number of +#'observational datasets (nobs).\cr +#'RMSSS computes the root mean square error skill score of each jexp in 1:nexp +#'against each job in 1:nobs which gives nexp * nobs RMSSS for each grid point +#'of the array.\cr +#'The RMSSS are computed along the time_dim dimension which should correspond +#'to the start date dimension.\cr +#'The p-value and significance test are optionally provided by an one-sided +#'Fisher test.\cr +#' +#'@param exp A named numeric array of experimental data which contains at least +#' two dimensions for dat_dim and time_dim. It can also be a vector with the +#' same length as 'obs', then the vector will automatically be 'time_dim' and +#' 'dat_dim' will be 1. +#'@param obs A named numeric array of observational data which contains at least +#' two dimensions for dat_dim and time_dim. The dimensions should be the same +#' as paramter 'exp' except the length of 'dat_dim' dimension. The order of +#' dimension can be different. It can also be a vector with the same length as +#' 'exp', then the vector will automatically be 'time_dim' and 'dat_dim' will +#' be 1. +#'@param dat_dim A character string indicating the name of dataset (nobs/nexp) +#' dimension. The default value is 'dataset'. +#'@param time_dim A character string indicating the name of dimension along +#' which the RMSSS are computed. The default value is 'sdate'. +#'@param pval A logical value indicating whether to compute or not the p-value +#' of the test Ho: RMSSS = 0. The default value is TRUE. +#'@param sign A logical value indicating whether to compute or not the +#' statistical significance of the test Ho: RMSSS = 0. The default value is +#' FALSE. +#'@param alpha A numeric of the significance level to be used in the +#' statistical significance test. The default value is 0.05. +#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. +#' +#'@return +#'A list containing the numeric arrays with dimension:\cr +#' c(nexp, nobs, all other dimensions of exp except time_dim).\cr +#'nexp is the number of experiment (i.e., dat_dim in exp), and nobs is the +#'number of observation (i.e., dat_dim in obs). If dat_dim is NULL, nexp and +#'nobs are omitted.\cr +#'\item{$rmsss}{ +#' A numerical array of the root mean square error skill score. +#'} +#'\item{$p.val}{ +#' A numerical array of the p-value with the same dimensions as $rmsss. +#' Only present if \code{pval = TRUE}. +#'} +#'\item{sign}{ +#' A logical array of the statistical significance of the RMSSS with the same +#' dimensions as $rmsss. Only present if \code{sign = TRUE}. +#'} +#' +#'@examples +#' set.seed(1) +#' exp <- array(rnorm(30), dim = c(dataset = 2, time = 3, memb = 5)) +#' set.seed(2) +#' obs <- array(rnorm(15), dim = c(time = 3, memb = 5, dataset = 1)) +#' res <- RMSSS(exp, obs, time_dim = 'time', dat_dim = 'dataset') +#' +#'@rdname RMSSS +#'@import multiApply +#'@importFrom stats pf +#'@export +RMSSS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', + pval = TRUE, sign = FALSE, alpha = 0.05, ncores = NULL) { + + # Check inputs + ## exp and obs (1) + if (is.null(exp) | is.null(obs)) { + stop("Parameter 'exp' and 'obs' cannot be NULL.") + } + if (!is.numeric(exp) | !is.numeric(obs)) { + stop("Parameter 'exp' and 'obs' must be a numeric array.") + } + if (is.null(dim(exp)) & is.null(dim(obs))) { #is vector + if (length(exp) == length(obs)) { + exp <- array(exp, dim = c(length(exp), 1)) + names(dim(exp)) <- c(time_dim, dat_dim) + obs <- array(obs, dim = c(length(obs), 1)) + names(dim(obs)) <- c(time_dim, dat_dim) + } else { + stop(paste0("Parameter 'exp' and 'obs' must be array with as least two ", + "dimensions time_dim and dat_dim, or vector of same length.")) + } + } else if (is.null(dim(exp)) | is.null(dim(obs))) { + stop(paste0("Parameter 'exp' and 'obs' must be array with as least two ", + "dimensions time_dim and dat_dim, or vector of same length.")) + } + if(any(is.null(names(dim(exp))))| any(nchar(names(dim(exp))) == 0) | + any(is.null(names(dim(obs))))| any(nchar(names(dim(obs))) == 0)) { + stop("Parameter 'exp' and 'obs' must have dimension names.") + } + if(!all(names(dim(exp)) %in% names(dim(obs))) | + !all(names(dim(obs)) %in% names(dim(exp)))) { + stop("Parameter 'exp' and 'obs' must have same dimension name.") + } + ## time_dim + if (!is.character(time_dim) | length(time_dim) > 1) { + stop("Parameter 'time_dim' must be a character string.") + } + if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) { + stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.") + } + ## dat_dim + if (!is.null(dat_dim)) { + if (!is.character(dat_dim) | length(dat_dim) > 1) { + stop("Parameter 'dat_dim' must be a character string or NULL.") + } + if (!dat_dim %in% names(dim(exp)) | !dat_dim %in% names(dim(obs))) { + stop("Parameter 'dat_dim' is not found in 'exp' or 'obs' dimension.", + " Set it as NULL if there is no dataset dimension.") + } + } + ## pval + if (!is.logical(pval) | length(pval) > 1) { + stop("Parameter 'pval' must be one logical value.") + } + ## sign + if (!is.logical(sign) | length(sign) > 1) { + stop("Parameter 'sign' must be one logical value.") + } + ## alpha + if (!is.numeric(alpha) | length(alpha) > 1) { + stop("Parameter 'alpha' must be one numeric value.") + } + ## ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | + length(ncores) > 1) { + stop("Parameter 'ncores' must be a positive integer.") + } + } + ## exp and obs (2) + name_exp <- sort(names(dim(exp))) + name_obs <- sort(names(dim(obs))) + if (!is.null(dat_dim)) { + name_exp <- name_exp[-which(name_exp == dat_dim)] + name_obs <- name_obs[-which(name_obs == dat_dim)] + } + if(!all(dim(exp)[name_exp] == dim(obs)[name_obs])) { + stop(paste0("Parameter 'exp' and 'obs' must have same length of ", + "all dimension except 'dat_dim'.")) + } + if (dim(exp)[time_dim] <= 2) { + stop("The length of time_dim must be more than 2 to compute RMSSS.") + } + + + ############################### + # Sort dimension + name_exp <- names(dim(exp)) + name_obs <- names(dim(obs)) + order_obs <- match(name_exp, name_obs) + obs <- Reorder(obs, order_obs) + + + ############################### + # Calculate RMSSS + + res <- Apply(list(exp, obs), + target_dims = list(c(time_dim, dat_dim), + c(time_dim, dat_dim)), + fun = .RMSSS, + time_dim = time_dim, dat_dim = dat_dim, + pval = pval, sign = sign, alpha = alpha, + ncores = ncores) + + return(res) +} + +.RMSSS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', pval = TRUE, + sign = FALSE, alpha = 0.05) { + # exp: [sdate, (dat)] + # obs: [sdate, (dat)] + + if (is.null(dat_dim)) { + # exp: [sdate] + # obs: [sdate] + nexp <- 1 + nobs <- 1 + dim(exp) <- c(dim(exp), nexp = nexp) + dim(obs) <- c(dim(obs), nobs = nobs) + } else { + # exp: [sdate, dat_exp] + # obs: [sdate, dat_obs] + nexp <- as.numeric(dim(exp)[2]) + nobs <- as.numeric(dim(obs)[2]) + } + + nsdate <- as.numeric(dim(exp)[1]) + + p_val <- array(dim = c(nexp = nexp, nobs = nobs)) + dif1 <- array(dim = c(nsdate, nexp, nobs)) + names(dim(dif1)) <- c(time_dim, 'nexp', 'nobs') + +# if (conf) { +# conflow <- (1 - conf.lev) / 2 +# confhigh <- 1 - conflow +# conf_low <- array(dim = c(nexp = nexp, nobs = nobs)) +# conf_high <- array(dim = c(nexp = nexp, nobs = nobs)) +# } + + # dif1 + for (i in 1:nobs) { + dif1[, , i] <- sapply(1:nexp, function(x) {exp[, x] - obs[, i]}) + } + + rms1 <- apply(dif1^2, c(2, 3), mean, na.rm = TRUE)^0.5 #array(dim = c(nexp, nobs)) + rms2 <- array(colMeans(obs^2, na.rm = TRUE)^0.5, dim = c(nobs = nobs)) + rms2[which(abs(rms2) <= (max(abs(rms2), na.rm = TRUE) / 1000))] <- max(abs( + rms2), na.rm = TRUE) / 1000 + #rms2 above: [nobs] + rms2 <- array(rms2, dim = c(nobs = nobs, nexp = nexp)) + #rms2 above: [nobs, nexp] + rms2 <- Reorder(rms2, c(2, 1)) + #rms2 above: [nexp, nobs] + + # use rms1 and rms2 to calculate rmsss + rmsss <- 1 - rms1/rms2 + + ## pval and sign + if (pval || sign) { + eno1 <- Eno(dif1, time_dim) + eno2 <- Eno(obs, time_dim) + eno2 <- array(eno2, dim = c(nobs = nobs, nexp = nexp)) + eno2 <- Reorder(eno2, c(2, 1)) + } + + # pval + if (pval || sign) { + + F.stat <- (eno2 * rms2^2 / (eno2- 1)) / ((eno1 * rms1^2 / (eno1- 1))) + tmp <- !is.na(eno1) & !is.na(eno2) & eno1 > 2 & eno2 > 2 + p_val <- 1 - pf(F.stat, eno1 - 1, eno2 - 1) + if (sign) signif <- p_val <= alpha + # If there isn't enough valid data, return NA + p_val[which(!tmp)] <- NA + if (sign) signif[which(!tmp)] <- NA + + # change not significant rmsss to NA + rmsss[which(!tmp)] <- NA + } + + ################################### + # Remove extra dimensions if dat_dim = NULL + if (is.null(dat_dim)) { + dim(rmsss) <- NULL + dim(p_val) <- NULL + if (sign) dim(signif) <- NULL + } + ################################### + + # output + res <- list(rmsss = rmsss) + if (pval) { + p.val <- list(p.val = p_val) + res <- c(res, p.val) + } + if (sign) { + signif <- list(sign = signif) + res <- c(res, signif) + } + + return(res) +} diff --git a/modules/Visualization/Visualization.R b/modules/Visualization/Visualization.R index ff0e9fd44d8e71b831819271d51c7f1374e20068..2d50a1a0a4f8971087f29e64e7bda832ff63043c 100644 --- a/modules/Visualization/Visualization.R +++ b/modules/Visualization/Visualization.R @@ -93,7 +93,7 @@ plot_skill_metrics <- function(recipe, archive, data_cube, skill_metrics, # Group different metrics by type skill_scores <- c("rpss", "bss90", "bss10", "frpss", "crpss", "mean_bias_ss", "enscorr", "rpss_specs", "bss90_specs", "bss10_specs", - "enscorr_specs") + "enscorr_specs", "rmsss") scores <- c("rps", "frps", "crps", "frps_specs") for (name in c(skill_scores, scores, "mean_bias", "enssprerr")) { @@ -101,7 +101,8 @@ plot_skill_metrics <- function(recipe, archive, data_cube, skill_metrics, if (name %in% names(skill_metrics)) { # Define plot characteristics and metric name to display in plot if (name %in% c("rpss", "bss90", "bss10", "frpss", "crpss", - "rpss_specs", "bss90_specs", "bss10_specs")) { + "rpss_specs", "bss90_specs", "bss10_specs", + "rmsss")) { display_name <- toupper(strsplit(name, "_")[[1]][1]) skill <- skill_metrics[[name]] brks <- seq(-1, 1, by = 0.1)