diff --git a/DESCRIPTION b/DESCRIPTION index 3a79ba60fd249bcc9da43aa9aa078b9b3997566e..eee31d4a3926e55b73eb1110361e05df1979535c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: s2dv Title: A Set of Common Tools for Seasonal to Decadal Verification -Version: 1.3.0 +Version: 1.4.0 Authors@R: c( person("BSC-CNS", role = c("aut", "cph")), person("An-Chi", "Ho", , "an.ho@bsc.es", role = c("aut", "cre")), @@ -44,7 +44,7 @@ Imports: easyVerification Suggests: testthat -License: Apache License 2.0 +License: GPL-3 URL: https://earth.bsc.es/gitlab/es/s2dv/ BugReports: https://earth.bsc.es/gitlab/es/s2dv/-/issues LazyData: true diff --git a/NAMESPACE b/NAMESPACE index d440ec097f07df8d3c0b7e37629587de38d0a9d9..6224a157d96700787b4a63b5897d1d6c71d53996 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -59,6 +59,7 @@ export(ProjectField) export(REOF) export(RMS) export(RMSSS) +export(ROCSS) export(RPS) export(RPSS) export(RandomWalkTest) @@ -100,6 +101,7 @@ importFrom(SpecsVerification,enscrps_cpp) importFrom(abind,abind) importFrom(abind,adrop) importFrom(easyNCDF,ArrayToNc) +importFrom(easyVerification,EnsRoca) importFrom(easyVerification,convert2prob) importFrom(grDevices,bmp) importFrom(grDevices,col2rgb) diff --git a/NEWS.md b/NEWS.md index c7daf9370c409791fe9679082ba30ce06f8cf347..4711c9c8d448156dca707c32b29e8fa94871b4b8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,26 @@ +# s2dv 1.4.0 (Release date: 2023-03-21) +**Bugfixes** +- AbsBiasSS() significance test bugfix +- RPSS() significance test bugfix +- Trend() output "p.val" bugfix when NAs exist +- RMS() bugfix when dat_dim is NULL and conf is FALSE +- NAO() parameter "ftime_avg" sanity check improvement +- CDORemap() recognizes the CDO version with non-numeric values +- CDORemap() reorders the unlimited dimension to the last position in order to save as netCDF correctly + +**Development** +- Make the argument default values consistent between functions +- Season() sanity check improvement +- RMSSS() new parameters: "ref", "memb_dim", "sig_method". RandomWalkTest() is one option for significance test. +- Corr() new output "sign" and change parameter "conf.lev" to "alpha" +- CRPSS() uses cross-validation when `ref` is NULL +- RPS() and RPSS(): New parameter "cross.val" to choose to use cross-validation or not +- New function: ROCSS() +- RandomWalkTest(): New parameters "alpha" and "test.type"; Test method options: 'two.sided.approx','two.sided','greater','less'; change from positively oriented to negatively oriented +- Reorder(): Reorder attribute "dimensions" along with the data reordering. The attribute exists in Load() objects. 
+- ProjectField() efficiency improvement +- NAO(): parameter "ftime_avg" can be NULL so no average is calculated + # s2dv 1.3.0 (Release date: 2022-10-17) - New functions: Bias, AbsBiasSS, CRPS, CRPSS - split RPSS parameter 'weights' into 'weights_exp' and 'weights_ref' diff --git a/R/AMV.R b/R/AMV.R index e14897b291de60fa2c6ad5051e1e070eed72f094..916762813f71cacbd6aa4f1541dcf2207544e666 100644 --- a/R/AMV.R +++ b/R/AMV.R @@ -224,12 +224,12 @@ AMV <- function(data, data_lats, data_lons, type, lat_dim = 'lat', lon_dim = 'lo mean_1 <- ClimProjDiags::WeightedMean(data = data, lon = data_lons, lat = data_lats, region = regions$reg1, - londim = which(names(dim(data)) == lon_dim), - latdim = which(names(dim(data)) == lat_dim)) + londim = lon_dim, + latdim = lat_dim) mean_2 <- ClimProjDiags::WeightedMean(data = data, lon = data_lons, lat = data_lats, region = regions$reg2, - londim = which(names(dim(data)) == lon_dim), - latdim = which(names(dim(data)) == lat_dim)) + londim = lon_dim, + latdim = lat_dim) data <- ClimProjDiags::CombineIndices(indices = list(mean_1,mean_2), weights = NULL, operation = 'subtract') # (mean_1 - mean_2) diff --git a/R/AbsBiasSS.R b/R/AbsBiasSS.R index 0ceb009c7b4dc33b6f9788dc6cb8459f0e25767b..8cb1bcee6edabb662aace1ba9e1aaa8da5e3ba82 100644 --- a/R/AbsBiasSS.R +++ b/R/AbsBiasSS.R @@ -267,7 +267,7 @@ AbsBiasSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = NULL, bias_ref <- .Bias(exp = ref_data, obs = obs_data, na.rm = na.rm, absolute = TRUE, time_mean = FALSE) ## Skill score and significance biasSS[i, j] <- 1 - mean(bias_exp) / mean(bias_ref) - sign[i, j] <- .RandomWalkTest(skill_A = bias_exp, skill_B = bias_ref)$signif + sign[i, j] <- .RandomWalkTest(skill_A = bias_exp, skill_B = bias_ref, sign = T, pval = F)$sign } } diff --git a/R/CDORemap.R b/R/CDORemap.R index f0044cba60d30889d2d1aa0a68f4057ad035357e..09a00040c6b9eb7b9660a344f1891c41312614b0 100644 --- a/R/CDORemap.R +++ b/R/CDORemap.R @@ -629,13 +629,14 @@ CDORemap <- function(data_array = NULL, lons, lats, grid, method, if (nchar(Sys.which('cdo')[1]) < 1) { stop("CDO must be installed in order to use the .CDORemap.") } - cdo_version <- as.numeric_version( + cdo_version <- strsplit(suppressWarnings(system2("cdo", args = '-V', stderr = TRUE))[[1]], ' ')[[1]][5] - ) .warning(paste0("CDORemap: Using CDO version ", cdo_version, ".")) + cdo_version <- as.numeric_version(unlist(strsplit(cdo_version, "[A-Za-z]", fixed = FALSE))[[1]]) if ((cdo_version >= as.numeric_version('1.7.0')) && (method == 'con')) { method <- 'ycon' } + # CDO takes arrays of 3 dimensions or 4 if one of them is unlimited. # The unlimited dimension can only be the left-most (right-most in R). # There are no restrictions for the dimension names or variable names. @@ -666,7 +667,10 @@ CDORemap <- function(data_array = NULL, lons, lats, grid, method, total_slices <- 1 other_dims_per_chunk <- ifelse(is_irregular, 1, 2) # 4 (the maximum accepted by CDO) - 2 (lon, lat) = 2. if (length(other_dims) > 1 || (length(other_dims) > 0 && (is_irregular))) { - if (!(length(dim(data_array)) %in% other_dims)) { + # If lat/lon is the last dimension OR the largest other_dims is not the last one, + # reorder the largest other dimension to the last as unlimited dim. 
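+ # A hypothetical illustration of this condition: for data_array with dims c(sdate = 12, dat = 2, lat = 90, lon = 180), + # other_dims is c(1, 2) and the last dimension (lon) is not in other_dims, so the largest other + # dimension (sdate) is moved to the last position; for dims c(dat = 2, lat = 90, lon = 180, sdate = 12), + # sdate is already both the largest other dimension and the last one, so no reordering is needed.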
+ if (!(length(dim(data_array)) %in% other_dims) | + which.max(dim(data_array)[other_dims]) != length(other_dims)) { if (avoid_writes || is_irregular) { dims_mod <- dim(data_array) dims_mod[which(names(dim(data_array)) %in% @@ -675,9 +679,9 @@ CDORemap <- function(data_array = NULL, lons, lats, grid, method, permutation <- (1:length(dim(data_array)))[-dim_to_move] permutation <- c(permutation, dim_to_move) permutation_back <- sort(permutation, index.return = TRUE)$ix - dim_backup <- dim(data_array) +# dim_backup <- dim(data_array) data_array <- aperm(data_array, permutation) - dim(data_array) <- dim_backup[permutation] +# dim(data_array) <- dim_backup[permutation] other_dims <- which(!(names(dim(data_array)) %in% c(lon_dim, lat_dim))) } else { # We allow only lon, lat and 1 more dimension per chunk, so @@ -695,6 +699,9 @@ CDORemap <- function(data_array = NULL, lons, lats, grid, method, total_slices <- prod(dim(slices_to_iterate)) } if ((other_dims_per_chunk > 1) || (other_dims_per_chunk > 0 && is_irregular)) { + #NOTE: Why don't we use the second line here? In history, that line was never used. + # The first line sort() can cause problems. If the largest other_dims is always + # the last dim, tail(other_dims) is enough. unlimited_dim <- tail(sort(tail(other_dims_ordered_by_size, other_dims_per_chunk)), 1) #unlimited_dim <- tail(other_dims) } diff --git a/R/CRPSS.R b/R/CRPSS.R index a6b4a1405a80156149cf16cad5c955b9135428c2..e2b0df6edbd4cd1227c0e5c34991e013422c5048 100644 --- a/R/CRPSS.R +++ b/R/CRPSS.R @@ -222,11 +222,20 @@ CRPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member', if (is.null(ref)) { ## using climatology as reference forecast ## all the time steps are used as if they were members - ## then, ref dimensions are [sdate, memb], both with length(sdate) + ## then, ref dimensions are [sdate, memb] + ## memb dimension has length(sdate) - 1 due to cross-validation obs_time_len <- dim(obs)[time_dim] if (is.null(dat_dim)) { - ref <- array(data = rep(obs, each = obs_time_len), dim = c(obs_time_len, obs_time_len)) + + ## Without cross-validation: + ## ref <- array(data = rep(obs, each = obs_time_len), dim = c(obs_time_len, obs_time_len)) + ## With cross-validation (excluding the value of that year to create ref for that year): + ref <- array(data = NA, dim = c(obs_time_len, obs_time_len - 1)) + for (i in 1:obs_time_len) { + ref[i, ] <- obs[-i] + } + names(dim(ref)) <- c(time_dim, memb_dim) # ref: [sdate, memb]; obs: [sdate] crps_ref <- .CRPS(exp = ref, obs = obs, time_dim = time_dim, memb_dim = memb_dim, @@ -237,7 +246,15 @@ CRPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member', crps_ref <- array(dim = c(obs_time_len, nobs)) names(dim(crps_ref)) <- c(time_dim, 'nobs') for (i_obs in 1:nobs) { - ref <- array(data = rep(obs[, i_obs], each = obs_time_len), dim = c(obs_time_len, obs_time_len)) + + ## Without cross-validation: + ## ref <- array(data = rep(obs[, i_obs], each = obs_time_len), dim = c(obs_time_len, obs_time_len)) + ## With cross-validation (excluding the value of that year to create ref for that year): + ref <- array(data = NA, dim = c(obs_time_len, obs_time_len - 1)) + for (i in 1:obs_time_len) { + ref[i, ] <- obs[-i, i_obs] + } + names(dim(ref)) <- c(time_dim, memb_dim) crps_ref[, i_obs] <- .CRPS(exp = ref, obs = ClimProjDiags::Subset(obs, dat_dim, i_obs, drop = 'selected'), time_dim = time_dim, memb_dim = memb_dim, dat_dim = NULL, Fair = Fair) @@ -276,14 +293,14 @@ CRPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', 
memb_dim = 'member', for (i in 1:nexp) { for (j in 1:nobs) { crpss[i, j] <- 1 - crps_exp_mean[i, j] / crps_ref_mean[j] - sign[i, j] <- .RandomWalkTest(skill_A = crps_exp_mean[i, j], skill_B = crps_ref_mean[j])$signif + sign[i, j] <- .RandomWalkTest(skill_A = crps_exp_mean[i, j], skill_B = crps_ref_mean[j], sign = T, pval = F)$sign } } } else { for (i in 1:nexp) { for (j in 1:nobs) { crpss[i, j] <- 1 - crps_exp_mean[i, j] / crps_ref_mean[i, j] - sign[i, j] <- .RandomWalkTest(skill_A = crps_exp_mean[i, j], skill_B = crps_ref_mean[i, j])$signif + sign[i, j] <- .RandomWalkTest(skill_A = crps_exp_mean[i, j], skill_B = crps_ref_mean[i, j], sign = T, pval = F)$sign } } } @@ -291,7 +308,7 @@ CRPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member', } else { crpss <- 1 - mean(crps_exp) / mean(crps_ref) # Significance - sign <- .RandomWalkTest(skill_A = crps_exp, skill_B = crps_ref)$signif + sign <- .RandomWalkTest(skill_A = crps_exp, skill_B = crps_ref, sign = T, pval = F)$sign } return(list(crpss = crpss, sign = sign)) diff --git a/R/Corr.R b/R/Corr.R index 67efb09c0a1119c0970682de43518a32368f48a1..3430647ad579137b0fef76f48c33c5f03a1c000a 100644 --- a/R/Corr.R +++ b/R/Corr.R @@ -38,12 +38,16 @@ #'@param memb A logical value indicating whether to remain 'memb_dim' dimension #' (TRUE) or do ensemble mean over 'memb_dim' (FALSE). Only functional when #' 'memb_dim' is not NULL. The default value is TRUE. -#'@param pval A logical value indicating whether to compute or not the p-value +#'@param pval A logical value indicating whether to return or not the p-value #' of the test Ho: Corr = 0. The default value is TRUE. -#'@param conf A logical value indicating whether to retrieve the confidence -#' intervals or not. The default value is TRUE. -#'@param conf.lev A numeric indicating the confidence level for the -#' regression computation. The default value is 0.95. +#'@param conf A logical value indicating whether to return or not the confidence +#' intervals. The default value is TRUE. +#'@param sign A logical value indicating whether to retrieve the statistical +#' significance of the test Ho: Corr = 0 based on 'alpha'. The default value is +#' FALSE. +#'@param alpha A numeric indicating the significance level for the statistical +#' significance test. The default value is 0.05. +#'@param conf.lev Deprecated. Use alpha now instead. alpha = 1 - conf.lev. #'@param ncores An integer indicating the number of cores to use for parallel #' computation. The default value is NULL. #' @@ -68,6 +72,9 @@ #'\item{$conf.upper}{ #' The upper confidence interval. Only present if \code{conf = TRUE}. #'} +#'\item{$sign}{ +#' The statistical significance. Only present if \code{sign = TRUE}. 
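+#' It is TRUE if the p-value is not NA and is lower than or equal to 'alpha', and FALSE otherwise.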
+#'} #' #'@examples #'# Case 1: Load sample data as in Load() example: @@ -100,8 +107,8 @@ Corr <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', comp_dim = NULL, limits = NULL, method = 'pearson', memb_dim = NULL, memb = TRUE, - pval = TRUE, conf = TRUE, - conf.lev = 0.95, ncores = NULL) { + pval = TRUE, conf = TRUE, sign = FALSE, + alpha = 0.05, conf.lev = NULL, ncores = NULL) { # Check inputs ## exp and obs (1) @@ -185,9 +192,19 @@ Corr <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', if (!is.logical(conf) | length(conf) > 1) { stop("Parameter 'conf' must be one logical value.") } - ## conf.lev - if (!is.numeric(conf.lev) | conf.lev < 0 | conf.lev > 1 | length(conf.lev) > 1) { - stop("Parameter 'conf.lev' must be a numeric number between 0 and 1.") + ## sign + if (!is.logical(sign) | length(sign) > 1) { + stop("Parameter 'sign' must be one logical value.") + } + ## conf.lev + ##NOTE: remove the parameter and the warning after v1.4.0 + if (!missing("conf.lev")) { + .warning(paste0("Argument 'conf.lev' is deprecated. Please use 'alpha' instead. ", + "'alpha' = ", 1 - conf.lev, " is used."), tag = '! Deprecation: ') + alpha <- 1 - conf.lev + } + ## alpha + if (!is.numeric(alpha) | alpha < 0 | alpha > 1 | length(alpha) > 1) { + stop("Parameter 'alpha' must be a numeric value between 0 and 1.") } ## ncores if (!is.null(ncores)) { @@ -259,14 +276,15 @@ Corr <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', fun = .Corr, dat_dim = dat_dim, memb_dim = memb_dim, time_dim = time_dim, method = method, - pval = pval, conf = conf, conf.lev = conf.lev, ncores_input = ncores, + pval = pval, conf = conf, sign = sign, alpha = alpha, ncores = ncores) return(res) } -.Corr <- function(exp, obs, dat_dim = 'dataset', memb_dim = 'member', time_dim = 'sdate', method = 'pearson', - conf = TRUE, pval = TRUE, conf.lev = 0.95, ncores_input = NULL) { +.Corr <- function(exp, obs, dat_dim = 'dataset', memb_dim = 'member', + time_dim = 'sdate', method = 'pearson', + conf = TRUE, pval = TRUE, sign = FALSE, alpha = 0.05) { if (is.null(memb_dim)) { if (is.null(dat_dim)) { # exp: [sdate] @@ -372,20 +390,20 @@ Corr <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', # } # } - if (pval | conf) { + if (pval || conf || sign) { if (method == "kendall" | method == "spearman") { if (!is.null(dat_dim) | !is.null(memb_dim)) { tmp <- apply(obs, c(1:length(dim(obs)))[-1], rank) # for memb_dim = NULL, 2; for memb_dim, c(2, 3) names(dim(tmp))[1] <- time_dim - eno <- Eno(tmp, time_dim, ncores = ncores_input) + eno <- Eno(tmp, time_dim) } else { tmp <- rank(obs) tmp <- array(tmp) names(dim(tmp)) <- time_dim - eno <- Eno(tmp, time_dim, ncores = ncores_input) + eno <- Eno(tmp, time_dim) } } else if (method == "pearson") { - eno <- Eno(obs, time_dim, ncores = ncores_input) + eno <- Eno(obs, time_dim) } if (is.null(memb_dim)) { @@ -406,16 +424,19 @@ Corr <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', #############old################# #This doesn't return error but it's diff from cor.test() when method is spearman and kendall - if (pval) { - t <-sqrt(CORR * CORR * (eno_expand - 2) / (1 - (CORR ^ 2))) + if (pval || sign) { + t <- sqrt(CORR * CORR * (eno_expand - 2) / (1 - (CORR ^ 2))) p.val <- pt(t, eno_expand - 2, lower.tail = FALSE) + if (sign) signif <- !is.na(p.val) & p.val <= alpha } ################################### if (conf) { - conf.lower <- (1 - conf.lev) / 2 + conf.lower <- alpha / 2 conf.upper <- 1 - conf.lower + suppressWarnings({ conflow <- tanh(atanh(CORR) + qnorm(conf.lower) /
sqrt(eno_expand - 3)) confhigh <- tanh(atanh(CORR) + qnorm(conf.upper) / sqrt(eno_expand - 3)) + }) } ################################### @@ -433,16 +454,15 @@ Corr <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', ################################### - if (pval & conf) { - res <- list(corr = CORR, p.val = p.val, - conf.lower = conflow, conf.upper = confhigh) - } else if (pval & !conf) { - res <- list(corr = CORR, p.val = p.val) - } else if (!pval & conf) { - res <- list(corr = CORR, - conf.lower = conflow, conf.upper = confhigh) - } else { - res <- list(corr = CORR) + res <- list(corr = CORR) + if (pval) { + res <- c(res, list(p.val = p.val)) + } + if (conf) { + res <- c(res, list(conf.lower = conflow, conf.upper = confhigh)) + } + if (sign) { + res <- c(res, list(sign = signif)) } return(res) diff --git a/R/DiffCorr.R b/R/DiffCorr.R index 1e07458989db3f40db83aa954abfc8b2b8929738..93078248dcf442e6d5500977f875ca1b57db8886 100644 --- a/R/DiffCorr.R +++ b/R/DiffCorr.R @@ -155,8 +155,6 @@ DiffCorr <- function(exp, obs, ref, N.eff = NA, time_dim = 'sdate', if (!test.type %in% c('two-sided', 'one-sided')) { stop("Parameter 'test.type' must be 'two-sided' or 'one-sided'.") } - #NOTE: warning can be removed in the next release - .warning("The default significance test has changed after s2dv_1.2.0. The default method is 'two-sided'.") ## ncores if (!is.null(ncores)) { diff --git a/R/GMST.R b/R/GMST.R index 85b382d524fc763f166971994067c98cc604777b..92109ea6ae570fea6e2454f6f1f0fa75b5fa22e1 100644 --- a/R/GMST.R +++ b/R/GMST.R @@ -273,8 +273,8 @@ GMST <- function(data_tas, data_tos, data_lats, data_lons, mask_sea_land, sea_va data <- ClimProjDiags::WeightedMean(data = data, lon = data_lons, lat = data_lats, region = NULL, - londim = which(names(dim(data)) == lon_dim), - latdim = which(names(dim(data)) == lat_dim)) + londim = lon_dim, + latdim = lat_dim) if (type == 'dcpp'){ target_dims <- c(sdate_dim, fmonth_dim) diff --git a/R/GSAT.R b/R/GSAT.R index 30975b9b1b7f2678f823f3488226a7d10cff0714..0cde94ab0a798d2603a04a1b51c733b4b3032392 100644 --- a/R/GSAT.R +++ b/R/GSAT.R @@ -211,8 +211,8 @@ GSAT <- function(data, data_lats, data_lons, type, lat_dim = 'lat', lon_dim = 'l data <- ClimProjDiags::WeightedMean(data = data, lon = data_lons, lat = data_lats, region = NULL, - londim = which(names(dim(data)) == lon_dim), - latdim = which(names(dim(data)) == lat_dim)) + londim = lon_dim, + latdim = lat_dim) if (type == 'dcpp'){ target_dims <- c(sdate_dim, fmonth_dim) diff --git a/R/NAO.R b/R/NAO.R index 6d48dba6aaec6eb8fc41dba2462c097e514ccc3f..fb5220cf5ce02e73987ec9e85c951d0234c213b3 100644 --- a/R/NAO.R +++ b/R/NAO.R @@ -33,8 +33,8 @@ #'@param ftime_dim A character string indicating the name of the forecast time #' dimension of 'exp' and 'obs'. The default value is 'ftime'. #'@param ftime_avg A numeric vector of the forecast time steps to average -#' across the target period. The default value is 2:4, i.e., from 2nd to 4th -#' forecast time steps. +#' across the target period. If average is not needed, set NULL. The default +#' value is 2:4, i.e., from 2nd to 4th forecast time steps. #'@param obsproj A logical value indicating whether to compute the NAO index by #' projecting the forecast anomalies onto the leading EOF of observational #' reference (TRUE) or compute the NAO by first computing the leading @@ -48,11 +48,13 @@ #'A list which contains: #'\item{exp}{ #' A numeric array of forecast NAO index in verification format with the same -#' dimensions as 'exp' except space_dim and ftime_dim. 
+#' dimensions as 'exp' except space_dim and ftime_dim. If ftime_avg is NULL, +#' ftime_dim remains. #' } #'\item{obs}{ #' A numeric array of observed NAO index in verification format with the same -#' dimensions as 'obs' except space_dim and ftime_dim. +#' dimensions as 'obs' except space_dim and ftime_dim. If ftime_avg is NULL, +#' ftime_dim remains. #'} #' #'@references @@ -198,16 +200,18 @@ NAO <- function(exp = NULL, obs = NULL, lat, lon, time_dim = 'sdate', } } ## ftime_avg - if (!is.vector(ftime_avg) | !is.integer(ftime_avg)) { - stop("Parameter 'ftime_avg' must be an integer vector.") - } - if (!is.null(exp)) { - if (max(ftime_avg) > dim(exp)[ftime_dim] | min(ftime_avg) < 1) { - stop("Parameter 'ftime_avg' must be within the range of ftime_dim length.") + if (!is.null(ftime_avg)) { + if (!is.vector(ftime_avg) | !is.numeric(ftime_avg)) { + stop("Parameter 'ftime_avg' must be an integer vector.") } - } else { - if (max(ftime_avg) > dim(obs)[ftime_dim] | min(ftime_avg) < 1) { - stop("Parameter 'ftime_avg' must be within the range of ftime_dim length.") + if (!is.null(exp)) { + if (max(ftime_avg) > dim(exp)[ftime_dim] | min(ftime_avg) < 1) { + stop("Parameter 'ftime_avg' must be within the range of ftime_dim length.") + } + } else { + if (max(ftime_avg) > dim(obs)[ftime_dim] | min(ftime_avg) < 1) { + stop("Parameter 'ftime_avg' must be within the range of ftime_dim length.") + } } } ## sdate >= 2 @@ -281,22 +285,24 @@ NAO <- function(exp = NULL, obs = NULL, lat, lon, time_dim = 'sdate', } ## ncores if (!is.null(ncores)) { - if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores == 0 | length(ncores) > 1) { stop("Parameter 'ncores' must be a positive integer.") } } # Average ftime - if (!is.null(exp)) { - exp_sub <- ClimProjDiags::Subset(exp, ftime_dim, ftime_avg, drop = FALSE) - exp <- MeanDims(exp_sub, ftime_dim, na.rm = TRUE) - ## Cross-validated PCs. Fabian. This should be extended to - ## nmod and nlt by simple loops. Virginie - } - if (!is.null(obs)) { - obs_sub <- ClimProjDiags::Subset(obs, ftime_dim, ftime_avg, drop = FALSE) - obs <- MeanDims(obs_sub, ftime_dim, na.rm = TRUE) + if (!is.null(ftime_avg)) { + if (!is.null(exp)) { + exp_sub <- ClimProjDiags::Subset(exp, ftime_dim, ftime_avg, drop = FALSE) + exp <- MeanDims(exp_sub, ftime_dim, na.rm = TRUE) + ## Cross-validated PCs. Fabian. This should be extended to + ## nmod and nlt by simple loops. 
Virginie + } + if (!is.null(obs)) { + obs_sub <- ClimProjDiags::Subset(obs, ftime_dim, ftime_avg, drop = FALSE) + obs <- MeanDims(obs_sub, ftime_dim, na.rm = TRUE) + } } # wght diff --git a/R/ProjectField.R b/R/ProjectField.R index 309f3efd731824e3e031a473a22a657e7d7ed845..55e7fd20e39fe8c9ec9b62f59b4e52d1b2f814cc 100644 --- a/R/ProjectField.R +++ b/R/ProjectField.R @@ -237,10 +237,14 @@ ProjectField <- function(ano, eof, time_dim = 'sdate', space_dim = c('lat', 'lon # Weight e.1 <- eof_mode * wght ano <- ano * InsertDim(wght, 1, ntime) + #ano <- aaply(ano, 1, '*', wght) # much heavier - na <- apply(ano, 1, mean, na.rm = TRUE) # if [lat, lon] all NA, it's NA + na <- rowMeans(ano, na.rm = TRUE) # if [lat, lon] all NA, it's NA + #na <- apply(ano, 1, mean, na.rm = TRUE) # much heavier tmp <- ano * InsertDim(e.1, 1, ntime) # [sdate, lat, lon] - pc.ver <- apply(tmp, 1, sum, na.rm = TRUE) + rm(ano) + #pc.ver <- apply(tmp, 1, sum, na.rm = TRUE) # much heavier + pc.ver <- rowSums(tmp, na.rm = TRUE) pc.ver[which(is.na(na))] <- NA } else { # mode = NULL @@ -250,7 +254,7 @@ ProjectField <- function(ano, eof, time_dim = 'sdate', space_dim = c('lat', 'lon ano <- ano * InsertDim(wght, 1, ntime) dim(ano) <- c(ntime, prod(dim(ano)[2:3])) # [sdate, lat*lon] - na <- apply(ano, 1, mean, na.rm = TRUE) # if [lat, lon] all NA, it's NA + na <- rowMeans(ano, na.rm = TRUE) # if [lat, lon] all NA, it's NA na <- aperm(array(na, dim = c(ntime, dim(e.1)[1])), c(2, 1)) # Matrix multiplication e.1 [mode, lat*lon] by ano [lat*lon, sdate] diff --git a/R/RMS.R b/R/RMS.R index b3188fcd045518458cec57415c4bcd7da3b5621a..645e34b0c753b5017716e20fb3eaa56162089eb1 100644 --- a/R/RMS.R +++ b/R/RMS.R @@ -217,10 +217,9 @@ RMS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', nobs <- as.numeric(dim(obs)[2]) } - nsdate <- as.numeric(dim(exp)[1]) - - dif <- array(dim = c(sdate = nsdate, nexp = nexp, nobs = nobs)) + dif <- array(dim = c(dim(exp)[1], nexp = nexp, nobs = nobs)) chi <- array(dim = c(nexp = nexp, nobs = nobs)) + if (conf) { conflow <- (1 - conf.lev) / 2 confhigh <- 1 - conflow @@ -232,6 +231,7 @@ RMS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', for (i in 1:nobs) { dif[, , i] <- sapply(1:nexp, function(x) {exp[, x] - obs[, i]}) } + rms <- apply(dif^2, c(2, 3), mean, na.rm = TRUE)^0.5 #array(dim = c(_exp, nobs)) if (conf) { @@ -255,8 +255,10 @@ RMS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', # Remove nexp and nobs if dat_dim = NULL if (is.null(dat_dim)) { dim(rms) <- NULL - dim(conf.lower) <- NULL - dim(conf.upper) <- NULL + if (conf) { + dim(conf.lower) <- NULL + dim(conf.upper) <- NULL + } } ################################### @@ -269,4 +271,4 @@ RMS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', return(res) -} +} \ No newline at end of file diff --git a/R/RMSSS.R b/R/RMSSS.R index e44105cea96ba9f14c2e96f8488f4cc986c9826d..b8b3cc0eb5b7e85923f9684739888d07dddd2cc7 100644 --- a/R/RMSSS.R +++ b/R/RMSSS.R @@ -6,11 +6,12 @@ #'different, with the number of experiments/models (nexp) and the number of #'observational datasets (nobs).\cr #'RMSSS computes the root mean square error skill score of each jexp in 1:nexp -#'against each jobs in 1:nobs which gives nexp * nobs RMSSS for each other -#'grid point of the array.\cr -#'The RMSSS are computed along the time_dim dimension which should corresponds -#'to the startdate dimension.\cr -#'The p-value is optionally provided by an one-sided Fisher test.\cr +#'against each job in 1:nobs which gives nexp * nobs RMSSS for each 
grid point +#'of the array.\cr +#'The RMSSS are computed along the time_dim dimension which should correspond +#'to the start date dimension.\cr +#'The p-value and significance test are optionally provided by a one-sided +#'Fisher test or a Random Walk test.\cr #' #'@param exp A named numeric array of experimental data which contains at least #' two dimensions for dat_dim and time_dim. It can also be a vector with the @@ -22,13 +23,31 @@ #' dimension can be different. It can also be a vector with the same length as #' 'exp', then the vector will automatically be 'time_dim' and 'dat_dim' will #' be 1. +#'@param ref A named numerical array of the reference forecast data with at +#' least time dimension, or 0 (typical climatological forecast) or 1 +#' (normalized climatological forecast). If it is an array, the dimensions must +#' be the same as 'exp' except 'memb_dim' and 'dat_dim'. If there is only one +#' reference dataset, it should not have a dataset dimension. If there is a +#' corresponding reference for each experiment, the dataset dimension must +#' have the same length as in 'exp'. If 'ref' is NULL, the typical +#' climatological forecast is used as reference forecast (equivalent to 0). +#' The default value is NULL. #'@param dat_dim A character string indicating the name of dataset (nobs/nexp) #' dimension. The default value is 'dataset'. #'@param time_dim A character string indicating the name of dimension along #' which the RMSSS are computed. The default value is 'sdate'. +#'@param memb_dim A character string indicating the name of the member dimension +#' to compute the ensemble mean; it should be set to NULL if the parameters 'exp' +#' and 'ref' are already the ensemble mean. The default value is NULL. #'@param pval A logical value indicating whether to compute or not the p-value -#' of the test Ho: RMSSS = 0. If pval = TRUE, the insignificant RMSSS will -#' return NA. The default value is TRUE. +#' of the test Ho: RMSSS = 0. The default value is TRUE. +#'@param sign A logical value indicating whether to compute or not the +#' statistical significance of the test Ho: RMSSS = 0. The default value is +#' FALSE. +#'@param alpha A numeric value of the significance level to be used in the +#' statistical significance test. The default value is 0.05. +#'@param sig_method A character string indicating the significance method. The +#' options are "one-sided Fisher" (default) and "Random Walk". #'@param ncores An integer indicating the number of cores to use for parallel #' computation. The default value is NULL. #' @@ -36,12 +55,18 @@ #'A list containing the numeric arrays with dimension:\cr #' c(nexp, nobs, all other dimensions of exp except time_dim).\cr #'nexp is the number of experiment (i.e., dat_dim in exp), and nobs is the -#'number of observation (i.e., dat_dim in obs).\cr +#'number of observations (i.e., dat_dim in obs). If dat_dim is NULL, nexp and +#'nobs are omitted.\cr #'\item{$rmsss}{ -#' The root mean square error skill score. +#' A numerical array of the root mean square error skill score. #'} #'\item{$p.val}{ -#' The p-value. Only present if \code{pval = TRUE}. +#' A numerical array of the p-value with the same dimensions as $rmsss. +#' Only present if \code{pval = TRUE}. +#'} +#'\item{$sign}{ +#' A logical array of the statistical significance of the RMSSS with the same +#' dimensions as $rmsss. Only present if \code{sign = TRUE}.
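+#' The significance is computed by the method indicated in parameter 'sig_method'.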
#'} #' #'@examples @@ -49,17 +74,18 @@ #' exp <- array(rnorm(30), dim = c(dataset = 2, time = 3, memb = 5)) #' set.seed(2) #' obs <- array(rnorm(15), dim = c(time = 3, memb = 5, dataset = 1)) -#' res <- RMSSS(exp, obs, time_dim = 'time') +#' res <- RMSSS(exp, obs, time_dim = 'time', dat_dim = 'dataset') #' #'@rdname RMSSS #'@import multiApply #'@importFrom stats pf #'@export -RMSSS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', - pval = TRUE, ncores = NULL) { +RMSSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', dat_dim = 'dataset', + memb_dim = NULL, pval = TRUE, sign = FALSE, alpha = 0.05, + sig_method = 'one-sided Fisher', ncores = NULL) { # Check inputs - ## exp and obs (1) + ## exp, obs, and ref (1) if (is.null(exp) | is.null(obs)) { stop("Parameter 'exp' and 'obs' cannot be NULL.") } @@ -88,6 +114,19 @@ RMSSS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', !all(names(dim(obs)) %in% names(dim(exp)))) { stop("Parameter 'exp' and 'obs' must have same dimension name.") } + if (!is.null(ref)) { + if (!is.numeric(ref)) { + stop("Parameter 'ref' must be numeric.") + } + if (is.array(ref)) { + if (any(is.null(names(dim(ref))))| any(nchar(names(dim(ref))) == 0)) { + stop("Parameter 'ref' must have dimension names.") + } + } else if (length(ref) != 1 | any(!ref %in% c(0, 1))) { + stop("Parameter 'ref' must be a numeric array or number 0 or 1.") + } + } + ## time_dim if (!is.character(time_dim) | length(time_dim) > 1) { stop("Parameter 'time_dim' must be a character string.") @@ -105,10 +144,43 @@ RMSSS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', " Set it as NULL if there is no dataset dimension.") } } + ## memb_dim + if (!is.null(memb_dim)) { + if (!is.character(memb_dim) | length(memb_dim) > 1) { + stop("Parameter 'memb_dim' must be a character string.") + } + if (!memb_dim %in% names(dim(exp))) { + stop("Parameter 'memb_dim' is not found in 'exp' dimension.") + } + if (memb_dim %in% names(dim(obs))) { + if (identical(as.numeric(dim(obs)[memb_dim]), 1)) { + obs <- ClimProjDiags::Subset(x = obs, along = memb_dim, indices = 1, drop = 'selected') + } else { + stop("Not implemented for observations with members ('obs' can have 'memb_dim', ", + "but it should be of length = 1).") + } + } + } ## pval if (!is.logical(pval) | length(pval) > 1) { stop("Parameter 'pval' must be one logical value.") } + ## sign + if (!is.logical(sign) | length(sign) > 1) { + stop("Parameter 'sign' must be one logical value.") + } + ## alpha + if (!is.numeric(alpha) | length(alpha) > 1) { + stop("Parameter 'alpha' must be one numeric value.") + } + ## sig_method + if (length(sig_method) != 1 | !any(sig_method %in% c('one-sided Fisher', 'Random Walk'))) { + stop("Parameter 'sig_method' must be one of 'one-sided Fisher' or 'Random Walk'.") + } + if (sig_method == "Random Walk" & pval == T) { + warning("p-value cannot be calculated by significance method 'Random Walk'.") + pval <- FALSE + } ## ncores if (!is.null(ncores)) { if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | @@ -119,63 +191,185 @@ RMSSS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', ## exp and obs (2) name_exp <- sort(names(dim(exp))) name_obs <- sort(names(dim(obs))) + if (!is.null(memb_dim)) { + name_exp <- name_exp[-which(name_exp == memb_dim)] + } if (!is.null(dat_dim)) { name_exp <- name_exp[-which(name_exp == dat_dim)] name_obs <- name_obs[-which(name_obs == dat_dim)] } if(!all(dim(exp)[name_exp] == dim(obs)[name_obs])) { stop(paste0("Parameter 'exp' and 'obs' must have 
same length of ", - "all dimension except 'dat_dim'.")) + "all dimension except 'memb_dim' and 'dat_dim'.")) + } + if (!is.null(ref)) { + name_ref <- sort(names(dim(ref))) + if (!is.null(memb_dim) && memb_dim %in% name_ref) { + name_ref <- name_ref[-which(name_ref == memb_dim)] + } + if (!is.null(dat_dim)) { + if (dat_dim %in% name_ref) { + if (!identical(dim(exp)[dat_dim], dim(ref)[dat_dim])) { + stop(paste0("If parameter 'ref' has dataset dimension, it must be ", + "equal to dataset dimension of 'exp'.")) + } + name_ref <- name_ref[-which(name_ref == dat_dim)] + } + } + if (!identical(length(name_exp), length(name_ref)) | + !identical(dim(exp)[name_exp], dim(ref)[name_ref])) { + stop(paste0("Parameter 'exp' and 'ref' must have the same length of ", + "all dimensions except 'memb_dim' and 'dat_dim' if there is ", + "only one reference dataset.")) + } } + if (dim(exp)[time_dim] <= 2) { stop("The length of time_dim must be more than 2 to compute RMSSS.") } ############################### - # Sort dimension - name_exp <- names(dim(exp)) - name_obs <- names(dim(obs)) - order_obs <- match(name_exp, name_obs) - obs <- Reorder(obs, order_obs) +# # Sort dimension +# name_exp <- names(dim(exp)) +# name_obs <- names(dim(obs)) +# order_obs <- match(name_exp, name_obs) +# obs <- Reorder(obs, order_obs) + ############################### + # Create ref array if needed + if (is.null(ref)) ref <- 0 + if (!is.array(ref)) { + ref <- array(data = ref, dim = dim(exp)) + } + + ############################### + ## Ensemble mean + if (!is.null(memb_dim)) { + exp <- MeanDims(exp, memb_dim, na.rm = T) + if (!is.null(ref) & memb_dim %in% names(dim(ref))) { + ref <- MeanDims(ref, memb_dim, na.rm = T) + } + } + ############################### # Calculate RMSSS - - res <- Apply(list(exp, obs), - target_dims = list(c(time_dim, dat_dim), - c(time_dim, dat_dim)), + +# if (!is.null(ref)) { # use "ref" as reference forecast +# if (!is.null(dat_dim) && dat_dim %in% names(dim(ref))) { +# target_dims_ref <- c(time_dim, dat_dim) +# } else { +# target_dims_ref <- c(time_dim) +# } +# data <- list(exp = exp, obs = obs, ref = ref) +# target_dims = list(exp = c(time_dim, dat_dim), +# obs = c(time_dim, dat_dim), +# ref = target_dims_ref) +# } else { +# data <- list(exp = exp, obs = obs) +# target_dims = list(exp = c(time_dim, dat_dim), +# obs = c(time_dim, dat_dim)) +# } + data <- list(exp = exp, obs = obs, ref = ref) + if (!is.null(dat_dim)) { + if (dat_dim %in% names(dim(ref))) { + target_dims <- list(exp = c(time_dim, dat_dim), + obs = c(time_dim, dat_dim), + ref = c(time_dim, dat_dim)) + } else { + target_dims <- list(exp = c(time_dim, dat_dim), + obs = c(time_dim, dat_dim), + ref = c(time_dim)) + } + } else { + target_dims <- list(exp = time_dim, obs = time_dim, ref = time_dim) + } + + res <- Apply(data, + target_dims = target_dims, fun = .RMSSS, time_dim = time_dim, dat_dim = dat_dim, - pval = pval, ncores_input = ncores, + pval = pval, sign = sign, alpha = alpha, + sig_method = sig_method, ncores = ncores) return(res) } -.RMSSS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', pval = TRUE, - ncores_input = NULL) { +.RMSSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', dat_dim = 'dataset', pval = TRUE, + sign = FALSE, alpha = 0.05, sig_method = 'one-sided Fisher') { + # exp: [sdate, (dat)] + # obs: [sdate, (dat)] + # ref: [sdate, (dat)] or NULL + + if (is.null(ref)) { + ref <- array(data = 0, dim = dim(obs)) + } else if (identical(ref, 0) | identical(ref, 1)) { + ref <- array(ref, dim = dim(exp)) + } + if 
(is.null(dat_dim)) { # exp: [sdate] # obs: [sdate] nexp <- 1 nobs <- 1 - dim(exp) <- c(dim(exp), nexp = nexp) - dim(obs) <- c(dim(obs), nobs = nobs) + nref <- 1 + # Add dat dim back temporarily + dim(exp) <- c(dim(exp), dat = 1) + dim(obs) <- c(dim(obs), dat = 1) + dim(ref) <- c(dim(ref), dat = 1) + } else { # exp: [sdate, dat_exp] # obs: [sdate, dat_obs] nexp <- as.numeric(dim(exp)[2]) nobs <- as.numeric(dim(obs)[2]) + if (dat_dim %in% names(dim(ref))) { + nref <- as.numeric(dim(ref)[2]) + } else { + dim(ref) <- c(dim(ref), dat = 1) + nref <- 1 + } } nsdate <- as.numeric(dim(exp)[1]) - - p_val <- array(dim = c(nexp = nexp, nobs = nobs)) + + # RMS of forecast dif1 <- array(dim = c(nsdate, nexp, nobs)) names(dim(dif1)) <- c(time_dim, 'nexp', 'nobs') + for (i in 1:nobs) { + dif1[, , i] <- sapply(1:nexp, function(x) {exp[, x] - obs[, i]}) + } + + rms_exp <- apply(dif1^2, c(2, 3), mean, na.rm = TRUE)^0.5 #array(dim = c(nexp, nobs)) + + # RMS of reference +# if (!is.null(ref)) { + dif2 <- array(dim = c(nsdate, nref, nobs)) + names(dim(dif2)) <- c(time_dim, 'nexp', 'nobs') + for (i in 1:nobs) { + dif2[, , i] <- sapply(1:nref, function(x) {ref[, x] - obs[, i]}) + } + rms_ref <- apply(dif2^2, c(2, 3), mean, na.rm = TRUE)^0.5 #array(dim = c(nref, nobs)) + if (nexp != nref) { + # expand rms_ref to nexp (nref is 1) + rms_ref <- array(rms_ref, dim = c(nobs = nobs, nexp = nexp)) + rms_ref <- Reorder(rms_ref, c(2, 1)) + } +# } else { +# rms_ref <- array(colMeans(obs^2, na.rm = TRUE)^0.5, dim = c(nobs = nobs, nexp = nexp)) +## rms_ref[which(abs(rms_ref) <= (max(abs(rms_ref), na.rm = TRUE) / 1000))] <- max(abs( +## rms_ref), na.rm = TRUE) / 1000 +# rms_ref <- Reorder(rms_ref, c(2, 1)) +# #rms_ref above: [nexp, nobs] +# } + + rmsss <- 1 - rms_exp / rms_ref + +################################################# + # if (conf) { # conflow <- (1 - conf.lev) / 2 # confhigh <- 1 - conflow @@ -183,44 +377,51 @@ RMSSS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', # conf_high <- array(dim = c(nexp = nexp, nobs = nobs)) # } - # dif1 - for (i in 1:nobs) { - dif1[, , i] <- sapply(1:nexp, function(x) {exp[, x] - obs[, i]}) - } + if (sig_method == 'one-sided Fisher') { + p_val <- array(dim = c(nexp = nexp, nobs = nobs)) + ## pval and sign + if (pval || sign) { + eno1 <- Eno(dif1, time_dim) + if (is.null(ref)) { + eno2 <- Eno(obs, time_dim) + eno2 <- array(eno2, dim = c(nobs = nobs, nexp = nexp)) + eno2 <- Reorder(eno2, c(2, 1)) + } else { + eno2 <- Eno(dif2, time_dim) + if (nref != nexp) { + eno2 <- array(eno2, dim = c(nobs = nobs, nexp = nexp)) + eno2 <- Reorder(eno2, c(2, 1)) + } + } - # rms1 and eno1 - rms1 <- apply(dif1^2, c(2, 3), mean, na.rm = TRUE)^0.5 #array(dim = c(nexp, nobs)) - # rms2 and eno2 - rms2 <- array(colMeans(obs^2, na.rm = TRUE)^0.5, dim = c(nobs = nobs)) - rms2[which(abs(rms2) <= (max(abs(rms2), na.rm = TRUE) / 1000))] <- max(abs( - rms2), na.rm = TRUE) / 1000 - #rms2 above: [nobs] - rms2 <- array(rms2, dim = c(nobs = nobs, nexp = nexp)) - #rms2 above: [nobs, nexp] - rms2 <- Reorder(rms2, c(2, 1)) - #rms2 above: [nexp, nobs] - - # use rms1 and rms2 to calculate rmsss - rmsss <- 1 - rms1/rms2 - - ## pval and conf - if (pval) { - eno1 <- Eno(dif1, time_dim, ncores = ncores_input) - eno2 <- Eno(obs, time_dim, ncores = ncores_input) - eno2 <- array(eno2, dim = c(nobs = nobs, nexp = nexp)) - eno2 <- Reorder(eno2, c(2, 1)) - } + F.stat <- (eno2 * rms_ref^2 / (eno2 - 1)) / ((eno1 * rms_exp^2 / (eno1- 1))) + tmp <- !is.na(eno1) & !is.na(eno2) & eno1 > 2 & eno2 > 2 + p_val <- 1 - pf(F.stat, 
eno1 - 1, eno2 - 1) + if (sign) signif <- p_val <= alpha + # If there isn't enough valid data, return NA + p_val[which(!tmp)] <- NA + if (sign) signif[which(!tmp)] <- NA + + # set rmsss to NA if there isn't enough valid data + rmsss[which(!tmp)] <- NA + } - # pval - if (pval) { + } else if (sig_method == "Random Walk") { + signif <- array(dim = c(nexp = nexp, nobs = nobs)) + for (i in 1:nexp) { + for (j in 1:nobs) { - F.stat <- (eno2 * rms2^2 / (eno2- 1)) / ((eno1 * rms1^2 / (eno1- 1))) - tmp <- !is.na(eno1) & !is.na(eno2) & eno1 > 2 & eno2 > 2 - p_val <- 1 - pf(F.stat, eno1 - 1, eno2 - 1) - p_val[which(!tmp)] <- NA - - # change not significant rmsss to NA - rmsss[which(!tmp)] <- NA + # Error + error_exp <- array(data = abs(exp[, i] - obs[, j]), dim = c(time = nsdate)) + if (nref == nexp) { + error_ref <- array(data = abs(ref[, i] - obs[, j]), dim = c(time = nsdate)) + } else { + # nref = 1 + error_ref <- array(data = abs(ref - obs[, j]), dim = c(time = nsdate)) + } + signif[i, j] <- .RandomWalkTest(skill_A = error_exp, skill_B = error_ref)$sign + } + } } ################################### @@ -228,15 +429,19 @@ RMSSS <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', if (is.null(dat_dim)) { dim(rmsss) <- NULL dim(p_val) <- NULL + if (sign) dim(signif) <- NULL } ################################### # output + res <- list(rmsss = rmsss) if (pval) { - res <- list(rmsss = rmsss, p.val = p_val) - - } else { - res <- list(rmsss = rmsss) + p.val <- list(p.val = p_val) + res <- c(res, p.val) + } + if (sign) { + signif <- list(sign = signif) + res <- c(res, signif) } return(res) diff --git a/R/ROCSS.R b/R/ROCSS.R new file mode 100644 index 0000000000000000000000000000000000000000..7831a8830099d64f39ba2934ae21763d198fa742 --- /dev/null +++ b/R/ROCSS.R @@ -0,0 +1,301 @@ +#'Compute the Relative Operating Characteristic Skill Score +#' +#'The Relative Operating Characteristic Skill Score (ROCSS; Kharin and Zwiers, +#'2003) is based on the ROC curve, which gives information about the hit rates +#'against the false-alarm rates for a particular category or event. The ROC +#'curve can be summarized with the area under the ROC curve, known as the ROC +#'score, to provide a skill value for each category. The ROCSS ranges between +#'minus infinity and 1. A positive ROCSS value indicates that the forecast has +#'higher skill than the reference forecast; a negative value indicates the +#'contrary. +#' +#'@param exp A named numerical array of the forecast with at least time and +#' member dimension. +#'@param obs A named numerical array of the observation with at least time +#' dimension. The dimensions must be the same as 'exp' except 'memb_dim' and +#' 'dat_dim'. +#'@param ref A named numerical array of the reference forecast data with at +#' least time and member dimension. The dimensions must be the same as 'exp' +#' except 'memb_dim' and 'dat_dim'. If there is only one reference dataset, +#' it should not have a dataset dimension. If there is a corresponding reference +#' for each experiment, the dataset dimension must have the same length as in +#' 'exp'. If 'ref' is NULL, a random forecast is used as the reference forecast. +#' The default value is NULL. +#'@param time_dim A character string indicating the name of the time dimension. +#' The default value is 'sdate'. +#'@param memb_dim A character string indicating the name of the member dimension +#' to compute the probabilities of the forecast and the reference forecast. The +#' default value is 'member'.
+#'@param dat_dim A character string indicating the name of the dataset dimension. +#' The length of this dimension can be different between 'exp' and 'obs'. +#' The default value is NULL. +#'@param prob_thresholds A numeric vector of the relative thresholds (from 0 to +#' 1) between the categories. The default value is c(1/3, 2/3), which +#' corresponds to tercile equiprobable categories. +#'@param indices_for_clim A vector of the indices to be taken along 'time_dim' +#' for computing the thresholds between the probabilistic categories. If NULL, +#' the whole period is used. The default value is NULL. +#'@param cross.val A logical indicating whether to compute the thresholds +#' between probabilistic categories in cross-validation. The default value is +#' FALSE. +#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. +#' +#'@return +#'A numerical array of ROCSS with the same dimensions as 'exp' excluding +#''time_dim' and 'memb_dim' dimensions and including the 'cat' dimension, which +#'corresponds to each category. The length of the 'cat' dimension corresponds +#'to the number of probabilistic categories, i.e., 1 + length(prob_thresholds). +#'If there are multiple datasets, two additional dimensions 'nexp' and 'nobs' +#'are added. +#' +#'@references +#'Kharin, V. V. and Zwiers, F. W. (2003): +#' https://doi.org/10.1175/1520-0442(2003)016%3C4145:OTRSOP%3E2.0.CO;2 +#' +#'@examples +#'exp <- array(rnorm(1000), dim = c(lon = 3, lat = 2, sdate = 60, member = 10)) +#'ref <- array(rnorm(1000), dim = c(lon = 3, lat = 2, sdate = 60, member = 10)) +#'obs <- array(rnorm(1000), dim = c(lon = 3, lat = 2, sdate = 60)) +#'ROCSS(exp = exp, obs = obs) ## random forecast as reference forecast +#'ROCSS(exp = exp, obs = obs, ref = ref) ## ref as reference forecast +#' +#'@import multiApply +#'@importFrom easyVerification EnsRoca +#'@export +ROCSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member', + dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, + cross.val = FALSE, ncores = NULL) { + + # Check inputs + ## exp, obs, and ref (1) + if (!is.array(exp) | !is.numeric(exp)) { + stop("Parameter 'exp' must be a numeric array.") + } + if (!is.array(obs) | !is.numeric(obs)) { + stop("Parameter 'obs' must be a numeric array.") + } + if (any(is.null(names(dim(exp)))) | any(nchar(names(dim(exp))) == 0) | + any(is.null(names(dim(obs)))) | any(nchar(names(dim(obs))) == 0)) { + stop("Parameter 'exp' and 'obs' must have dimension names.") + } + if (!is.null(ref)) { + if (!is.array(ref) | !is.numeric(ref)) + stop("Parameter 'ref' must be a numeric array.") + if (any(is.null(names(dim(ref)))) | any(nchar(names(dim(ref))) == 0)) { + stop("Parameter 'ref' must have dimension names.") + } + } + ## time_dim + if (!is.character(time_dim) | length(time_dim) != 1) { + stop("Parameter 'time_dim' must be a character string.") + } + if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) { + stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.") + } + if (!is.null(ref) & !time_dim %in% names(dim(ref))) { + stop("Parameter 'time_dim' is not found in 'ref' dimension.") + } + ## memb_dim + if (!is.character(memb_dim) | length(memb_dim) > 1) { + stop("Parameter 'memb_dim' must be a character string.") + } + if (!memb_dim %in% names(dim(exp))) { + stop("Parameter 'memb_dim' is not found in 'exp' dimension.") + } + if (!is.null(ref) & !memb_dim %in% names(dim(ref))) { + stop("Parameter 'memb_dim' is not found in 'ref'
dimension.") + } + ## dat_dim + if (!is.null(dat_dim)) { + if (!is.character(dat_dim) | length(dat_dim) > 1) { + stop("Parameter 'dat_dim' must be a character string.") + } + if (!dat_dim %in% names(dim(exp)) | !dat_dim %in% names(dim(obs))) { + stop("Parameter 'dat_dim' is not found in 'exp' or 'obs' dimension.", + " Set it as NULL if there is no dataset dimension.") + } + } + ## exp, obs, and ref (2) + name_exp <- sort(names(dim(exp))) + name_obs <- sort(names(dim(obs))) + name_exp <- name_exp[-which(name_exp == memb_dim)] + if (memb_dim %in% name_obs) { + name_obs <- name_obs[-which(name_obs == memb_dim)] + } + if (!is.null(dat_dim)) { + name_exp <- name_exp[-which(name_exp == dat_dim)] + name_obs <- name_obs[-which(name_obs == dat_dim)] + } + if (!identical(length(name_exp), length(name_obs)) | + !identical(dim(exp)[name_exp], dim(obs)[name_obs])) { + stop(paste0("Parameter 'exp' and 'obs' must have same length of ", + "all dimensions except 'memb_dim' and 'dat_dim'.")) + } + if (!is.null(ref)) { + name_ref <- sort(names(dim(ref))) + name_ref <- name_ref[-which(name_ref == memb_dim)] + if (!is.null(dat_dim)) { + if (dat_dim %in% name_ref) { + if (!identical(dim(exp)[dat_dim], dim(ref)[dat_dim])) { + stop(paste0("If parameter 'ref' has dataset dimension, it must be", + " equal to dataset dimension of 'exp'.")) + } + name_ref <- name_ref[-which(name_ref == dat_dim)] + } + } + if (!identical(length(name_exp), length(name_ref)) | + !identical(dim(exp)[name_exp], dim(ref)[name_ref])) { + stop(paste0("Parameter 'exp' and 'ref' must have the same length of ", + "all dimensions except 'memb_dim' and 'dat_dim'", + " if there is only one reference dataset.")) + } + } + ## prob_thresholds + if (!is.numeric(prob_thresholds) | !is.vector(prob_thresholds) | + any(prob_thresholds <= 0) | any(prob_thresholds >= 1)) { + stop("Parameter 'prob_thresholds' must be a numeric vector between 0 and 1.") + } + ## indices_for_clim + if (is.null(indices_for_clim)) { + indices_for_clim <- 1:dim(obs)[time_dim] + } else { + if (!is.numeric(indices_for_clim) | !is.vector(indices_for_clim)) { + stop("Parameter 'indices_for_clim' must be NULL or a numeric vector.") + } else if (length(indices_for_clim) > dim(obs)[time_dim] | + max(indices_for_clim) > dim(obs)[time_dim] | + any(indices_for_clim) < 1) { + stop("Parameter 'indices_for_clim' should be the indices of 'time_dim'.") + } + } + ## cross.val + if (!is.logical(cross.val) | length(cross.val) > 1) { + stop("Parameter 'cross.val' must be either TRUE or FALSE.") + } + ## ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | + length(ncores) > 1) { + stop("Parameter 'ncores' must be either NULL or a positive integer.") + } + } + + ############################### + + # Compute ROCSS + ## output_dims + if (is.null(dat_dim)) { + output_dims <- 'cat' + } else { + output_dims <- c('nexp', 'nobs', 'cat') + } + ## target_dims + if (!memb_dim %in% names(dim(obs))) { + target_dims_obs <- c(time_dim, dat_dim) + } else { + target_dims_obs <- c(time_dim, memb_dim, dat_dim) + } + ## If ref doesn't have & dat_dim is not NULL + if (!is.null(ref) && !is.null(dat_dim) &&!dat_dim %in% names(dim(ref))) { + target_dims_ref <- c(time_dim, memb_dim) + } else { + target_dims_ref <- c(time_dim, memb_dim, dat_dim) + } + + if (!is.null(ref)) { ## reference forecast is provided + res <- Apply(data = list(exp = exp, obs = obs, ref = ref), + target_dims = list(exp = c(time_dim, memb_dim, dat_dim), + obs = target_dims_obs, + ref = target_dims_ref), + 
output_dims = output_dims, + fun = .ROCSS, + prob_thresholds = prob_thresholds, + indices_for_clim = indices_for_clim, + time_dim = time_dim, dat_dim = dat_dim, + cross.val = cross.val, + ncores = ncores)$output1 + + } else { ## Random forecast as reference forecast + res <- Apply(data = list(exp = exp, obs = obs), + target_dims = list(exp = c(time_dim, memb_dim, dat_dim), + obs = target_dims_obs), + output_dims = output_dims, + fun = .ROCSS, + ref = ref, + prob_thresholds = prob_thresholds, + indices_for_clim = indices_for_clim, + time_dim = time_dim, dat_dim = dat_dim, + cross.val = cross.val, + ncores = ncores)$output1 + } + + return(res) +} + +.ROCSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', dat_dim = NULL, prob_thresholds = c(1/3, 2/3), + indices_for_clim = NULL, cross.val = FALSE) { + + # exp: [sdate, memb, (dat)] + # obs: [sdate, (dat)] + # ref: [sdate, memb, (dat)] or NULL + + if (is.null(dat_dim)) { + nexp <- 1 + nobs <- 1 + dat_dim <- 'dat' + dim(exp) <- c(dim(exp), dat = 1) + dim(obs) <- c(dim(obs), dat = 1) + if (!is.null(ref)) { + dim(ref) <- c(dim(ref), dat = 1) + } + remove_dat_dim <- TRUE + } else { + nexp <- as.numeric(dim(exp)[dat_dim]) + nobs <- as.numeric(dim(obs)[dat_dim]) + if (!is.null(ref) && !dat_dim %in% names(dim(ref))) { # make ref have the same dat dim as exp + ref <- array(ref, dim = dim(exp)) + } + remove_dat_dim <- FALSE + } + + ncats <- 1 + length(prob_thresholds) + rocs_exp <- array(dim = c(nexp = nexp, nobs = nobs, cat = ncats)) + if (!is.null(ref)) rocs_ref <- array(dim = dim(rocs_exp)) + + for (exp_i in 1:nexp) { + for (obs_i in 1:nobs) { + + # Input dim for .get_probs + ## if exp: [sdate, memb] + ## if obs: [sdate, (memb)] + exp_probs <- .get_probs(ClimProjDiags::Subset(exp, dat_dim, exp_i, drop = 'selected'), + indices_for_quantiles = indices_for_clim, + prob_thresholds = prob_thresholds, cross.val = cross.val) + obs_probs <- .get_probs(data = ClimProjDiags::Subset(obs, dat_dim, obs_i, drop = 'selected'), + indices_for_quantiles = indices_for_clim, + prob_thresholds = prob_thresholds, cross.val = cross.val) + ## exp_probs and obs_probs: [bin, sdate] + + ## ROCS (exp) + rocs_exp[exp_i, obs_i, ] <- unlist(EnsRoca(ens = Reorder(exp_probs, c(time_dim, 'bin')), + obs = Reorder(obs_probs, c(time_dim, 'bin')))[1:ncats]) + + if (!is.null(ref)) { + ref_probs <- .get_probs(ClimProjDiags::Subset(ref, dat_dim, exp_i, drop = 'selected'), + indices_for_quantiles = indices_for_clim, + prob_thresholds = prob_thresholds, cross.val = cross.val) + rocs_ref[exp_i, obs_i, ] <- unlist(EnsRoca(ens = Reorder(ref_probs, c(time_dim, 'bin')), + obs = Reorder(obs_probs, c(time_dim, 'bin')))[1:ncats]) + } + } + } + + ## ROCSS + if (is.null(ref)) { ## Random forecast as reference forecast + rocss <- 2 * rocs_exp - 1 + } else { ## Reference forecast is provided + rocss <- (rocs_exp - rocs_ref) / (1 - rocs_ref) + } + if (remove_dat_dim) { + rocss <- array(rocss, dim = dim(rocss)['cat']) + } + + return(rocss) +} diff --git a/R/RPS.R b/R/RPS.R index 76c81d254bf8758327290ba8946f7dfc9e7aad77..32d88a42b56b1c93742310552edaa2c3f7bb5383 100644 --- a/R/RPS.R +++ b/R/RPS.R @@ -39,6 +39,9 @@ #' ensemble should have at least 70 members or span at least 10 time steps and #' have more than 45 members if consistency between the weighted and unweighted #' methodologies is desired. +#'@param cross.val A logical indicating whether to compute the thresholds between +#' probabilistic categories in cross-validation. +#' The default value is FALSE. 
#'@param ncores An integer indicating the number of cores to use for parallel #' computation. The default value is NULL. #' @@ -61,7 +64,7 @@ #'@export RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, Fair = FALSE, - weights = NULL, ncores = NULL) { + weights = NULL, cross.val = FALSE, ncores = NULL) { # Check inputs ## exp and obs (1) @@ -133,6 +136,10 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', dat_dim = NUL if (!is.logical(Fair) | length(Fair) > 1) { stop("Parameter 'Fair' must be either TRUE or FALSE.") } + ## cross.val + if (!is.logical(cross.val) | length(cross.val) > 1) { + stop("Parameter 'cross.val' must be either TRUE or FALSE.") + } ## weights if (!is.null(weights)) { if (!is.array(weights) | !is.numeric(weights)) @@ -184,7 +191,7 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', dat_dim = NUL memb_dim = memb_dim, prob_thresholds = prob_thresholds, indices_for_clim = indices_for_clim, Fair = Fair, - weights = weights, ncores = ncores)$output1 + weights = weights, cross.val = cross.val, ncores = ncores)$output1 # Return only the mean RPS rps <- MeanDims(rps, time_dim, na.rm = FALSE) @@ -194,7 +201,8 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', dat_dim = NUL .RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', dat_dim = NULL, - prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, Fair = FALSE, weights = NULL) { + prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, Fair = FALSE, weights = NULL, + cross.val = FALSE) { # exp: [sdate, memb, (dat)] # obs: [sdate, (memb), (dat)] @@ -232,10 +240,10 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', dat_dim = NUL } exp_probs <- .get_probs(data = exp_data, indices_for_quantiles = indices_for_clim, - prob_thresholds = prob_thresholds, weights = weights_data) + prob_thresholds = prob_thresholds, weights = weights_data, cross.val = cross.val) # exp_probs: [bin, sdate] obs_probs <- .get_probs(data = obs_data, indices_for_quantiles = indices_for_clim, - prob_thresholds = prob_thresholds, weights = NULL) + prob_thresholds = prob_thresholds, weights = NULL, cross.val = cross.val) # obs_probs: [bin, sdate] probs_exp_cumsum <- apply(exp_probs, 2, cumsum) probs_obs_cumsum <- apply(obs_probs, 2, cumsum) @@ -261,67 +269,3 @@ RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', dat_dim = NUL return(rps) } -.get_probs <- function(data, indices_for_quantiles, prob_thresholds, weights = NULL) { - # if exp: [sdate, memb] - # if obs: [sdate, (memb)] - - # Add dim [memb = 1] to obs if it doesn't have memb_dim - if (length(dim(data)) == 1) dim(data) <- c(dim(data), 1) - - # Absolute thresholds - if (is.null(weights)) { - quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE) - } else { - # weights: [sdate, memb] - sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], weights[indices_for_quantiles, ]) - sorted_data <- sorted_arrays$data - cumulative_weights <- sorted_arrays$cumulative_weights - quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y - } - - # Probabilities - probs <- array(dim = c(bin = length(quantiles) + 1, dim(data)[1])) # [bin, sdate] - for (i_time in 1:dim(data)[1]) { - if (anyNA(data[i_time, ])) { - probs[, i_time] <- rep(NA, dim = length(quantiles) + 1) - } else { - if (is.null(weights)) { - probs[, i_time] <- 
colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles)) - } else { - sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ]) - sorted_data <- sorted_arrays$data - cumulative_weights <- sorted_arrays$cumulative_weights - # find any quantiles that are outside the data range - integrated_probs <- array(dim = c(bin = length(quantiles))) - for (i_quant in 1:length(quantiles)) { - # for thresholds falling under the distribution - if (quantiles[i_quant] < min(sorted_data)) { - integrated_probs[i_quant] <- 0 - # for thresholds falling over the distribution - } else if (max(sorted_data) < quantiles[i_quant]) { - integrated_probs[i_quant] <- 1 - } else { - integrated_probs[i_quant] <- approx(sorted_data, cumulative_weights, quantiles[i_quant], "linear")$y - } - } - probs[, i_time] <- append(integrated_probs, 1) - append(0, integrated_probs) - if (min(probs[, i_time]) < 0 | max(probs[, i_time]) > 1) { - stop(paste0("Probability in i_time = ", i_time, " is out of [0, 1].")) - } - } - } - } - return(probs) -} - -.sorted_distributions <- function(data_vector, weights_vector) { - weights_vector <- as.vector(weights_vector) - data_vector <- as.vector(data_vector) - weights_vector <- weights_vector / sum(weights_vector) # normalize to 1 - sorter <- order(data_vector) - sorted_weights <- weights_vector[sorter] - cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights - cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0 - cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1 - return(list("data" = data_vector[sorter], "cumulative_weights" = cumulative_weights)) -} diff --git a/R/RPSS.R b/R/RPSS.R index 3d50d2bc554317949a4f2bba8c25d928502eddef..efd79500892e59c474460506dd95d054f67a5471 100644 --- a/R/RPSS.R +++ b/R/RPSS.R @@ -53,6 +53,9 @@ #' time steps and have more than 45 members if consistency between the weighted #' and unweighted methodologies is desired. #'@param weights_ref Same as 'weights_exp' but for the reference forecast. +#'@param cross.val A logical indicating whether to compute the thresholds between +#' probabilistic categories in cross-validation. +#' The default value is FALSE. #'@param ncores An integer indicating the number of cores to use for parallel #' computation. The default value is NULL. #' @@ -91,7 +94,8 @@ #'@export RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member', dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, - Fair = FALSE, weights = NULL, weights_exp = NULL, weights_ref = NULL, ncores = NULL) { + Fair = FALSE, weights = NULL, weights_exp = NULL, weights_ref = NULL, + cross.val = FALSE, ncores = NULL) { # Check inputs ## exp, obs, and ref (1) @@ -198,6 +202,10 @@ RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member', if (!is.logical(Fair) | length(Fair) > 1) { stop("Parameter 'Fair' must be either TRUE or FALSE.") } + ## cross.val + if (!is.logical(cross.val) | length(cross.val) > 1) { + stop("Parameter 'cross.val' must be either TRUE or FALSE.") + } ## weights if (!is.null(weights)) { .warning(paste0("Parameter 'weights' is deprecated and will be removed in the next release. 
", @@ -301,6 +309,7 @@ RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member', indices_for_clim = indices_for_clim, Fair = Fair, weights_exp = weights_exp, weights_ref = weights_ref, + cross.val = cross.val, ncores = ncores) return(output) @@ -309,7 +318,7 @@ RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member', .RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member', dat_dim = NULL, prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, Fair = FALSE, - weights_exp = NULL, weights_ref = NULL) { + weights_exp = NULL, weights_ref = NULL, cross.val = FALSE) { # exp: [sdate, memb, (dat)] # obs: [sdate, (memb), (dat)] @@ -326,7 +335,7 @@ RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member', # RPS of the forecast rps_exp <- .RPS(exp = exp, obs = obs, time_dim = time_dim, memb_dim = memb_dim, dat_dim = dat_dim, prob_thresholds = prob_thresholds, indices_for_clim = indices_for_clim, - Fair = Fair, weights = weights_exp) + Fair = Fair, weights = weights_exp, cross.val = cross.val) # RPS of the reference forecast if (is.null(ref)) { ## using climatology as reference forecast @@ -343,7 +352,7 @@ RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member', if (is.null(dim(obs_data))) dim(obs_data) <- c(dim(obs)[1:2]) # obs_probs: [bin, sdate] obs_probs <- .get_probs(data = obs_data, indices_for_quantiles = indices_for_clim, - prob_thresholds = prob_thresholds, weights = NULL) + prob_thresholds = prob_thresholds, weights = NULL, cross.val = cross.val) # clim_probs: [bin, sdate] clim_probs <- c(prob_thresholds[1], diff(prob_thresholds), 1 - prob_thresholds[length(prob_thresholds)]) clim_probs <- array(clim_probs, dim = dim(obs_probs)) @@ -380,7 +389,7 @@ RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member', rps_ref <- .RPS(exp = ref, obs = obs, time_dim = time_dim, memb_dim = memb_dim, dat_dim = dat_dim, prob_thresholds = prob_thresholds, indices_for_clim = indices_for_clim, - Fair = Fair, weights = weights_ref) + Fair = Fair, weights = weights_ref, cross.val = cross.val) if (!is.null(dat_dim)) { if (isTRUE(remove_dat_dim)) { dim(rps_ref) <- dim(rps_ref)[-2] @@ -399,21 +408,21 @@ RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member', for (i in 1:nexp) { for (j in 1:nobs) { rpss[i, j] <- 1 - rps_exp_mean[i, j] / rps_ref_mean[j] - sign[i, j] <- .RandomWalkTest(skill_A = rps_exp_mean[i, j], skill_B = rps_ref_mean[j])$signif + sign[i, j] <- .RandomWalkTest(skill_A = rps_exp[, i, j], skill_B = rps_ref[, j])$sign } } } else { for (i in 1:nexp) { for (j in 1:nobs) { rpss[i, j] <- 1 - rps_exp_mean[i, j] / rps_ref_mean[i, j] - sign[i, j] <- .RandomWalkTest(skill_A = rps_exp_mean[i, j], skill_B = rps_ref_mean[i, j])$signif + sign[i, j] <- .RandomWalkTest(skill_A = rps_exp[, i, j], skill_B = rps_ref[, i, j])$sign } } } } else { rpss <- 1 - mean(rps_exp) / mean(rps_ref) # Significance - sign <- .RandomWalkTest(skill_A = rps_exp, skill_B = rps_ref)$signif + sign <- .RandomWalkTest(skill_A = rps_exp, skill_B = rps_ref, sign = T, pval = F)$sign } return(list(rpss = rpss, sign = sign)) diff --git a/R/RandomWalkTest.R b/R/RandomWalkTest.R index adeadc1ec94b62920c885640938f966c91e75ddc..8d5f67f361a679dc078b8de2c692dc3f692fb0fb 100644 --- a/R/RandomWalkTest.R +++ b/R/RandomWalkTest.R @@ -1,82 +1,184 @@ -#'Random walk test for skill differences +#'Random Walk test for skill differences #' #'Forecast comparison of the skill obtained with 2 forecasts (with 
respect to a -#'common reference) based on Random Walks, with significance estimate at the 95% -#'confidence level, as in DelSole and Tippett (2016). +#'common observational reference) based on Random Walks (DelSole and Tippett, +#'2016). #' -#'@param skill_A A numerical array of the time series of the skill with the -#' forecaster A's. -#'@param skill_B A numerical array of the time series of the skill with the -#' forecaster B's. The dimensions should be identical as parameter 'skill_A'. +#'@param skill_A A numerical array of the time series of the scores obtained +#' with the forecaster A. +#'@param skill_B A numerical array of the time series of the scores obtained +#' with the forecaster B. The dimensions should be identical to those of +#' parameter 'skill_A'. #'@param time_dim A character string indicating the name of the dimension along #' which the tests are computed. The default value is 'sdate'. +#'@param test.type A character string indicating the type of significance test. +#' It can be "two.sided.approx" (to assess whether forecaster A and forecaster +#' B are significantly different in terms of skill with a two-sided test using +#' the approximation of DelSole and Tippett, 2016), "two.sided" (to assess +#' whether forecaster A and forecaster B are significantly different in terms +#' of skill with an exact two-sided test), "greater" (to assess whether +#' forecaster A shows significantly better skill than forecaster B with a +#' one-sided test for negatively oriented scores), or "less" (to assess whether +#' forecaster A shows significantly better skill than forecaster B with a +#' one-sided test for positively oriented scores). The default value is +#' "two.sided.approx". +#'@param alpha A numeric of the significance level to be used in the statistical +#' significance test (output "sign"). The default value is 0.05. +#'@param pval A logical value indicating whether to return the p-value of the +#' significance test. The default value is TRUE. +#'@param sign A logical value indicating whether to return the statistical +#' significance of the test based on 'alpha'. The default value is FALSE. #'@param ncores An integer indicating the number of cores to use for parallel #' computation. The default value is NULL. #' -#'@return A list of 2: +#'@return A list with: #'\item{$score}{ #' A numerical array with the same dimensions as the input arrays except #' 'time_dim'. The number of times that forecaster A has been better than #' forecaster B minus the number of times that forecaster B has been better -#' than forecaster A (for skill positively oriented). If $score is positive -#' forecaster A is better than forecaster B, and if $score is negative -#' forecaster B is better than forecaster B. +#' than forecaster A (for skill negatively oriented, i.e., the lower the +#' better). If $score is positive, forecaster A has been better more times +#' than forecaster B. If $score is negative, forecaster B has been better more +#' times than forecaster A. #'} -#'\item{$signif}{ -#' A logical array with the same dimensions as the input arrays except -#' 'time_dim'. Whether the difference is significant or not at the 5% -#' significance level. +#'\item{$sign}{ +#' A logical array of the statistical significance with the same dimensions +#' as the input arrays except "time_dim". Returned only if "sign" is TRUE. #'} +#'\item{$p.val}{ +#' A numeric array of the p-values with the same dimensions as the input arrays +#' except "time_dim". Returned only if "pval" is TRUE. 
+#'} +#' +#'@details +#' Null and alternative hypothesis for "two-sided" test (regardless of the +#' orientation of the scores):\cr +#' H0: forecaster A and forecaster B are not different in terms of skill\cr +#' H1: forecaster A and forecaster B are different in terms of skill +#' +#' Null and alternative hypothesis for one-sided "greater" (for negatively +#' oriented scores, i.e., the lower the better) and "less" (for positively +#' oriented scores, i.e., the higher the better) tests:\cr +#' H0: forecaster A is not better than forecaster B\cr +#' H1: forecaster A is better than forecaster B +#' +#' Examples of negatively oriented scores are the RPS, RMSE and the Error, while +#' the ROC score is a positively oriented score. +#' +#' DelSole and Tippett (2016) approximation for two-sided test at 95% confidence +#' level: significant if the difference between the number of times that +#' forecaster A has been better than forecaster B and forecaster B has been +#' better than forecaster A is above 2sqrt(N) or below -2sqrt(N). +#' +#'@references +#'DelSole and Tippett (2016): https://doi.org/10.1175/MWR-D-15-0218.1 #' #'@examples -#' fcst_A <- array(c(11:50), dim = c(sdate = 10, lat = 2, lon = 2)) -#' fcst_B <- array(c(21:60), dim = c(sdate = 10, lat = 2, lon = 2)) -#' reference <- array(1:40, dim = c(sdate = 10, lat = 2, lon = 2)) -#' skill_A <- abs(fcst_A - reference) -#' skill_B <- abs(fcst_B - reference) -#' RandomWalkTest(skill_A = skill_A, skill_B = skill_B, time_dim = 'sdate', ncores = 1) +#' fcst_A <- array(data = 11:50, dim = c(sdate = 10, lat = 2, lon = 2)) +#' fcst_B <- array(data = 21:60, dim = c(sdate = 10, lat = 2, lon = 2)) +#' reference <- array(data = 1:40, dim = c(sdate = 10, lat = 2, lon = 2)) +#' scores_A <- abs(fcst_A - reference) +#' scores_B <- abs(fcst_B - reference) +#' res1 <- RandomWalkTest(skill_A = scores_A, skill_B = scores_B, pval = FALSE, sign = TRUE) +#' res2 <- RandomWalkTest(skill_A = scores_A, skill_B = scores_B, test.type = 'greater') #' #'@import multiApply +#'@importFrom ClimProjDiags Subset #'@export -RandomWalkTest <- function(skill_A, skill_B, time_dim = 'sdate', ncores = NULL){ +RandomWalkTest <- function(skill_A, skill_B, time_dim = 'sdate', + test.type = 'two.sided.approx', alpha = 0.05, pval = TRUE, + sign = FALSE, ncores = NULL) { - ## Check inputs - if (is.null(skill_A) | is.null(skill_B)){ + # Check inputs + ## skill_A and skill_B + if (is.null(skill_A) | is.null(skill_B)) { stop("Parameters 'skill_A' and 'skill_B' cannot be NULL.") } - if(!is.numeric(skill_A) | !is.numeric(skill_B)){ + if(!is.numeric(skill_A) | !is.numeric(skill_B)) { stop("Parameters 'skill_A' and 'skill_B' must be a numerical array.") } - if (!identical(dim(skill_A),dim(skill_B))){ + if (!identical(dim(skill_A), dim(skill_B))) { stop("Parameters 'skill_A' and 'skill_B' must have the same dimensions.") } - if(!is.character(time_dim)){ + ## time_dim + if (!is.character(time_dim) | length(time_dim) != 1) { stop("Parameter 'time_dim' must be a character string.") } - if(!time_dim %in% names(dim(skill_A)) | !time_dim %in% names(dim(skill_B))){ + if (!time_dim %in% names(dim(skill_A)) | !time_dim %in% names(dim(skill_B))) { stop("Parameter 'time_dim' is not found in 'skill_A' or 'skill_B' dimensions.") } - if (!is.null(ncores)){ - if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | length(ncores) > 1){ + ## alpha + if (any(!is.numeric(alpha) | alpha <= 0 | alpha >= 1 | length(alpha) > 1)) { + stop("Parameter 'alpha' must be a number between 0 and 1.") + } + ## test.type + 
if (!test.type %in% c('two.sided.approx','two.sided','greater','less')) { + stop("Parameter 'test.type' must be 'two.sided.approx', 'two.sided', 'greater', or 'less'.") + } + if (test.type == 'two.sided.approx') { + if (alpha != 0.05) { + .warning("DelSole and Tippett (2016) approximation is valid for alpha ", + "= 0.05 only. Returning the significance at the 0.05 significance level.") + } + if (pval) { + .warning("p-value cannot be returned with the DelSole and Tippett (2016) ", + "approximation. Returning the significance at the 0.05 significance level.") + } + sign <- TRUE + } + ## ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | length(ncores) > 1) { stop("Parameter 'ncores' must be a positive integer.") } } ## Compute the Random Walk Test - res <- multiApply::Apply(data = list(skill_A, skill_B), - target_dims = time_dim, - fun = .RandomWalkTest, - ncores = ncores) + res <- Apply(data = list(skill_A = skill_A, + skill_B = skill_B), + target_dims = list(skill_A = time_dim, + skill_B = time_dim), + fun = .RandomWalkTest, + test.type = test.type, + alpha = alpha, pval = pval, sign = sign, + ncores = ncores) + + return(res) } -.RandomWalkTest <- function(skill_A, skill_B){ +.RandomWalkTest <- function(skill_A, skill_B, test.type = 'two.sided.approx', + alpha = 0.05, pval = TRUE, sign = FALSE) { + #skill_A and skill_B: [sdate] + + N.eff <- length(skill_A) + + A_better <- sum(skill_B > skill_A) + B_better <- sum(skill_B < skill_A) + + output <- NULL + output$score <- A_better - B_better + + if (test.type == 'two.sided.approx') { + output$sign <- ifelse(test = abs(output$score) > (2 * sqrt(N.eff)), yes = TRUE, no = FALSE) + + } else { + + if (!is.na(output$score)) { + p.val <- binom.test(x = A_better, n = floor(N.eff), p = 0.5, conf.level = 1 - alpha, + alternative = test.type)$p.value + + } else { + p.val <- NA + } + + if (pval) { + output$p.val <- p.val + } + if (sign) { + output$sign <- ifelse(!is.na(p.val) & p.val <= alpha, TRUE, FALSE) + } + + } - score <- cumsum(skill_A > skill_B) - cumsum(skill_A < skill_B) - - ## TRUE if significant (if last value is above or below 2*sqrt(N)) - signif<- ifelse(test = (score[length(skill_A)] < (-2)*sqrt(length(skill_A))) | (score[length(skill_A)] > 2*sqrt(length(skill_A))), yes = TRUE, no = FALSE) - - return(list("score"=score[length(skill_A)],"signif"=signif)) + return(output) } diff --git a/R/Reorder.R b/R/Reorder.R index 04312071a110a35f5feb396cd0fbe18f494ab010..71a22e34e0d2adab946b6e59f69f4fd3ed36dbe6 100644 --- a/R/Reorder.R +++ b/R/Reorder.R @@ -1,10 +1,14 @@ #'Reorder the dimension of an array #' -#'Reorder the dimension order of a multi-dimensional array +#'Reorder the dimensions of a multi-dimensional array. The order can be provided +#'either as indices or as dimension names. If the order is given by dimension +#'names, the function looks for names(dim(x)). If they don't exist, the function +#'checks whether the attribute "dimensions" exists; this attribute is present in +#'the objects generated by Load(). #' -#'@param data An array of which the dimension to be reordered. +#'@param data An array whose dimensions are to be reordered. #'@param order A vector of indices or character strings indicating the new -#' order of the dimension. +#' order of the dimensions. #' #'@return An array which has the same values as parameter 'data' but with #' different dimension order. 
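The extended Reorder() behaviour can be exercised as in the following minimal sketch in R, based on the examples added in this change; the outputs shown in the comments follow from the reordering logic and are illustrative:

library(s2dv)

# Reorder by dimension names when names(dim(x)) is available
dat1 <- array(1:30, dim = c(dat = 1, sdate = 3, ftime = 2, lon = 5))
dim(Reorder(dat1, c('sdate', 'dat', 'lon', 'ftime')))
# sdate   dat   lon ftime
#     3     1     5     2

# An unnamed array carrying a Load()-style "dimensions" attribute:
# Reorder() falls back to the attribute (with a warning) and reorders
# the attribute consistently with the data
dat2 <- array(1:10, dim = c(2, 1, 5))
attr(dat2, 'dimensions') <- c('sdate', 'time', 'region')
dat2_reorder <- Reorder(dat2, c('time', 'sdate', 'region'))
attributes(dat2_reorder)$dimensions
# [1] "time"   "sdate"  "region"

# Character arrays are supported as well
dat3 <- array(paste0('a', 1:24), dim = c(b = 2, c = 3, d = 4))
dim(Reorder(dat3, c('d', 'c', 'b')))
# d c b
# 4 3 2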
@@ -15,6 +19,11 @@ #' print(dim(Reorder(dat1, c('sdate', 'dat', 'lon', 'ftime')))) #' dat2 <- array(c(1:10), dim = c(2, 1, 5)) #' print(dim(Reorder(dat2, c(2, 1, 3)))) +#' attr(dat2, 'dimensions') <- c('sdate', 'time', 'region') +#' dat2_reorder <- Reorder(dat2, c('time', 'sdate', 'region')) +#' # A character array +#' dat3 <- array(paste0('a', 1:24), dim = c(b = 2, c = 3, d = 4)) +#' dat3_reorder <- Reorder(dat3, c('d', 'c', 'b')) #'@export Reorder <- function(data, order) { @@ -27,6 +36,9 @@ Reorder <- function(data, order) { stop("Parameter 'data' must be an array.") } + ## If attribute "dimensions" exists + attr.dim.reorder <- ifelse(!is.null(attributes(data)$dimensions), TRUE, FALSE) + ## order if (is.null(order)) { stop("Parameter 'order' cannot be NULL.") @@ -42,7 +54,23 @@ Reorder <- function(data, order) { } } if (is.character(order)) { - if (!all(order %in% names(dim(data)))) { + if (is.null(names(dim(data)))) { + if (attr.dim.reorder) { + warning("Found dimension names in attributes. Use them to reorder.") + dim_names <- attributes(data)$dimensions + } else { + stop("The array doesn't have dimension names.") + } + } else { + dim_names <- names(dim(data)) + if (attr.dim.reorder) { + if (any(attributes(data)$dimensions != dim_names)) { + warning("Found attribute 'dimensions' has different names from ", + "names(dim(x)). Use the latter one to reorder.") + } + } + } + if (!all(order %in% dim_names)) { stop("Parameter 'order' do not match the dimension names of parameter 'data'.") } } @@ -52,13 +80,12 @@ Reorder <- function(data, order) { } - ############################### # Reorder ## If order is character string, find the indices if (is.character(order)) { - order <- match(order, names(dim(data))) + order <- match(order, dim_names) } ## reorder @@ -73,10 +100,15 @@ Reorder <- function(data, order) { y <- array(1:length(data), dim = dim(data)) y <- aperm(y, order) data <- data[as.vector(y)] + dim(data) <- old_dims[order] } - dim(data) <- old_dims[order] + if (attr.dim.reorder) { + attr_bk$dimensions <- attr_bk$dimensions[order] + } + attributes(data) <- c(attributes(data), attr_bk) - data + + return(data) } diff --git a/R/ResidualCorr.R b/R/ResidualCorr.R index 8d9f0404fbfd1bd50d307da6950e7da4664859c1..6f03ecee1cbf31fa30aa010aeda20aaff7019313 100644 --- a/R/ResidualCorr.R +++ b/R/ResidualCorr.R @@ -199,7 +199,8 @@ ResidualCorr <- function(exp, obs, ref, N.eff = NA, time_dim = 'sdate', return(output) } -.ResidualCorr <- function(exp, obs, ref, N.eff, method, alpha, handle.na) { +.ResidualCorr <- function(exp, obs, ref, N.eff = NA, method = 'pearson', alpha = NULL, + handle.na = 'return.na') { # exp and ref and obs: [time] .residual.corr <- function(exp, obs, ref, method, N.eff, alpha) { diff --git a/R/SPOD.R b/R/SPOD.R index 2599477900b13d3b8d89ae7a282d7fd4158502e6..3a8ff73dc50fe0d567f94722256e5ffa5892966a 100644 --- a/R/SPOD.R +++ b/R/SPOD.R @@ -223,12 +223,12 @@ SPOD <- function(data, data_lats, data_lons, type, lat_dim = 'lat', lon_dim = 'l mean_1 <- ClimProjDiags::WeightedMean(data = data, lon = data_lons, lat = data_lats, region = regions$reg1, - londim = which(names(dim(data)) == lon_dim), - latdim = which(names(dim(data)) == lat_dim)) + londim = lon_dim, + latdim = lat_dim) mean_2 <- ClimProjDiags::WeightedMean(data = data, lon = data_lons, lat = data_lats, region = regions$reg2, - londim = which(names(dim(data)) == lon_dim), - latdim = which(names(dim(data)) == lat_dim)) + londim = lon_dim, + latdim = lat_dim) data <- ClimProjDiags::CombineIndices(indices = 
list(mean_1,mean_2), weights = NULL, operation = 'subtract') # (mean_1 - mean_2) diff --git a/R/Season.R b/R/Season.R index 086dc529471fb69a385f9c9b21eefc233c29b3c1..1425d592db2ae55481d30c4f4e6b7d2bd83f4991 100644 --- a/R/Season.R +++ b/R/Season.R @@ -1,4 +1,4 @@ -#'Compute seasonal mean +#'Compute seasonal mean or other calculations #' #'Compute the seasonal mean (or other methods) on monthly time series along #'one dimension of a named multi-dimensional arrays. Partial season is not @@ -6,13 +6,14 @@ #' #'@param data A named numeric array with at least one dimension 'time_dim'. #'@param time_dim A character string indicating the name of dimension along -#' which the seasonal means are computed. The default value is 'ftime'. +#' which the seasonal mean or other calculations are computed. The default +#' value is 'ftime'. #'@param monini An integer indicating what the first month of the time series is. #' It can be from 1 to 12. -#'@param moninf An integer indicating the starting month of the seasonal mean. +#'@param moninf An integer indicating the starting month of the seasonal +#' calculation. It can be from 1 to 12. +#'@param monsup An integer indicating the end month of the seasonal calculation. #' It can be from 1 to 12. -#'@param monsup An integer indicating the end month of the seasonal mean. It -#' can be from 1 to 12. #'@param method An R function to be applied for seasonal calculation. For #' example, 'sum' can be used for total precipitation. The default value is mean. #'@param na.rm A logical value indicating whether to remove NA values along @@ -26,12 +27,12 @@ #' #'@examples #'set.seed(1) -#'dat1 <- array(rnorm(144*3), dim = c(member = 2, sdate = 2, ftime = 12*3, lon = 3)) +#'dat1 <- array(rnorm(144 * 3), dim = c(member = 2, sdate = 2, ftime = 12 * 3, lon = 3)) #'res <- Season(data = dat1, monini = 1, moninf = 1, monsup = 2) #'res <- Season(data = dat1, monini = 10, moninf = 12, monsup = 2) #'dat2 <- dat1 #'set.seed(2) -#'na <- floor(runif(30, min = 1, max = 144*3)) +#'na <- floor(runif(30, min = 1, max = 144 * 3)) #'dat2[na] <- NA #'res <- Season(data = dat2, monini = 3, moninf = 1, monsup = 2) #'res <- Season(data = dat2, monini = 3, moninf = 1, monsup = 2, na.rm = FALSE) @@ -86,6 +87,16 @@ Season <- function(data, time_dim = 'ftime', monini, moninf, monsup, stop("Parameter 'monsup' must be a positive integer between 1 and 12.") } } + ## time_dim, monini, moninf, monsup + mon_gap <- ifelse(moninf >= monini, moninf - monini, moninf + 12 - monini) + if ((mon_gap + 1) > dim(data)[time_dim]) { + stop("Parameter 'moninf' is out of the range because 'monini' is ", monini, + " and time dimension length is ", as.numeric(dim(data)[time_dim]), ".") + } + mon_diff <- ifelse(monsup >= moninf, monsup - moninf, monsup + 12 - moninf) + if ((mon_gap + mon_diff + 1) > dim(data)[time_dim]) { + stop("The chosen month length exceeds the time dimension of 'data'.") + } ## method if (!is.function(method)) { stop("Parameter 'method' should be an existing R function, e.g., mean or sum.") diff --git a/R/TPI.R b/R/TPI.R index d3f0550613a831f9428d8adaf43863b81c12bd93..167664f6bb0655161a930f0f9b45f8117dd480ec 100644 --- a/R/TPI.R +++ b/R/TPI.R @@ -225,16 +225,16 @@ TPI <- function(data, data_lats, data_lons, type, lat_dim = 'lat', lon_dim = 'lo mean_1 <- ClimProjDiags::WeightedMean(data = data, lon = data_lons, lat = data_lats, region = regions$reg1, - londim = which(names(dim(data)) == lon_dim), - latdim = which(names(dim(data)) == lat_dim)) + londim = lon_dim, + latdim = lat_dim) mean_2 <-
ClimProjDiags::WeightedMean(data = data, lon = data_lons, lat = data_lats, region = regions$reg2, - londim = which(names(dim(data)) == lon_dim), - latdim = which(names(dim(data)) == lat_dim)) + londim = lon_dim, + latdim = lat_dim) mean_3 <- ClimProjDiags::WeightedMean(data = data, lon = data_lons, lat = data_lats, region = regions$reg3, - londim = which(names(dim(data)) == lon_dim), - latdim = which(names(dim(data)) == lat_dim)) + londim = lon_dim, + latdim = lat_dim) mean_1_3 <- ClimProjDiags::CombineIndices(indices = list(mean_1, mean_3), weights = NULL, operation = 'mean') diff --git a/R/Trend.R b/R/Trend.R index 1f714a6fac3b87e9c2796afcbfde6d3d4cb42fc7..d709101a681eec1bab3104081bb184356d2fd0ba 100644 --- a/R/Trend.R +++ b/R/Trend.R @@ -50,7 +50,9 @@ #' \code{conf = TRUE}. #'} #'\item{$p.val}{ -#' The p-value calculated by anova(). Only present if \code{pval = TRUE}. +#' A numeric array of p-value calculated by anova(). The first dimension +#' 'stats' is 1, followed by the same dimensions as parameter 'data' except +#' the 'time_dim' dimension. Only present if \code{pval = TRUE}. #'} #'\item{$detrended}{ #' A numeric array with the same dimensions as paramter 'data', containing the @@ -188,7 +190,7 @@ Trend <- function(data, time_dim = 'ftime', interval = 1, polydeg = 1, } if (pval) { - p.val <- rep(NA, polydeg + 1) + p.val <- as.array(NA) } } diff --git a/R/Utils.R b/R/Utils.R index c2c17eb2698c1378b7a4c2147c1002da9b3f9f3a..8770af99f12205401ee8d5727f3e44a4e1f987d8 100644 --- a/R/Utils.R +++ b/R/Utils.R @@ -1,7 +1,7 @@ #'@importFrom abind abind -#'@import plyr +#'@import plyr ncdf4 #'@importFrom grDevices png jpeg pdf svg bmp tiff -#'@import ncdf4 +#'@importFrom easyVerification convert2prob ## Function to tell if a regexpr() match is a complete match to a specified name .IsFullMatch <- function(x, name) { @@ -1809,3 +1809,89 @@ GradientCatsColorBar <- function(nmap, brks = NULL, cols = NULL, vertical = TRUE } +.get_probs <- function(data, indices_for_quantiles, prob_thresholds, weights = NULL, cross.val = FALSE) { + # if exp: [sdate, memb] + # if obs: [sdate, (memb)] + + # Add dim [memb = 1] to obs if it doesn't have memb_dim + if (length(dim(data)) == 1) dim(data) <- c(dim(data), 1) + + # Absolute thresholds + if (cross.val) { + quantiles <- array(NA, dim = c(bin = length(prob_thresholds), sdate = dim(data)[1])) + for (i in 1:dim(data)[1]) { + if (is.null(weights)) { + quantiles[,i] <- quantile(x = as.vector(data[indices_for_quantiles[which(indices_for_quantiles != i)], ]), + probs = prob_thresholds, type = 8, na.rm = TRUE) + } else { + # weights: [sdate, memb] + sorted_arrays <- .sorted_distributions(data[indices_for_quantiles[which(indices_for_quantiles != i)], ], + weights[indices_for_quantiles[which(indices_for_quantiles != i)], ]) + sorted_data <- sorted_arrays$data + cumulative_weights <- sorted_arrays$cumulative_weights + quantiles[,i] <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y + } + } + } else { + if (is.null(weights)) { + quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE) + } else { + # weights: [sdate, memb] + sorted_arrays <- .sorted_distributions(data[indices_for_quantiles, ], weights[indices_for_quantiles, ]) + sorted_data <- sorted_arrays$data + cumulative_weights <- sorted_arrays$cumulative_weights + quantiles <- approx(cumulative_weights, sorted_data, prob_thresholds, "linear")$y + } + quantiles <- array(rep(quantiles, dim(data)[1]),dim = c(bin = length(quantiles), 
dim(data)[1])) + } + + # quantiles: [bin-1, sdate] + # Probabilities + probs <- array(dim = c(dim(quantiles)[1] + 1, dim(data)[1])) # [bin, sdate] + for (i_time in 1:dim(data)[1]) { + if (anyNA(data[i_time, ])) { + probs[, i_time] <- rep(NA, dim(quantiles)[1] + 1) + } else { + if (is.null(weights)) { + probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], + threshold = quantiles[,i_time])) + } else { + sorted_arrays <- .sorted_distributions(data[i_time, ], weights[i_time, ]) + sorted_data <- sorted_arrays$data + cumulative_weights <- sorted_arrays$cumulative_weights + # find any quantiles that are outside the data range + integrated_probs <- array(dim = dim(quantiles)) + for (i_quant in 1:dim(quantiles)[1]) { + # for thresholds falling under the distribution + if (quantiles[i_quant, i_time] < min(sorted_data)) { + integrated_probs[i_quant, i_time] <- 0 + # for thresholds falling over the distribution + } else if (max(sorted_data) < quantiles[i_quant, i_time]) { + integrated_probs[i_quant, i_time] <- 1 + } else { + integrated_probs[i_quant, i_time] <- approx(sorted_data, cumulative_weights, quantiles[i_quant, i_time], + "linear")$y + } + } + probs[, i_time] <- append(integrated_probs[,i_time], 1) - append(0, integrated_probs[,i_time]) + if (min(probs[, i_time]) < 0 | max(probs[, i_time]) > 1) { + stop(paste0("Probability in i_time = ", i_time, " is out of [0, 1].")) + } + } + } + } + return(probs) +} + +.sorted_distributions <- function(data_vector, weights_vector) { + weights_vector <- as.vector(weights_vector) + data_vector <- as.vector(data_vector) + weights_vector <- weights_vector / sum(weights_vector) # normalize to 1 + sorter <- order(data_vector) + sorted_weights <- weights_vector[sorter] + cumulative_weights <- cumsum(sorted_weights) - 0.5 * sorted_weights + cumulative_weights <- cumulative_weights - cumulative_weights[1] # fix the 0 + cumulative_weights <- cumulative_weights / cumulative_weights[length(cumulative_weights)] # fix the 1 + return(list("data" = data_vector[sorter], "cumulative_weights" = cumulative_weights)) +} + diff --git a/R/clim.palette.R b/R/clim.palette.R index e847ad3a983e11614997c675203b723e6f47b22e..b23ff8428f201dbe4cb129e5f202ab2f1924532f 100644 --- a/R/clim.palette.R +++ b/R/clim.palette.R @@ -5,8 +5,9 @@ #' #'@param palette Which type of palette to generate: from blue through white #' to red ('bluered'), from red through white to blue ('redblue'), from -#' yellow through orange to red ('yellowred'), or from red through orange -#' to red ('redyellow'). +#' yellow through orange to red ('yellowred'), from red through orange to +#' yellow ('redyellow'), from purple through white to orange ('purpleorange'), +#' and from orange through white to purple ('orangepurple'). #'@param n Number of colors to generate. 
#' #'@examples @@ -29,9 +30,9 @@ clim.palette <- function(palette = "bluered") { attr(colorbar, 'na_color') <- 'pink' } else if (palette == "redblue") { colorbar <- colorRampPalette(c("#67001f", "#b2182b", "#d6604d", - "#f4a582", "#fddbc7", "#f7f7f7", - "#d1e5f0", "#92c5de", "#4393c3", - "#2166ac", "#053061")) + "#f4a582", "#fddbc7", "#f7f7f7", + "#d1e5f0", "#92c5de", "#4393c3", + "#2166ac", "#053061")) attr(colorbar, 'na_color') <- 'pink' } else if (palette == "yellowred") { colorbar <- colorRampPalette(c("#ffffcc", "#ffeda0", "#fed976", @@ -40,11 +41,24 @@ clim.palette <- function(palette = "bluered") { attr(colorbar, 'na_color') <- 'pink' } else if (palette == "redyellow") { colorbar <- colorRampPalette(rev(c("#ffffcc", "#ffeda0", "#fed976", - "#feb24c", "#fd8d3c", "#fc4e2a", - "#e31a1c", "#bd0026", "#800026"))) + "#feb24c", "#fd8d3c", "#fc4e2a", + "#e31a1c", "#bd0026", "#800026"))) + attr(colorbar, 'na_color') <- 'pink' + } else if (palette == "purpleorange") { + colorbar <- colorRampPalette(c("#2d004b", "#542789", "#8073ac", + "#b2abd2", "#d8daeb", "#f7f7f7", + "#fee0b6", "#fdb863", "#e08214", + "#b35806", "#7f3b08")) + attr(colorbar, 'na_color') <- 'pink' + } else if (palette == "orangepurple") { + colorbar <- colorRampPalette(rev(c("#2d004b", "#542789", "#8073ac", + "#b2abd2", "#d8daeb", "#f7f7f7", + "#fee0b6", "#fdb863", "#e08214", + "#b35806", "#7f3b08"))) attr(colorbar, 'na_color') <- 'pink' } else { - stop("Parameter 'palette' must be one of 'bluered', 'redblue', 'yellowred' or 'redyellow'.") + stop("Parameter 'palette' must be one of 'bluered', 'redblue', 'yellowred', ", + "'redyellow', 'purpleorange' or 'orangepurple'.") } colorbar } diff --git a/README.md b/README.md index 63b261da62e2c0bc343acef04242b584d88d72b9..55ade8f302d038098728953ac99db2340f34b8bc 100644 --- a/README.md +++ b/README.md @@ -46,24 +46,30 @@ Overview The s2dv scheme is composed of four modules: **Data retrieval** -> **Statistics** -> **Verification** -> **Visualisation** -- **Data retrieval** module: The first step is to gather and homogenize NetCDF data +- [**Data retrieval**](vignettes/data_retrieval.md) module: The first step is to gather and homogenize NetCDF data files from forecasts, hindcasts or observations stored in a local or remote file system. Some simple previous steps are required, however, to set up some configuration parameters so that the module can locate the source files and recognize the variables of interest. -- **Statistics** module: Once the data has been loaded into an R object, some +- [**Statistics**](vignettes/statistics.md) module: Once the data has been loaded into an R object, some statistics can be computed, such as drift-corrected anomalies, trend removal, frequency filtering and more. -- **Verification** module: Either after computing statistics or directly from +- [**Verification**](vignettes/ScoringForecast.md) module: Either after computing statistics or directly from the original data, the verification functions allow you to compute deterministic and probabilistic scores and skill scores such as root mean square error and correlation with reliability indicators such as p-values and confidence intervals. -- **Visualization** module: Plotting functions are also provided to plot the +- [**Visualization**](vignettes/visualisation.md) module: Plotting functions are also provided to plot the results obtained from any of the modules above. 
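As a concrete taste of the verification step, the following minimal R sketch exercises the probabilistic verification added or extended in this release; it is adapted from the examples in the new ROCSS() and RandomWalkTest() documentation, and the array sizes and random values are arbitrary toy data:

library(s2dv)

set.seed(1)
exp <- array(rnorm(3600), dim = c(lon = 3, lat = 2, sdate = 60, member = 10))
ref <- array(rnorm(3600), dim = c(lon = 3, lat = 2, sdate = 60, member = 10))
obs <- array(rnorm(360), dim = c(lon = 3, lat = 2, sdate = 60))

# ROC skill score per tercile category; with ref = NULL a random
# forecast is used as the reference forecast
rocss_random <- ROCSS(exp = exp, obs = obs)
rocss_vs_ref <- ROCSS(exp = exp, obs = obs, ref = ref)

# Compare two sets of negatively oriented scores (the lower the better)
# with the Random Walk test; with these arguments it returns $score and $p.val
scores_A <- array(abs(rnorm(240)), dim = c(sdate = 60, lat = 2, lon = 2))
scores_B <- array(abs(rnorm(240)), dim = c(sdate = 60, lat = 2, lon = 2))
rw <- RandomWalkTest(skill_A = scores_A, skill_B = scores_B,
                     test.type = 'greater')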
+If it's your first time using s2dv, you can check an +[**Example**](vignettes/example.md) +of use spanning its four modules or review the +[**Tutorial**](vignettes/tutorial.md). +You will find more detailed examples in the documentation page of each module. + One important feature of s2dv is the named dimension of the data array. All the data input of the functions should have names for all the dimensions. It should not be a problem since the data retrieved by s2dv::Load or startR::Start have @@ -93,7 +99,7 @@ If you have a function to share in this package of do you want to improve a func Please, in order to achieve our goal, follow these steps: -1. Open an issue to contact the maintainers (@aho and @nperez) and agree in the suitability of your development in the package. +1. Open an issue to contact the maintainers (@aho) and agree on the suitability of your development in the package. 2. Create a branch with an identificative name: starting by 'develop-' and followed by a word releated to the development (e.g.: 'develop-Persistance' for function Persistance) 3. Add you function to folder R and create the documentation in roxyegen2 format or modify a function in the package following the discussion on the previous issue. 4. Do as many modifications and pushes as needed to achieve the desired result. diff --git a/man/Corr.Rd b/man/Corr.Rd index 4cbd0986bf3b6dbbca2fc67715ac2f56c18c96a9..bbb1e34d9edfbc82790b675e94c52c4a763d9a46 100644 --- a/man/Corr.Rd +++ b/man/Corr.Rd @@ -16,7 +16,9 @@ Corr( memb = TRUE, pval = TRUE, conf = TRUE, - conf.lev = 0.95, + sign = FALSE, + alpha = 0.05, + conf.lev = NULL, ncores = NULL ) } @@ -52,14 +54,20 @@ member dimension, set NULL. The default value is NULL.} (TRUE) or do ensemble mean over 'memb_dim' (FALSE). Only functional when 'memb_dim' is not NULL. The default value is TRUE.} -\item{pval}{A logical value indicating whether to compute or not the p-value +\item{pval}{A logical value indicating whether to return or not the p-value of the test Ho: Corr = 0. The default value is TRUE.} -\item{conf}{A logical value indicating whether to retrieve the confidence -intervals or not. The default value is TRUE.} +\item{conf}{A logical value indicating whether to return or not the confidence +intervals. The default value is TRUE.} -\item{conf.lev}{A numeric indicating the confidence level for the -regression computation. The default value is 0.95.} +\item{sign}{A logical value indicating whether to retrieve the statistical +significance of the test Ho: Corr = 0 based on 'alpha'. The default value is +FALSE.} + +\item{alpha}{A numeric indicating the significance level for the statistical +significance test. The default value is 0.05.} + +\item{conf.lev}{Deprecated. Use 'alpha' instead; alpha = 1 - conf.lev.} \item{ncores}{An integer indicating the number of cores to use for parallel computation. The default value is NULL.} @@ -85,6 +93,9 @@ nobs are omitted. exp_memb is the number of member in experiment (i.e., \item{$conf.upper}{ The upper confidence interval. Only present if \code{conf = TRUE}. } +\item{$sign}{ + The statistical significance. Only present if \code{sign = TRUE}. +} } \description{ Calculate the correlation coefficient (Pearson, Kendall or Spearman) for diff --git a/man/NAO.Rd b/man/NAO.Rd index 64b16562fb45f5ad2a84c6efd5dca8ba8e171876..999fd75f6f45177e353006e6bd061e094f5678c2 100644 --- a/man/NAO.Rd +++ b/man/NAO.Rd @@ -50,8 +50,8 @@ of 'ano'. The default value is c('lat', 'lon').} dimension of 'exp' and 'obs'. 
The default value is 'ftime'.} \item{ftime_avg}{A numeric vector of the forecast time steps to average -across the target period. The default value is 2:4, i.e., from 2nd to 4th -forecast time steps.} +across the target period. If no average is needed, set it to NULL. The default +value is 2:4, i.e., from 2nd to 4th forecast time steps.} \item{obsproj}{A logical value indicating whether to compute the NAO index by projecting the forecast anomalies onto the leading EOF of observational @@ -67,11 +67,13 @@ computation. The default value is NULL.} A list which contains: \item{exp}{ A numeric array of forecast NAO index in verification format with the same - dimensions as 'exp' except space_dim and ftime_dim. + dimensions as 'exp' except space_dim and ftime_dim. If ftime_avg is NULL, + ftime_dim remains. } \item{obs}{ A numeric array of observed NAO index in verification format with the same - dimensions as 'obs' except space_dim and ftime_dim. + dimensions as 'obs' except space_dim and ftime_dim. If ftime_avg is NULL, + ftime_dim remains. } } \description{ diff --git a/man/RMSSS.Rd b/man/RMSSS.Rd index 9ebcf65475512398233ccf6da90fe09289bf0388..bcf221c351b37ffd12105716e9d1edba51b92130 100644 --- a/man/RMSSS.Rd +++ b/man/RMSSS.Rd @@ -7,9 +7,14 @@ RMSSS( exp, obs, + ref = NULL, time_dim = "sdate", dat_dim = "dataset", + memb_dim = NULL, pval = TRUE, + sign = FALSE, + alpha = 0.05, + sig_method = "one-sided Fisher", ncores = NULL ) } @@ -26,15 +31,38 @@ dimension can be different. It can also be a vector with the same length as 'exp', then the vector will automatically be 'time_dim' and 'dat_dim' will be 1.} +\item{ref}{A named numerical array of the reference forecast data with at +least time dimension, or 0 (typical climatological forecast) or 1 +(normalized climatological forecast). If it is an array, the dimensions must +be the same as 'exp' except 'memb_dim' and 'dat_dim'. If there is only one +reference dataset, it should not have dataset dimension. If there is a +corresponding reference for each experiment, the dataset dimension must +have the same length as in 'exp'. If 'ref' is NULL, the typical +climatological forecast is used as reference forecast (equivalent to 0). +The default value is NULL.} + \item{time_dim}{A character string indicating the name of dimension along which the RMSSS are computed. The default value is 'sdate'.} \item{dat_dim}{A character string indicating the name of dataset (nobs/nexp) dimension. The default value is 'dataset'.} +\item{memb_dim}{A character string indicating the name of the member dimension +to compute the ensemble mean; it should be set to NULL if the parameters 'exp' +and 'ref' are already the ensemble mean. The default value is NULL.} + \item{pval}{A logical value indicating whether to compute or not the p-value -of the test Ho: RMSSS = 0. If pval = TRUE, the insignificant RMSSS will -return NA. The default value is TRUE.} +of the test Ho: RMSSS = 0. The default value is TRUE.} + +\item{sign}{A logical value indicating whether to compute or not the +statistical significance of the test Ho: RMSSS = 0. The default value is +FALSE.} + +\item{alpha}{A numeric of the significance level to be used in the +statistical significance test. The default value is 0.05.} + +\item{sig_method}{A character string indicating the significance method. The +options are "one-sided Fisher" (default) and "Random Walk".} \item{ncores}{An integer indicating the number of cores to use for parallel computation. 
The default value is NULL.} } \value{ A list containing the numeric arrays with dimension:\cr c(nexp, nobs, all other dimensions of exp except time_dim).\cr nexp is the number of experiment (i.e., dat_dim in exp), and nobs is the -number of observation (i.e., dat_dim in obs).\cr +number of observation (i.e., dat_dim in obs). If dat_dim is NULL, nexp and +nobs are omitted.\cr \item{$rmsss}{ - The root mean square error skill score. + A numerical array of the root mean square error skill score. } \item{$p.val}{ - The p-value. Only present if \code{pval = TRUE}. + A numerical array of the p-value with the same dimensions as $rmsss. + Only present if \code{pval = TRUE}. +} +\item{$sign}{ + A logical array of the statistical significance of the RMSSS with the same + dimensions as $rmsss. Only present if \code{sign = TRUE}. } } \description{ @@ -58,17 +92,18 @@ have the same dimensions except along dat_dim, where the length can be different, with the number of experiments/models (nexp) and the number of observational datasets (nobs).\cr RMSSS computes the root mean square error skill score of each jexp in 1:nexp -against each jobs in 1:nobs which gives nexp * nobs RMSSS for each other -grid point of the array.\cr -The RMSSS are computed along the time_dim dimension which should corresponds -to the startdate dimension.\cr -The p-value is optionally provided by an one-sided Fisher test.\cr +against each jobs in 1:nobs which gives nexp * nobs RMSSS for each grid point +of the array.\cr +The RMSSS are computed along the time_dim dimension which should correspond +to the start date dimension.\cr +The p-value and significance test are optionally provided by a one-sided +Fisher test or a Random Walk test.\cr } \examples{ set.seed(1) exp <- array(rnorm(30), dim = c(dataset = 2, time = 3, memb = 5)) set.seed(2) obs <- array(rnorm(15), dim = c(time = 3, memb = 5, dataset = 1)) -res <- RMSSS(exp, obs, time_dim = 'time') +res <- RMSSS(exp, obs, time_dim = 'time', dat_dim = 'dataset') } diff --git a/man/ROCSS.Rd b/man/ROCSS.Rd new file mode 100644 index 0000000000000000000000000000000000000000..1f4951781913d190e70b74642558dcbee4ab7073 --- /dev/null +++ b/man/ROCSS.Rd @@ -0,0 +1,89 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ROCSS.R +\name{ROCSS} +\alias{ROCSS} +\title{Compute the Relative Operating Characteristic Skill Score} +\usage{ +ROCSS( + exp, + obs, + ref = NULL, + time_dim = "sdate", + memb_dim = "member", + dat_dim = NULL, + prob_thresholds = c(1/3, 2/3), + indices_for_clim = NULL, + cross.val = FALSE, + ncores = NULL +) +} +\arguments{ +\item{exp}{A named numerical array of the forecast with at least time and +member dimension.} + +\item{obs}{A named numerical array of the observation with at least time +dimension. The dimensions must be the same as 'exp' except 'memb_dim' and +'dat_dim'.} + +\item{ref}{A named numerical array of the reference forecast data with at +least time and member dimension. The dimensions must be the same as 'exp' +except 'memb_dim' and 'dat_dim'. If there is only one reference dataset, +it should not have dataset dimension. If there is a corresponding reference +for each experiment, the dataset dimension must have the same length as in +'exp'. If 'ref' is NULL, the random forecast is used as reference forecast. +The default value is NULL.} + +\item{time_dim}{A character string indicating the name of the time dimension. 
+The default value is 'sdate'.} + +\item{memb_dim}{A character string indicating the name of the member dimension +to compute the probabilities of the forecast and the reference forecast. The +default value is 'member'.} + +\item{dat_dim}{A character string indicating the name of dataset dimension. +The length of this dimension can be different between 'exp' and 'obs'. +The default value is NULL.} + +\item{prob_thresholds}{A numeric vector of the relative thresholds (from 0 to +1) between the categories. The default value is c(1/3, 2/3), which +corresponds to tercile equiprobable categories.} + +\item{indices_for_clim}{A vector of the indices to be taken along 'time_dim' +for computing the thresholds between the probabilistic categories. If NULL, +the whole period is used. The default value is NULL.} + +\item{cross.val}{A logical indicating whether to compute the thresholds +between probabilistic categories in cross-validation. The default value is +FALSE.} + +\item{ncores}{An integer indicating the number of cores to use for parallel +computation. The default value is NULL.} +} +\value{ +A numerical array of ROCSS with the same dimensions as 'exp' excluding +'time_dim' and 'memb_dim' dimensions and including the 'cat' dimension, one +element per category. The length of the 'cat' dimension corresponds to the +number of probabilistic categories, i.e., 1 + length(prob_thresholds). If there are +multiple datasets, two additional dimensions 'nexp' and 'nobs' are added. +} +\description{ +The Relative Operating Characteristic Skill Score (ROCSS; Kharin and Zwiers, +2003) is based on the ROC curve, which gives information about the hit rates +against the false-alarm rates for a particular category or event. The ROC +curve can be summarized with the area under the ROC curve, known as the ROC +score, to provide a skill value for each category. The ROCSS ranges between +minus infinity and 1. A positive ROCSS value indicates that the forecast has +higher skill than the reference forecast, while a negative value indicates +the contrary. +} +\examples{ +exp <- array(rnorm(1000), dim = c(lon = 3, lat = 2, sdate = 60, member = 10)) +ref <- array(rnorm(1000), dim = c(lon = 3, lat = 2, sdate = 60, member = 10)) +obs <- array(rnorm(1000), dim = c(lon = 3, lat = 2, sdate = 60)) +ROCSS(exp = exp, obs = obs) ## random forecast as reference forecast +ROCSS(exp = exp, obs = obs, ref = ref) ## ref as reference forecast + +} +\references{ +Kharin, V. V. and Zwiers, F. W. (2003): + https://doi.org/10.1175/1520-0442(2003)016%3C4145:OTRSOP%3E2.0.CO;2 +} diff --git a/man/RPS.Rd b/man/RPS.Rd index 4d8236bba260d14f016ebb82304c536d545d01dd..813c12f6e58e5cc6b33e9ffc95ebcd9555b4050d 100644 --- a/man/RPS.Rd +++ b/man/RPS.Rd @@ -14,6 +14,7 @@ RPS( indices_for_clim = NULL, Fair = FALSE, weights = NULL, + cross.val = FALSE, ncores = NULL ) } @@ -54,6 +55,10 @@ ensemble should have at least 70 members or span at least 10 time steps and have more than 45 members if consistency between the weighted and unweighted methodologies is desired.} +\item{cross.val}{A logical indicating whether to compute the thresholds between +probabilistic categories in cross-validation. +The default value is FALSE.} + \item{ncores}{An integer indicating the number of cores to use for parallel computation. 
The default value is NULL.} } diff --git a/man/RPSS.Rd b/man/RPSS.Rd index a68f21ca4b864411bc19f6a71aa612c910493d66..d70425e62f7df67e3a3c5ef75b0ff9163fb98389 100644 --- a/man/RPSS.Rd +++ b/man/RPSS.Rd @@ -17,6 +17,7 @@ RPSS( weights = NULL, weights_exp = NULL, weights_ref = NULL, + cross.val = FALSE, ncores = NULL ) } @@ -72,6 +73,10 @@ time steps and have more than 45 members if consistency between the weighted \item{weights_ref}{Same as 'weights_exp' but for the reference forecast.} +\item{cross.val}{A logical indicating whether to compute the thresholds between +probabilistic categories in cross-validation. +The default value is FALSE.} + \item{ncores}{An integer indicating the number of cores to use for parallel computation. The default value is NULL.} } diff --git a/man/RandomWalkTest.Rd b/man/RandomWalkTest.Rd index bd1460a74fc8c9cbe69f810ec6ef6c23497dcbd6..e123669ca902eef4c1ee581f7c9dbe3226d2d4d3 100644 --- a/man/RandomWalkTest.Rd +++ b/man/RandomWalkTest.Rd @@ -2,50 +2,109 @@ % Please edit documentation in R/RandomWalkTest.R \name{RandomWalkTest} \alias{RandomWalkTest} -\title{Random walk test for skill differences} +\title{Random Walk test for skill differences} \usage{ -RandomWalkTest(skill_A, skill_B, time_dim = "sdate", ncores = NULL) +RandomWalkTest( + skill_A, + skill_B, + time_dim = "sdate", + test.type = "two.sided.approx", + alpha = 0.05, + pval = TRUE, + sign = FALSE, + ncores = NULL +) } \arguments{ -\item{skill_A}{A numerical array of the time series of the skill with the -forecaster A's.} +\item{skill_A}{A numerical array of the time series of the scores obtained +with the forecaster A.} -\item{skill_B}{A numerical array of the time series of the skill with the -forecaster B's. The dimensions should be identical as parameter 'skill_A'.} +\item{skill_B}{A numerical array of the time series of the scores obtained +with the forecaster B. The dimensions should be identical to those of +parameter 'skill_A'.} \item{time_dim}{A character string indicating the name of the dimension along which the tests are computed. The default value is 'sdate'.} +\item{test.type}{A character string indicating the type of significance test. +It can be "two.sided.approx" (to assess whether forecaster A and forecaster +B are significantly different in terms of skill with a two-sided test using +the approximation of DelSole and Tippett, 2016), "two.sided" (to assess +whether forecaster A and forecaster B are significantly different in terms +of skill with an exact two-sided test), "greater" (to assess whether +forecaster A shows significantly better skill than forecaster B with a +one-sided test for negatively oriented scores), or "less" (to assess whether +forecaster A shows significantly better skill than forecaster B with a +one-sided test for positively oriented scores). The default value is +"two.sided.approx".} + +\item{alpha}{A numeric of the significance level to be used in the statistical +significance test (output "sign"). The default value is 0.05.} + +\item{pval}{A logical value indicating whether to return the p-value of the +significance test. The default value is TRUE.} + +\item{sign}{A logical value indicating whether to return the statistical +significance of the test based on 'alpha'. The default value is FALSE.} + \item{ncores}{An integer indicating the number of cores to use for parallel computation. The default value is NULL.} } \value{ -A list of 2: +A list with: \item{$score}{ A numerical array with the same dimensions as the input arrays except 'time_dim'. 
The number of times that forecaster A has been better than forecaster B minus the number of times that forecaster B has been better - than forecaster A (for skill positively oriented). If $score is positive - forecaster A is better than forecaster B, and if $score is negative - forecaster B is better than forecaster B. + than forecaster A (for skill negatively oriented, i.e., the lower the + better). If $score is positive, forecaster A has been better more times + than forecaster B. If $score is negative, forecaster B has been better more + times than forecaster A. +} +\item{$sign}{ + A logical array of the statistical significance with the same dimensions + as the input arrays except "time_dim". Returned only if "sign" is TRUE. } -\item{$signif}{ - A logical array with the same dimensions as the input arrays except - 'time_dim'. Whether the difference is significant or not at the 5% - significance level. +\item{$p.val}{ + A numeric array of the p-values with the same dimensions as the input arrays + except "time_dim". Returned only if "pval" is TRUE. } } \description{ Forecast comparison of the skill obtained with 2 forecasts (with respect to a -common reference) based on Random Walks, with significance estimate at the 95% -confidence level, as in DelSole and Tippett (2016). +common observational reference) based on Random Walks (DelSole and Tippett, +2016). +} +\details{ +Null and alternative hypothesis for "two-sided" test (regardless of the +orientation of the scores):\cr +H0: forecaster A and forecaster B are not different in terms of skill\cr +H1: forecaster A and forecaster B are different in terms of skill + +Null and alternative hypothesis for one-sided "greater" (for negatively +oriented scores, i.e., the lower the better) and "less" (for positively +oriented scores, i.e., the higher the better) tests:\cr +H0: forecaster A is not better than forecaster B\cr +H1: forecaster A is better than forecaster B + +Examples of negatively oriented scores are the RPS, RMSE and the Error, while +the ROC score is a positively oriented score. + +DelSole and Tippett (2016) approximation for two-sided test at 95% confidence +level: significant if the difference between the number of times that +forecaster A has been better than forecaster B and forecaster B has been +better than forecaster A is above 2sqrt(N) or below -2sqrt(N). 
} \examples{ -fcst_A <- array(c(11:50), dim = c(sdate = 10, lat = 2, lon = 2)) -fcst_B <- array(c(21:60), dim = c(sdate = 10, lat = 2, lon = 2)) -reference <- array(1:40, dim = c(sdate = 10, lat = 2, lon = 2)) -skill_A <- abs(fcst_A - reference) -skill_B <- abs(fcst_B - reference) -RandomWalkTest(skill_A = skill_A, skill_B = skill_B, time_dim = 'sdate', ncores = 1) +fcst_A <- array(data = 11:50, dim = c(sdate = 10, lat = 2, lon = 2)) +fcst_B <- array(data = 21:60, dim = c(sdate = 10, lat = 2, lon = 2)) +reference <- array(data = 1:40, dim = c(sdate = 10, lat = 2, lon = 2)) +scores_A <- abs(fcst_A - reference) +scores_B <- abs(fcst_B - reference) +res1 <- RandomWalkTest(skill_A = scores_A, skill_B = scores_B, pval = FALSE, sign = TRUE) +res2 <- RandomWalkTest(skill_A = scores_A, skill_B = scores_B, test.type = 'greater') } +\references{ +DelSole and Tippett (2016): https://doi.org/10.1175/MWR-D-15-0218.1 +} diff --git a/man/Reorder.Rd b/man/Reorder.Rd index 8748aaf2662d81e53211e3001954cfc896428f11..c4e4a2312fc342e6e54ebd36c92c8d19eec3f7d2 100644 --- a/man/Reorder.Rd +++ b/man/Reorder.Rd @@ -7,17 +7,21 @@ Reorder(data, order) } \arguments{ -\item{data}{An array of which the dimension to be reordered.} +\item{data}{An array whose dimensions are to be reordered.} \item{order}{A vector of indices or character strings indicating the new -order of the dimension.} +order of the dimensions.} } \value{ An array which has the same values as parameter 'data' but with different dimension order. } \description{ -Reorder the dimension order of a multi-dimensional array +Reorder the dimensions of a multi-dimensional array. The order can be provided +either as indices or as dimension names. If the order is given by dimension +names, the function looks for names(dim(x)). If they don't exist, the function +checks whether the attribute "dimensions" exists; this attribute is present in +the objects generated by Load(). } \examples{ dat1 <- array(c(1:30), dim = c(dat = 1, sdate = 3, ftime = 2, lon = 5)) @@ -25,4 +29,9 @@ Reorder the dimension order of a multi-dimensional array print(dim(Reorder(dat1, c('sdate', 'dat', 'lon', 'ftime')))) dat2 <- array(c(1:10), dim = c(2, 1, 5)) print(dim(Reorder(dat2, c(2, 1, 3)))) + attr(dat2, 'dimensions') <- c('sdate', 'time', 'region') + dat2_reorder <- Reorder(dat2, c('time', 'sdate', 'region')) + # A character array + dat3 <- array(paste0('a', 1:24), dim = c(b = 2, c = 3, d = 4)) + dat3_reorder <- Reorder(dat3, c('d', 'c', 'b')) } diff --git a/man/Season.Rd b/man/Season.Rd index 3c1e3ffcda3ec669195e8762f1710a9ecc7855b7..fccd9ffdfd24479c261488f6eeeed2eb863d8eb1 100644 --- a/man/Season.Rd +++ b/man/Season.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/Season.R \name{Season} \alias{Season} -\title{Compute seasonal mean} +\title{Compute seasonal mean or other calculations} \usage{ Season( data, @@ -19,16 +19,17 @@ Season( \item{data}{A named numeric array with at least one dimension 'time_dim'.} \item{time_dim}{A character string indicating the name of dimension along -which the seasonal means are computed. The default value is 'ftime'.} +which the seasonal mean or other calculations are computed. The default +value is 'ftime'.} \item{monini}{An integer indicating what the first month of the time series is. It can be from 1 to 12.} -\item{moninf}{An integer indicating the starting month of the seasonal mean. -It can be from 1 to 12.} +\item{moninf}{An integer indicating the starting month of the seasonal +calculation. 
It can be from 1 to 12.}
-\item{monsup}{An integer indicating the end month of the seasonal mean. It
-can be from 1 to 12.}
+\item{monsup}{An integer indicating the end month of the seasonal calculation.
+It can be from 1 to 12.}
\item{method}{An R function to be applied for seasonal calculation. For
example, 'sum' can be used for total precipitation. The default value is mean.}
@@ -51,12 +52,12 @@ accounted.
}
\examples{
set.seed(1)
-dat1 <- array(rnorm(144*3), dim = c(member = 2, sdate = 2, ftime = 12*3, lon = 3))
+dat1 <- array(rnorm(144 * 3), dim = c(member = 2, sdate = 2, ftime = 12*3, lon = 3))
res <- Season(data = dat1, monini = 1, moninf = 1, monsup = 2)
res <- Season(data = dat1, monini = 10, moninf = 12, monsup = 2)
dat2 <- dat1
set.seed(2)
-na <- floor(runif(30, min = 1, max = 144*3))
+na <- floor(runif(30, min = 1, max = 144 * 3))
dat2[na] <- NA
res <- Season(data = dat2, monini = 3, moninf = 1, monsup = 2)
res <- Season(data = dat2, monini = 3, moninf = 1, monsup = 2, na.rm = FALSE)
diff --git a/man/Trend.Rd b/man/Trend.Rd
index d283ee652d6795f127c6853e5dfa52e9715ce2e9..7623c3613149b891a5bf83c26bb32fb56716c541 100644
--- a/man/Trend.Rd
+++ b/man/Trend.Rd
@@ -66,7 +66,9 @@ A list containing:
\code{conf = TRUE}.
}
\item{$p.val}{
-  The p-value calculated by anova(). Only present if \code{pval = TRUE}.
+  A numeric array of p-values calculated by anova(). The first dimension
+  'stats' is 1, followed by the same dimensions as parameter 'data' except
+  the 'time_dim' dimension. Only present if \code{pval = TRUE}.
}
\item{$detrended}{
A numeric array with the same dimensions as paramter 'data', containing the
diff --git a/man/clim.palette.Rd b/man/clim.palette.Rd
index 5d17947af606686801b830d29048be5a5df41790..94c90553881cac83d3ddb5a89d99bf2b24a8cc1b 100644
--- a/man/clim.palette.Rd
+++ b/man/clim.palette.Rd
@@ -12,8 +12,9 @@ clim.colors(n, palette = "bluered")
\arguments{
\item{palette}{Which type of palette to generate: from blue through white
to red ('bluered'), from red through white to blue ('redblue'), from
-yellow through orange to red ('yellowred'), or from red through orange
-to red ('redyellow').}
+yellow through orange to red ('yellowred'), from red through orange to
+yellow ('redyellow'), from purple through white to orange ('purpleorange'),
+and from orange through white to purple ('orangepurple').}
\item{n}{Number of colors to generate.}
}
diff --git a/tests/testthat/test-CDORemap.R b/tests/testthat/test-CDORemap.R
index 16da1b090a39a2dfa5659cfabb5c8c2af57aa62e..1ace086d7fd7117c7d1ca978f309614ae84c3e22 100644
--- a/tests/testthat/test-CDORemap.R
+++ b/tests/testthat/test-CDORemap.R
@@ -39,6 +39,10 @@
data4_1 <- drop(data4)
data4_2 <- ClimProjDiags::Subset(data4, c(1,2,3,4,8), list(1,1,1,1,1), drop = 'selected')
data4_3 <- .aperm2(data4_2, c(1, 3, 2))
+  # data5: regular grid, more dimensions
+  data5 <- array(1:(4*10*8*3*2), dim = c(dat = 1, var = 1, memb = 4, lon = 10, lat = 8, sdate = 3, sweek = 2))
+  data5_1 <- aperm(data5, c(1,2,3,6,4,5,7))
+
##############################################

test_that("1. Input checks", {
@@ -269,3 +273,43 @@ c(-88.2, -84.6, -81.0, -77.4, -73.8)
)
})
+
+
+############################################################
+
+test_that("6. 
dat5: regular regrid, more dimensions", { +suppressWarnings( +res5 <- CDORemap(data5, lons = seq(1, 5.5, by = 0.5), lats = seq(10, 13.5, by = 0.5), + grid = 'r360x181', method = 'bil', crop = T) +) +expect_equal( +as.vector(res5$lons), +1:5 +) +expect_equal( +as.vector(res5$lats), +10:13 +) +expect_equal( +as.vector(res5$data_array[1, 1, 1, , , 2, 1]), +c(seq(321, 353, by = 8), seq(401, 433, by = 8), seq(481, 513, by = 8), seq(561, 593, by = 8)) +) + +suppressWarnings( +res5_1 <- CDORemap(data5_1, lons = seq(1, 5.5, by = 0.5), lats = seq(10, 13.5, by = 0.5), + grid = 'r360x181', method = 'bil', crop = T) +) +expect_equal( +aperm(res5_1$data_array, c(1, 2, 3, 5, 6, 4, 7)), +res5$data_array +) +expect_equal( +as.vector(res5_1$lons), +1:5 +) +expect_equal( +as.vector(res5_1$lats), +10:13 +) +}) + diff --git a/tests/testthat/test-CRPSS.R b/tests/testthat/test-CRPSS.R index 6937edb2332888a8368663a1660c89db944e6749..db0eecd632c1494a332987d6bc0f31f079ae7ed3 100644 --- a/tests/testthat/test-CRPSS.R +++ b/tests/testthat/test-CRPSS.R @@ -141,7 +141,7 @@ test_that("2. Output checks: dat1", { # ref = NULL expect_equal( as.vector(CRPSS(exp1, obs1)$crpss), - c(-0.1582765, -0.2390707), + c(0.061796021, -0.003647287), tolerance = 0.0001 ) expect_equal( @@ -150,12 +150,12 @@ test_that("2. Output checks: dat1", { ) expect_equal( as.vector(CRPSS(exp1, obs1, Fair = T)$crpss), - c(0.07650872, -0.09326681), + c(0.2612070, 0.1253865), tolerance = 0.0001 ) expect_equal( as.vector(CRPSS(exp1, obs1)$crpss), - c(-0.1582765, -0.2390707), + c(0.061796021, -0.003647287), tolerance = 0.0001 ) # ref = ref @@ -175,7 +175,7 @@ test_that("2. Output checks: dat1", { ) expect_equal( as.vector(CRPSS(exp1, obs1, ref1)$crpss), - c( 0.3491793, 0.3379610), + c(0.3491793, 0.3379610), tolerance = 0.0001 ) @@ -211,7 +211,7 @@ test_that("3. Output checks: dat2", { # ref = NULL expect_equal( as.vector(CRPSS(exp2, obs2)$crpss), - c(-0.8209236), + -0.4749481, tolerance = 0.0001 ) expect_equal( @@ -222,11 +222,11 @@ test_that("3. Output checks: dat2", { expect_equal( as.vector(CRPSS(exp2, obs2)$sign), - TRUE, + FALSE, ) expect_equal( as.vector(CRPSS(exp2, obs2, Fair = T)$crpss), - c(-0.468189), + c(-0.1745512), tolerance = 0.0001 ) expect_equal( @@ -265,12 +265,12 @@ test_that("4. Output checks: dat3", { ) expect_equal( mean(CRPSS(exp3, obs3, dat_dim = 'dataset')$crpss), - c(-0.7390546), + c(-0.4086342), tolerance = 0.0001 ) expect_equal( as.vector(CRPSS(exp3, obs3, dat_dim = 'dataset')$crpss), - c(-0.8209236, -0.6270744, -1.0829403, -0.6574485, -0.5970569, -0.6488837), + c(-0.4749481, -0.3179303, -0.6871816, -0.3425333, -0.2936161, -0.3355958), tolerance = 0.0001 ) expect_equal( @@ -279,12 +279,12 @@ test_that("4. Output checks: dat3", { ) expect_equal( mean(CRPSS(exp3, obs3, dat_dim = 'dataset', Fair = T)$crpss), - c(-0.5302703), + -0.2242162, tolerance = 0.0001 ) expect_equal( as.vector(CRPSS(exp3, obs3, dat_dim = 'dataset', Fair = T)$crpss), - c(-0.4681890, -0.3580685, -1.0116628, -0.3730576, -0.3965618, -0.5740819), + c(-0.17455119, -0.08645477, -0.60933021, -0.09844606, -0.11724947, -0.25926556), tolerance = 0.0001 ) # ref = ref3 @@ -310,7 +310,7 @@ test_that("5. 
Output checks: dat4", { ) expect_equal( as.vector(CRPSS(exp4, obs4, dat_dim = 'dataset', Fair = T)$crpss)[1:6], - c(-0.4681890, -0.3580685, -1.0116628, -0.3730576, -0.3965618, -0.5740819), + c(-0.17455119, -0.08645477, -0.60933021, -0.09844606, -0.11724947, -0.25926556), tolerance = 0.0001 ) # ref = ref3 diff --git a/tests/testthat/test-Corr.R b/tests/testthat/test-Corr.R index 6e3c0162bb3c50cce3e28d0454c2da1672411ba2..ef013cab372b62984c359d70dc569cf9321e83b9 100644 --- a/tests/testthat/test-Corr.R +++ b/tests/testthat/test-Corr.R @@ -22,6 +22,11 @@ context("s2dv::Corr tests") obs2 <- array(rnorm(30), dim = c(member = 1, dataset = 1, sdate = 5, lat = 2, lon = 3)) + exp2_2 <- exp2 + exp2_2[1, 1, 1, 1, 1] <- NA + obs2_2 <- obs2 + obs2_2[1, 1, 1:3, 1, 1] <- NA + # dat3: memb_dim = member, obs has multiple memb set.seed(1) exp3 <- array(rnorm(180), dim = c(member = 3, dataset = 2, sdate = 5, @@ -121,8 +126,8 @@ test_that("1. Input checks", { "integers smaller than the length of paramter 'comp_dim'.") ) expect_error( - Corr(exp1, obs1, conf.lev = -1), - "Parameter 'conf.lev' must be a numeric number between 0 and 1." + Corr(exp1, obs1, alpha = -1), + "Parameter 'alpha' must be a numeric number between 0 and 1." ) expect_error( Corr(exp1, obs1, method = 1), @@ -207,7 +212,7 @@ suppressWarnings( ) suppressWarnings( expect_equal( - min(Corr(exp1, obs1, conf.lev = 0.99)$conf.upper, na.rm = TRUE), + min(Corr(exp1, obs1, alpha = 0.01)$conf.upper, na.rm = TRUE), 0.2747904, tolerance = 0.0001 ) @@ -253,6 +258,10 @@ test_that("3. Output checks: dat2", { dim(Corr(exp2, obs2, memb_dim = 'member')$corr), c(nexp = 2, nobs = 1, exp_memb = 3, obs_memb = 1, lat = 2, lon = 3) ) + expect_equal( + dim(Corr(exp2, obs2, memb_dim = 'member')$corr), + dim(Corr(exp2, obs2, memb_dim = 'member')$p) + ) expect_equal( names(Corr(exp2, obs2, memb_dim = 'member')), c("corr", "p.val", "conf.lower", "conf.upper") @@ -269,6 +278,11 @@ test_that("3. Output checks: dat2", { names(Corr(exp2, obs2, memb_dim = 'member', pval = FALSE)), c("corr", "conf.lower", "conf.upper") ) + expect_equal( + names(Corr(exp2, obs2, memb_dim = 'member', pval = FALSE, sign = T)), + c("corr", "conf.lower", "conf.upper", "sign") + ) + expect_equal( mean(Corr(exp2, obs2, memb_dim = 'member', pval = FALSE, conf = FALSE)$corr), 0.01645575, @@ -299,6 +313,10 @@ test_that("3. Output checks: dat2", { c(-0.9500121, -0.9547642, -0.9883400, -0.8817478, -0.6879465), tolerance = 0.0001 ) + expect_equal( + which(Corr(exp2, obs2, memb_dim = 'member', pval = FALSE, conf = F, sign = T)$sign), + c(3, 6, 12, 17, 23, 34) + ) # ensemble mean expect_equal( @@ -336,7 +354,18 @@ test_that("3. Output checks: dat2", { tolerance = 0.0001 ) + # exp2_2 + expect_equal( + which(is.na(Corr(exp2_2, obs2_2, memb_dim = 'member', memb = FALSE, pval = FALSE)$corr)), + 1:2 + ) + expect_equal( + Corr(exp2_2, obs2_2, memb_dim = 'member', memb = FALSE, pval = FALSE)$corr[-c(1:2)], + Corr(exp2, obs2, memb_dim = 'member', memb = FALSE, pval = FALSE)$corr[-c(1:2)] + ) + }) + ############################################## test_that("4. 
Output checks: dat3", { # individual member diff --git a/tests/testthat/test-NAO.R b/tests/testthat/test-NAO.R index 2da0d407ea986175a9a3d2c73605bd5aead29451..05fcd22caec34d412f254f64e9819484a0269e04 100644 --- a/tests/testthat/test-NAO.R +++ b/tests/testthat/test-NAO.R @@ -16,6 +16,7 @@ context("s2dv::NAO tests") obs2 <- array(rnorm(108), dim = c(sdate = 3, ftime = 4, lat = 3, lon = 3)) lat2 <- c(80, 50, 20) lon2 <- c(-80, 0, 40) + ############################################## test_that("1. Input checks", { @@ -208,6 +209,19 @@ test_that("2. dat1", { c(-0.1139683, 0.1056687, 0.1889449), tolerance = 0.0001 ) + expect_equal( + dim(NAO(exp1, obs1, lat = lat1, lon = lon1, ftime_avg = 1)$exp), + c(sdate = 3, member = 2, dataset = 1) + ) + expect_equal( + dim(NAO(exp1, obs1, lat = lat1, lon = lon1, ftime_avg = NULL)$exp), + c(sdate = 3, member = 2, dataset = 1, ftime = 4) + ) + expect_equal( + c(NAO(exp1, obs1, lat = lat1, lon = lon1, ftime_avg = NULL)$exp[,,1,1]), + c(NAO(exp1, obs1, lat = lat1, lon = lon1, ftime_avg = 1)$exp) + ) + }) ############################################## diff --git a/tests/testthat/test-RMS.R b/tests/testthat/test-RMS.R index f69cfd8a1de8e4db627dcfb5e9d0e0ca2a845080..bf059efa722e33951bbc0955189791abfe282e24 100644 --- a/tests/testthat/test-RMS.R +++ b/tests/testthat/test-RMS.R @@ -24,6 +24,12 @@ context("s2dv::RMS tests") set.seed(2) obs3 <- array(rnorm(80), dim = c(sdate = 5, ftime = 2, lon = 1, lat = 4)) + # dat4 + set.seed(1) + exp4 <- array(rnorm(120), dim = c(sdates = 5, ftimes = 2, lon = 1, lat = 1)) + + set.seed(2) + obs4 <- array(rnorm(80), dim = c(sdates = 5, ftimes = 2, lon = 1, lat = 1)) ############################################## test_that("1. Input checks", { @@ -205,6 +211,14 @@ test_that("4. Output checks: dat3", { c(1.6458118, 0.8860392, 0.8261295, 1.1681939, 2.1693538, 1.3064454, 0.5384229, 1.1215333), tolerance = 0.00001 ) + expect_equal( + dim(RMS(exp3, obs3, dat_dim = NULL, conf = FALSE)$rms), + c(ftime = 2, lon = 1, lat = 4) + ) + expect_equal( + dim(RMS(exp4, obs4, time_dim = 'sdates', dat_dim = NULL, conf = TRUE)$rms), + c(ftimes = 2, lon = 1, lat = 1) + ) }) ############################################## \ No newline at end of file diff --git a/tests/testthat/test-RMSSS.R b/tests/testthat/test-RMSSS.R index d592130fc7c8f1ea6478da94a4e5d8677586fdb0..fa019e60ab94ba053a54ecad7ce5289297e10ad2 100644 --- a/tests/testthat/test-RMSSS.R +++ b/tests/testthat/test-RMSSS.R @@ -1,23 +1,21 @@ context("s2dv::RMSSS tests") ############################################## - # case 0 - set.seed(1) - exp0 <- array(rnorm(15), dim = c(sdate = 3, dataset = 5)) - set.seed(2) - obs0 <- array(rnorm(6), dim = c(sdate = 3, dataset = 2)) - # case 1 set.seed(1) - exp1 <- array(rnorm(15), dim = c(time = 3, memb = 5)) + exp1 <- array(rnorm(15), dim = c(sdate = 3, dataset = 5)) set.seed(2) - obs1 <- array(rnorm(6), dim = c(time = 3, memb = 2)) - + obs1 <- array(rnorm(6), dim = c(sdate = 3, dataset = 2)) + set.seed(3) + ref1_1 <- array(rnorm(15), dim = c(sdate = 3, dataset = 5)) + ref1_2 <- exp1[, 3] + dim(ref1_2) <- c(sdate = 3) + # case 2 set.seed(3) - exp2 <- array(rnorm(120), dim = c(sdate = 10, dat = 1, lon = 3, lat = 2, dataset = 2)) + exp2 <- array(rnorm(120), dim = c(time = 10, dat = 1, lon = 3, lat = 2, dataset = 2)) set.seed(4) - obs2 <- array(rnorm(60), dim = c(dat = 1, sdate = 10, dataset = 1, lat = 2, lon = 3)) + obs2 <- array(rnorm(60), dim = c(dat = 1, time = 10, dataset = 1, lat = 2, lon = 3)) # case 3: vector set.seed(5) @@ -27,14 +25,23 @@ 
context("s2dv::RMSSS tests") # case 4 set.seed(7) - exp4 <- array(rnorm(120), dim = c(sdate = 10, dat = 1, lon = 3, lat = 2)) + exp4 <- array(rnorm(60), dim = c(sdate = 10, lon = 3, lat = 2)) set.seed(8) - obs4 <- array(rnorm(60), dim = c(dat = 1, sdate = 10, lat = 2, lon = 3)) + obs4 <- array(exp4 + rnorm(60) / 2, dim = dim(exp4)) + + # case 5: memb_dim + set.seed(1) + exp5 <- array(rnorm(45), dim = c(sdate = 3, dataset = 5, member = 3)) + set.seed(2) + obs5 <- array(rnorm(3), dim = c(sdate = 3, dataset = 1, member = 1)) + set.seed(3) + ref5 <- array(rnorm(6), dim = c(sdate = 3, member = 2)) + ############################################## test_that("1. Input checks", { - + ## exp and obs (1) expect_error( RMSSS(c(), c()), "Parameter 'exp' and 'obs' cannot be NULL." @@ -53,38 +60,53 @@ test_that("1. Input checks", { "Parameter 'exp' and 'obs' must have dimension names." ) expect_error( - RMSSS(array(1:10, dim = c(a = 2, c = 5)), array(1:4, dim = c(a = 2, b = 2))), + RMSSS(array(1:10, dim = c(a = 3, c = 5)), array(1:4, dim = c(a = 3, b = 5)), time_dim = 'a', dat_dim = NULL), "Parameter 'exp' and 'obs' must have same dimension name" ) + ## time_dim expect_error( RMSSS(exp1, obs1, time_dim = 1), "Parameter 'time_dim' must be a character string." ) expect_error( - RMSSS(exp0, obs0, time_dim = 'a'), + RMSSS(exp1, obs1, time_dim = 'a'), "Parameter 'time_dim' is not found in 'exp' or 'obs' dimension." ) + ## dat_dim expect_error( - RMSSS(exp0, obs0, dat_dim = NA), + RMSSS(exp1, obs1, dat_dim = NA), "Parameter 'dat_dim' must be a character string." ) expect_error( - RMSSS(exp0, obs0, dat_dim = 'memb'), + RMSSS(exp1, obs1, dat_dim = 'memb'), paste0("Parameter 'dat_dim' is not found in 'exp' or 'obs' dimension.", " Set it as NULL if there is no dataset dimension.") ) + ## pval expect_error( - RMSSS(exp0, obs0, pval = c(T, T)), + RMSSS(exp1, obs1, pval = c(T, T)), "Parameter 'pval' must be one logical value." ) + ## sign + expect_error( + RMSSS(exp1, obs1, sign = 0.05), + "Parameter 'sign' must be one logical value." + ) + ## alpha + expect_error( + RMSSS(exp1, obs1, alpha = T), + "Parameter 'alpha' must be one numeric value." + ) + ## ncores expect_error( - RMSSS(exp0, obs0, ncores = 1.4), + RMSSS(exp1, obs1, ncores = 1.4), "Parameter 'ncores' must be a positive integer." ) + ## exp and obs (2) expect_error( RMSSS(exp = array(1:10, dim = c(sdate = 1, dataset = 5, a = 1)), obs = array(1:4, dim = c(a = 1, sdate = 2, dataset = 2))), - "Parameter 'exp' and 'obs' must have same length of all dimension except 'dat_dim'." + "Parameter 'exp' and 'obs' must have same length of all dimension except 'memb_dim' and 'dat_dim'." ) expect_error( RMSSS(exp = array(1:10, dim = c(sdate = 1, dataset = 5, a = 1)), @@ -96,7 +118,7 @@ test_that("1. Input checks", { ############################################## test_that("2. Output checks: case 1", { - res1_1 <- RMSSS(exp1, obs1, time_dim = 'time', dat_dim = 'memb') + res1_1 <- RMSSS(exp1, obs1, dat_dim = 'dataset') expect_equal( dim(res1_1$rmsss), c(nexp = 5, nobs = 2) @@ -110,12 +132,17 @@ test_that("2. 
Output checks: case 1", { -0.5449538, tolerance = 0.00001 ) + expect_equal( + as.vector(res1_1$p.val)[3:7], + c(0.4914588, 0.5134658, 0.6428701, 0.4919943, 0.8672634), + tolerance = 0.001 + ) exp1_2 <- exp1 exp1_2[2:4] <- NA obs1_2 <- obs1 obs1_2[1:2] <- NA - res1_2 <- RMSSS(exp1_2, obs1_2, time_dim = 'time', dat_dim = 'memb', pval = TRUE) + res1_2 <- RMSSS(exp1_2, obs1_2, dat_dim = 'dataset', pval = TRUE) expect_equal( length(res1_2$rmsss[which(is.na(res1_2$rmsss))]), @@ -127,6 +154,66 @@ test_that("2. Output checks: case 1", { tolerance = 0.00001 ) + #ref + res1_3 <- RMSSS(exp1, obs1, ref = ref1_1, dat_dim = 'dataset') + expect_equal( + dim(res1_3$rmsss), + c(nexp = 5, nobs = 2) + ) + expect_equal( + dim(res1_3$p.val), + c(nexp = 5, nobs = 2) + ) + expect_equal( + as.vector(res1_3$rmsss[2, ]), + c(-1.197432, -8.879809), + tolerance = 0.0001 + ) + expect_equal( + as.vector(res1_3$p.val[2, ]), + c(0.8284354, 0.9898591), + tolerance = 0.0001 + ) + res1_4 <- RMSSS(exp1, obs1, ref = ref1_2, dat_dim = 'dataset', sign = T, alpha = 0.3) + expect_equal( + dim(res1_4$rmsss), + c(nexp = 5, nobs = 2) + ) + expect_equal( + dim(res1_4$sign), + c(nexp = 5, nobs = 2) + ) + expect_equal( + as.vector(res1_4$rmsss[2, ]), + c(-0.9249772, -0.5624465), + tolerance = 0.0001 + ) + expect_equal( + as.vector(res1_4$p[2, ]), + c(0.7874844, 0.7094070), + tolerance = 0.0001 + ) + expect_equal( + as.vector(res1_4$sign), + c(rep(F, 5), T, rep(F, 4)) + ) + + # Random Walk + suppressWarnings({ + res1_5 <- RMSSS(exp1, obs1, ref = ref1_1, dat_dim = 'dataset', sig_method = "Random Walk", pval = F, sign = T) + }) + expect_equal( + as.vector(res1_5$sign), + rep(F, 10) + ) + suppressWarnings({ + res1_6 <- RMSSS(exp1, obs1, ref = NULL, dat_dim = 'dataset', sig_method = "Random Walk", pval = F, sign = T) + }) + expect_equal( + as.vector(res1_6$sign), + rep(F, 10) + ) + }) @@ -134,16 +221,16 @@ test_that("2. Output checks: case 1", { test_that("3. Output checks: case 2", { expect_equal( - dim(RMSSS(exp2, obs2)$rmsss), + dim(RMSSS(exp2, obs2, time_dim = 'time')$rmsss), c(nexp = 2, nobs = 1, dat = 1, lon = 3, lat = 2) ) expect_equal( - mean(RMSSS(exp2, obs2)$rmsss), + mean(RMSSS(exp2, obs2, time_dim = 'time')$rmsss), -0.3912208, tolerance = 0.00001 ) expect_equal( - range(RMSSS(exp2, obs2)$p.val), + range(RMSSS(exp2, obs2, time_dim = 'time')$p.val), c(0.2627770, 0.9868412), tolerance = 0.00001 ) @@ -171,17 +258,55 @@ test_that("5. 
Output checks: case 4", { expect_equal( dim(RMSSS(exp4, obs4, dat_dim = NULL)$rmsss), - c(dat = 1, lon = 3, lat = 2) + c(lon = 3, lat = 2) + ) + expect_equal( + dim(RMSSS(exp4, obs4, dat_dim = NULL)$p.val), + c(lon = 3, lat = 2) ) expect_equal( - mean(RMSSS(exp4, obs4, dat_dim = NULL)$rmsss), - -0.3114424, + as.vector(RMSSS(exp4, obs4, dat_dim = NULL)$rmsss), + c(0.5393823, 0.6818405, 0.4953423, 0.4093817, 0.5972085, 0.5861135), tolerance = 0.00001 ) expect_equal( - range(RMSSS(exp4, obs4, dat_dim = NULL)$p.val), - c(0.3560534, 0.9192801), + as.vector(RMSSS(exp4, obs4, dat_dim = NULL)$p.val), + c(0.015203983, 0.001091360, 0.026987112, 0.066279877, 0.006161059, 0.007437649), tolerance = 0.00001 ) + expect_equal( + names(RMSSS(exp4, obs4, dat_dim = NULL)), + c('rmsss', 'p.val') + ) + expect_equal( + names(RMSSS(exp4, obs4, dat_dim = NULL, sign = T, pval = F)), + c('rmsss', 'sign') + ) + expect_equal( + names(RMSSS(exp4, obs4, dat_dim = NULL, sign = T, pval = T)), + c('rmsss', 'p.val', 'sign') + ) + expect_equal( + as.vector(RMSSS(exp4, obs4, dat_dim = NULL, sign = T, pval = F)$sign), + c(T, T, T, F, T, T) + ) + expect_equal( + as.vector(RMSSS(exp4, obs4, dat_dim = NULL, sign = T, pval = F, alpha = 0.01)$sign), + c(F, T, F, F, T, T) + ) + +}) + +############################################## +test_that("6. Output checks: case 5", { + res5_1 <- RMSSS(exp5, obs5, ref = ref5, dat_dim = 'dataset', memb_dim = 'member') + res5_2 <- RMSSS(s2dv::MeanDims(exp5, 'member'), s2dv::MeanDims(obs5, 'member'), + ref = s2dv::MeanDims(ref5, 'member'), dat_dim = 'dataset') + expect_equal( + res5_1, + res5_2 + ) + }) + diff --git a/tests/testthat/test-ROCSS.R b/tests/testthat/test-ROCSS.R new file mode 100644 index 0000000000000000000000000000000000000000..a95d0ba1b80f637556f4d0b014880d8d930a0b84 --- /dev/null +++ b/tests/testthat/test-ROCSS.R @@ -0,0 +1,229 @@ +context("s2dv::ROCSS tests") + +############################################## + +# dat1 +set.seed(1) +exp1 <- array(rnorm(60), dim = c(member = 3, sdate = 10, lat = 2)) +set.seed(2) +obs1 <- array(rnorm(20), dim = c(sdate = 10, lat = 2)) +set.seed(3) +ref1 <- array(rnorm(40), dim = c(member = 2, sdate = 10, lat = 2)) + +# dat2 +set.seed(1) +exp2 <- array(rnorm(30), dim = c(member = 3, sdate = 10)) +set.seed(2) +obs2 <- array(rnorm(10), dim = c(sdate = 10)) +set.seed(3) +ref2 <- array(rnorm(20), dim = c(member = 2, sdate = 10)) + + +# dat3 +set.seed(1) +exp3 <- array(rnorm(60), dim = c(member = 2, sdate = 10, dataset = 3)) +set.seed(2) +obs3 <- array(rnorm(20), dim = c(member = 1, sdate = 10, dataset = 2)) +set.seed(3) +ref3 <- array(rnorm(60), dim = c(member = 2, sdate = 10, dataset = 3)) +set.seed(3) +ref3_2 <- array(rnorm(20), dim = c(member = 2, sdate = 10)) + +############################################## +test_that("1. Input checks", { + # exp and obs (1) + expect_error( + ROCSS(c()), + "Parameter 'exp' must be a numeric array." + ) + expect_error( + ROCSS(exp1, c()), + "Parameter 'obs' must be a numeric array." + ) + + # time_dim + expect_error( + ROCSS(exp1, obs1, time_dim = 1), + "Parameter 'time_dim' must be a character string." + ) + expect_error( + ROCSS(exp1, obs1, time_dim = 'time'), + "Parameter 'time_dim' is not found in 'exp' or 'obs' dimension." + ) + expect_error( + ROCSS(exp2, obs2, array(rnorm(20), dim = c(member = 2, time = 10))), + "Parameter 'time_dim' is not found in 'ref' dimension." + ) + # memb_dim + expect_error( + ROCSS(exp1, obs1, memb_dim = TRUE), + "Parameter 'memb_dim' must be a character string." 
+ ) + expect_error( + ROCSS(exp1, obs1, memb_dim = 'memb'), + "Parameter 'memb_dim' is not found in 'exp' dimension." + ) + expect_error( + ROCSS(exp2, obs2, array(rnorm(20), dim = c(memb = 2, sdate = 10))), + "Parameter 'memb_dim' is not found in 'ref' dimension." + ) + # exp, ref, and obs (2) + expect_error( + ROCSS(exp1, array(1:9, dim = c(sdate = 9))), + "Parameter 'exp' and 'obs' must have same length of all dimensions except 'memb_dim' and 'dat_dim'." + ) + expect_error( + ROCSS(exp3, obs3, array(rnorm(60), dim = c(member = 2, sdate = 10, dataset = 4)), dat_dim = 'dataset'), + "If parameter 'ref' has dataset dimension, it must be equal to dataset dimension of 'exp'." + ) + expect_error( + ROCSS(exp1, obs1, ref2), + "Parameter 'exp' and 'ref' must have the same length of all dimensions except 'memb_dim' and 'dat_dim' if there is only one reference dataset." + ) + # prob_thresholds + expect_error( + ROCSS(exp1, obs1, ref1, prob_thresholds = 1), + "Parameter 'prob_thresholds' must be a numeric vector between 0 and 1." + ) + # indices_for_clim + expect_error( + ROCSS(exp1, obs1, ref1, indices_for_clim = array(1:6, dim = c(2, 3))), + "Parameter 'indices_for_clim' must be NULL or a numeric vector." + ) + expect_error( + ROCSS(exp1, obs1, indices_for_clim = 3:11), + "Parameter 'indices_for_clim' should be the indices of 'time_dim'." + ) + # cross.val + expect_error( + ROCSS(exp1, obs1, cross.val = 1), + "Parameter 'cross.val' must be either TRUE or FALSE." + ) + + # ncores + expect_error( + ROCSS(exp2, obs2, ncores = 1.5), + "Parameter 'ncores' must be either NULL or a positive integer." + ) + +}) + +############################################## +test_that("2. Output checks: dat1", { + +expect_equal( + dim(ROCSS(exp1, obs1)), + c(cat = 3, lat = 2) +) +expect_equal( + dim(ROCSS(exp1, obs1, ref1)), + c(cat = 3, lat = 2) +) +expect_equal( +c(ROCSS(exp1, obs1)[1, ]), +c(0.3333333, 0.0000000), +tolerance = 0.0001 +) +expect_equal( +c(ROCSS(exp1, obs1)[2, ]), +c(0.0000000, 0.6666667), +tolerance = 0.0001 +) +expect_equal( +c(ROCSS(exp1, obs1)[3, ]), +c(0.5238095, 0.3809524), +tolerance = 0.0001 +) +# with ref +expect_equal( +c(ROCSS(exp1, obs1, ref1)[1, ]), +c(0.53333333, 0.08695652), +tolerance = 0.0001 +) +expect_equal( +c(ROCSS(exp1, obs1, ref1)[2, ]), +c(0.4545455, 0.6190476), +tolerance = 0.0001 +) +expect_equal( +c(ROCSS(exp1, obs1, ref1)[3, ]), +c(0.5238095, 0.5357143), +tolerance = 0.0001 +) + +}) + +############################################## +test_that("3. Output checks: dat2", { + +expect_equal( + dim(ROCSS(exp2, obs2, prob_thresholds = c(0.25, 0.5, 0.75))), + c(cat = 4) +) +expect_equal( + dim(ROCSS(exp2, obs2, ref2, prob_thresholds = c(0.25, 0.5, 0.75))), + c(cat = 4) +) +# without ref +expect_equal( +c(ROCSS(exp2, obs2)), +c(0.3333333, 0, 0.5238095), +tolerance = 0.0001 +) +expect_equal( +c(ROCSS(exp2, obs2, prob_thresholds = c(0.25, 0.5, 0.75))), +c(0.1875000, -0.0952381, 0.0000000, -0.7500000), +tolerance = 0.0001 +) +expect_equal( +c(ROCSS(exp2, obs2, indices_for_clim = 2:6)), +c(0.56, 0.00, 0.25), +tolerance = 0.0001 +) +expect_equal( +c(ROCSS(exp2, obs2, ref2, indices_for_clim = 2:6)), +c(0.6333333, 0.0000000, 0.2500000), +tolerance = 0.0001 +) + + +}) + +############################################## +test_that("4. 
Output checks: dat3", { + +expect_equal( + dim(ROCSS(exp3, obs3, dat_dim = 'dataset')), + c(nexp = 3, nobs = 2, cat = 3) +) +expect_equal( + dim(ROCSS(exp3, obs3, ref3, dat_dim = 'dataset')), + c(nexp = 3, nobs = 2, cat = 3) +) +expect_equal( + dim(ROCSS(exp3, obs3, ref3_2, dat_dim = 'dataset')), + c(nexp = 3, nobs = 2, cat = 3) +) + +expect_equal( + c(ROCSS(exp3, obs3, ref3, dat_dim = 'dataset')[1, , ]), + c(0.3, -0.25, 0.3636364, -0.8095238, -0.4285714,0), + tolerance = 0.0001 +) +expect_equal( + c(ROCSS(exp3, obs3, ref3, dat_dim = 'dataset')[2, , ]), + c(0.5652174, 0, 0, -0.4285714, -0.4, 0), + tolerance = 0.0001 +) +expect_equal( + c(ROCSS(exp3, obs3, ref3_2, dat_dim = 'dataset')[1, , ]), + c(0.3, -0.25, 0.3636364, -0.8095238, -0.4285714, 0), + tolerance = 0.0001 +) +expect_equal( + c(ROCSS(exp3, obs3, ref3_2, dat_dim = 'dataset')[2, , ]), + c(0.66666667, 0.04166667, 0.40909091, -0.42857143, -0.33333333, -0.33333333), + tolerance = 0.0001 +) + +}) diff --git a/tests/testthat/test-RPS.R b/tests/testthat/test-RPS.R index 9363a419f566712ce7fbd9d4a1c6f511b1c028eb..51ba992d93b3e1ee4828bb095bbe363126a8b716 100644 --- a/tests/testthat/test-RPS.R +++ b/tests/testthat/test-RPS.R @@ -84,6 +84,11 @@ test_that("1. Input checks", { RPS(exp1, obs1, Fair = 1), "Parameter 'Fair' must be either TRUE or FALSE." ) + # cross.val + expect_error( + RPS(exp1, obs1, cross.val = 1), + "Parameter 'cross.val' must be either TRUE or FALSE." + ) # weights expect_error( RPS(exp1, obs1, weights = c(0, 1)), @@ -141,6 +146,20 @@ test_that("2. Output checks: dat1", { c(0.3692964, 0.5346627), tolerance = 0.0001 ) + expect_equal( + dim(RPS(exp1, obs1, cross.val = T)), + c(lat = 2) + ) + expect_equal( + as.vector(RPS(exp1, obs1, cross.val = T)), + c(0.3111111, 0.5333333), + tolerance = 0.0001 + ) + expect_equal( + as.vector(RPS(exp1, obs1, cross.val = T, weights = weights1)), + c(0.3559286, 0.6032109), + tolerance = 0.0001 + ) }) diff --git a/tests/testthat/test-RPSS.R b/tests/testthat/test-RPSS.R index 79769057a6f69fe1f76a0d2f6338ea2b9e268318..36efee8df0c317e06a46439643da8dc81a0561ee 100644 --- a/tests/testthat/test-RPSS.R +++ b/tests/testthat/test-RPSS.R @@ -116,6 +116,11 @@ test_that("1. Input checks", { RPSS(exp1, obs1, Fair = 1), "Parameter 'Fair' must be either TRUE or FALSE." ) + # cross.val + expect_error( + RPSS(exp1, obs1, cross.val = 1), + "Parameter 'cross.val' must be either TRUE or FALSE." + ) # weights_exp and weights_ref expect_error( RPSS(exp1, obs1, weights_exp = c(0, 1)), @@ -235,6 +240,21 @@ test_that("2. Output checks: dat1", { as.vector(RPSS(exp1, obs1, ref1, prob_thresholds = seq(0.1, 0.9, 0.1))$sign), c(T, F) ) + expect_equal( + names(RPSS(exp1, obs1, cross.val = T)), + c("rpss", "sign") + ) + expect_equal( + as.vector(RPSS(exp1, obs1, cross.val = T)$rpss), + c(0.2631579, -0.1707317), + tolerance = 0.0001 + ) + expect_equal( + as.vector(RPSS(exp1, obs1, ref1, weights_exp = weights1, weights_ref = weights1, cross.val = T)$rpss), + c(0.6689428, 0.4396209), + tolerance = 0.0001 + ) + }) @@ -343,7 +363,7 @@ test_that("4. 
Output checks: dat3", { ) expect_equal( as.vector(RPSS(exp3, obs3, dat_dim = 'dataset')$sign)[1:3], - c(FALSE, FALSE, FALSE), + c(FALSE, FALSE, TRUE), ) expect_equal( mean(RPSS(exp3, obs3, dat_dim = 'dataset', weights_exp = weights3, Fair = T)$rpss), diff --git a/tests/testthat/test-RandomWalkTest.R b/tests/testthat/test-RandomWalkTest.R index d50b9ce9118910e7f7558dae1687b2f6467c0439..a0462c890c4052fada9b9516afd636e94f0baf3c 100644 --- a/tests/testthat/test-RandomWalkTest.R +++ b/tests/testthat/test-RandomWalkTest.R @@ -1,11 +1,16 @@ context("s2dv::RandomWalkTest tests") ############################################## + #dat1 set.seed(1) dat1_A <- array(rnorm(64), dim = c(sdate = 4, ftime = 4, lat = 2, lon = 2)) set.seed(2) dat1_B <- array(rnorm(64), dim = c(sdate = 4, ftime = 4, lat = 2, lon = 2)) - + #dat2 + set.seed(1) + dat2_A <- array(rnorm(8), dim = c(sdate = 4, ftime = 2)) + set.seed(2) + dat2_B <- array(rnorm(8), dim = c(sdate = 4, ftime = 2)) ############################################## test_that("1. Input checks", { @@ -31,7 +36,15 @@ test_that("1. Input checks", { "Parameter 'time_dim' is not found in 'skill_A' or 'skill_B' dimension." ) expect_error( - RandomWalkTest(dat1_A, dat1_B, ncores = T), + RandomWalkTest(dat1_A, dat1_B, alpha = 1), + "Parameter 'alpha' must be a number between 0 and 1." + ) + expect_error( + RandomWalkTest(dat1_A, dat1_B, test.type = 1), + "Parameter 'test.type' must be 'two.sided.approx', 'two.sided', 'greater', or 'less'." + ) + expect_error( + RandomWalkTest(dat1_A, dat1_B, ncores = T, sign = T, pval = F), "Parameter 'ncores' must be a positive integer." ) @@ -39,7 +52,7 @@ test_that("1. Input checks", { ############################################## test_that("2. Output checks: dat1", { - res <- RandomWalkTest(dat1_A, dat1_B) + res <- RandomWalkTest(dat1_A, dat1_B, sign = T, pval = F) expect_equal( length(res), @@ -47,14 +60,14 @@ test_that("2. Output checks: dat1", { ) expect_equal( names(res), - c("score", "signif") + c("score", "sign") ) expect_equal( dim(res$score), c(ftime = 4, lat = 2, lon = 2) ) expect_equal( - dim(res$signif), + dim(res$sign), c(ftime = 4, lat = 2, lon = 2) ) expect_equal( @@ -62,7 +75,7 @@ test_that("2. Output checks: dat1", { TRUE ) expect_equal( - is.logical(res$signif), + is.logical(res$sign), TRUE ) expect_equal( @@ -71,11 +84,80 @@ test_that("2. Output checks: dat1", { ) expect_equal( res$score[, 1, 1], - c(0, 0, -2, -2) + c(0, 0, 2, 2) ) expect_equal( res$score[, 1, 2], - c(0, 4, 2, 0) + c(0, -4, -2, 0) + ) + +}) + +############################################## +test_that("3. 
Output checks: dat2", { + + res1 <- RandomWalkTest(dat2_A, dat2_B, sign = T, pval = T, test.type = "two.sided", alpha = 0.1) + + expect_equal( + names(res1), + c("score", "p.val", "sign") + ) + expect_equal( + dim(res1$score), + c(ftime = 2) + ) + expect_equal( + dim(res1$p.val), + c(ftime = 2) + ) + expect_equal( + dim(res1$sign), + c(ftime = 2) + ) + expect_equal( + as.vector(res1$score), + c(0, 0) + ) + expect_equal( + as.vector(res1$p), + c(1, 1) + ) + expect_equal( + as.vector(res1$sign), + c(F, F) + ) + +#------------------------------ + res2 <- RandomWalkTest(dat2_A + 1, dat2_B + 2, sign = T, pval = T, test.type = "greater", alpha = 0.1) + + expect_equal( + as.vector(res2$score), + c(2, 4) + ) + expect_equal( + as.vector(res2$p), + c(0.3125, 0.0625) + ) + expect_equal( + as.vector(res2$sign), + c(F, T) + ) + +#------------------------------ + res3 <- RandomWalkTest(dat2_A, dat2_B - 2, sign = T, pval = T, test.type = "less", alpha = 0.1) + expect_equal( + as.vector(res3$score), + c(-2, -4) + ) + expect_equal( + as.vector(res3$p), + c(0.3125, 0.0625) + ) + expect_equal( + as.vector(res3$sign), + c(F, T) ) }) + + diff --git a/tests/testthat/test-Reorder.R b/tests/testthat/test-Reorder.R index 0e8e5b5564f2291e13ad614575bf2470e5c26ed0..b17259e672e21443ab010c03a6a97ded3c8c0505 100644 --- a/tests/testthat/test-Reorder.R +++ b/tests/testthat/test-Reorder.R @@ -7,10 +7,15 @@ context("s2dv::Reorder tests") # dat2 set.seed(10) dat2 <- array(rnorm(10), dim = c(2, 1, 5)) + dat2_1 <- dat2 + attr(dat2_1, 'dimensions') <- c('sdate', 'time', 'region') # dat3 dat3 <- array(c(1:30), dim = c(dat = 1, 3, ftime = 2, 5)) + # dat4: A character array + dat4 <- array(paste0('a', 1:24), dim = c(b = 2, c = 3, d = 4)) + ############################################## test_that("1. Input checks", { @@ -66,10 +71,18 @@ test_that("2. Output checks: dat1", { dim(Reorder(dat1, c(2,1,4,3))), c(sdate = 3, dat = 1, lon = 5, ftime = 2) ) + expect_equal( + c(Reorder(dat1, c(2,1,4,3))[2,1,,1]), + c(2, 8, 14, 20, 26) + ) expect_equal( dim(Reorder(dat1, c('sdate', 'dat', 'lon', 'ftime'))), c(sdate = 3, dat = 1, lon = 5, ftime = 2) ) + expect_equal( + c(Reorder(dat1, c('sdate', 'dat', 'lon', 'ftime'))[2,1,,1]), + c(2, 8, 14, 20, 26) + ) expect_equal( max(Reorder(dat1, c(2, 1, 4, 3)), na.rm = TRUE), 30 @@ -85,6 +98,26 @@ test_that("3. Output checks: dat2", { dim(Reorder(dat2, c(2, 1, 3))), c(1, 2, 5) ) + expect_equal( + c(Reorder(dat2, c(2, 1, 3))[1, , 3]), + c(0.2945451, 0.3897943), + tolerance = 0.0001 + ) + expect_equal( + suppressWarnings(dim(Reorder(dat2_1, c('time', 'sdate', 'region')))), + c(1, 2, 5) + ) + expect_equal( + suppressWarnings(c(Reorder(dat2_1, c('time', 'sdate', 'region'))[1, , 3])), + c(0.2945451, 0.3897943), + tolerance = 0.0001 + ) + expect_equal( + suppressWarnings( + attributes(Reorder(dat2_1, c('time', 'sdate', 'region')))$dimensions + ), + c('time', 'sdate', 'region') + ) }) @@ -100,3 +133,16 @@ test_that("4. Output checks: dat3", { }) ############################################## +test_that("5. 
Output checks: dat4", { + res <- Reorder(dat4, c('d', 'c', 'b')) + expect_equal( + dim(res), + c(d = 4, c = 3, b = 2) + ) + expect_equal( + c(res[1, 1, ]), + c("a1", "a2") + ) + +}) + diff --git a/tests/testthat/test-Season.R b/tests/testthat/test-Season.R index 0b009a6e8aeffda408123303e13f97977b84ff30..f9bfc43cf17e5564c9ccaba71212af5dc9e03fed 100644 --- a/tests/testthat/test-Season.R +++ b/tests/testthat/test-Season.R @@ -15,6 +15,10 @@ context("s2dv::Season tests") set.seed(1) dat3 <- array(rnorm(12), dim = c(ftime = 12)) + # dat4 + set.seed(1) + dat4 <- array(rnorm(16), dim = c(sdate = 2, ftime = 8)) + ############################################## test_that("1. Input checks", { @@ -54,6 +58,22 @@ test_that("1. Input checks", { Season(dat1, monini = 1, moninf = 1, monsup = 'Jan'), "Parameter 'monsup' must be a positive integer between 1 and 12." ) + expect_error( + Season(dat4, monini = 1, moninf = 9, monsup = 10), + "Parameter 'moninf' is out of the range because 'monini' is 1 and time dimenision length is 8." + ) + expect_error( + Season(dat4, monini = 11, moninf = 9, monsup = 10), + "Parameter 'moninf' is out of the range because 'monini' is 11 and time dimenision length is 8." + ) + expect_error( + Season(dat4, monini = 1, moninf = 2, monsup = 9), + "The chosen month length exceeds the time dimension of 'data'." + ) + expect_error( + Season(dat4, monini = 11, moninf = 12, monsup = 7), + "The chosen month length exceeds the time dimension of 'data'." + ) expect_error( Season(dat1, monini = 1, moninf = 1, monsup = 2, method = 'mean'), "Parameter 'method' should be an existing R function, e.g., mean or sum." diff --git a/tests/testthat/test-Trend.R b/tests/testthat/test-Trend.R index 49a998229f5ca9336bb5a7c06c1eb8144922411e..07da6ce45c6b687c8e0ef9a7f746be1794d83b50 100644 --- a/tests/testthat/test-Trend.R +++ b/tests/testthat/test-Trend.R @@ -112,6 +112,22 @@ test_that("2. Output checks: dat1", { 0.1153846, tolerance = 0.001 ) + expect_equal( + dim(Trend(dat1)$trend), + c(stats = 2, dat = 1, sdate = 2) + ) + expect_equal( + dim(Trend(dat1)$p), + c(stats = 1, dat = 1, sdate = 2) + ) + expect_equal( + dim(Trend(dat1)$detrend), + c(ftime = 13, dat = 1, sdate = 2) + ) + expect_equal( + dim(Trend(dat1, polydeg = 3)$trend), + c(stats = 4, dat = 1, sdate = 2) + ) }) @@ -176,8 +192,32 @@ test_that("5. 
Output checks: dat4", {
    as.numeric(rep(NA, 5))
  )
  expect_equal(
-    Trend(dat4)$p.val[, 1, 1, 1],
-    c(NA, 0.01800594),
-    tolerance = 0.0001
+    dim(Trend(dat4)$p.val),
+    c(stats = 1, lat = 2, lon = 3, lev = 2)
+  )
+  expect_equal(
+    is.na(Trend(dat4)$p.val[, 1, 1, 1]),
+    TRUE
  )
+  expect_equal(
+    as.vector(Trend(dat4)$trend[1, 2, , ]),
+    as.vector(Trend(dat3)$trend[1, 2, , ])
+  )
+  expect_equal(
+    as.vector(Trend(dat4)$p.val[1, 2, , ]),
+    as.vector(Trend(dat3)$p.val[1, 2, , ])
+  )
+  expect_equal(
+    as.vector(Trend(dat4)$conf.l[1, 2, , ]),
+    as.vector(Trend(dat3)$conf.l[1, 2, , ])
+  )
+  expect_equal(
+    as.vector(Trend(dat4)$conf.u[1, 2, , ]),
+    as.vector(Trend(dat3)$conf.u[1, 2, , ])
+  )
+  expect_equal(
+    as.vector(Trend(dat4)$de[1, 2, , ]),
+    as.vector(Trend(dat3)$de[1, 2, , ])
+  )
+
})
diff --git a/vignettes/NAOindex_81to91.png b/vignettes/NAOindex_81to91.png
new file mode 100644
index 0000000000000000000000000000000000000000..7ee1220690eab76395e8d4dcde3f11a1c4d76553
Binary files /dev/null and b/vignettes/NAOindex_81to91.png differ
diff --git a/vignettes/NAOpredictions.png b/vignettes/NAOpredictions.png
new file mode 100644
index 0000000000000000000000000000000000000000..ec96e8651f8ccf72933be25b53213ac1afa36d8f
Binary files /dev/null and b/vignettes/NAOpredictions.png differ
diff --git a/vignettes/RMSSSforNAOprediction.png b/vignettes/RMSSSforNAOprediction.png
new file mode 100644
index 0000000000000000000000000000000000000000..6e394b353f118262080045fb28c7423945405514
Binary files /dev/null and b/vignettes/RMSSSforNAOprediction.png differ
diff --git a/vignettes/ScoringForecast.md b/vignettes/ScoringForecast.md
new file mode 100644
index 0000000000000000000000000000000000000000..37c53e0cc1b11dcf86b6ccc69723bccf08cc0788
--- /dev/null
+++ b/vignettes/ScoringForecast.md
@@ -0,0 +1,255 @@
+---
+title: "Scoring a Seasonal Forecast"
+output: github_document
+---
+
+
+This vignette illustrates several examples of how to score a seasonal forecast using different functions in s2dv.
+
+
+### 1-Load dependencies
+
+This example requires the following system libraries:
+
+- libssl-dev
+- libnetcdf-dev
+- cdo
+
+
+The **s2dv R package** should be loaded by running the following lines in R, once it is integrated into the CRAN mirror.
+
+```r
+library(s2dv)
+```
+
+
+
+### 2-Define the problem and load the data with the corresponding parameters
+
+In this vignette we will quantify how skilled a seasonal forecast is. The model will be the EUROSIP multi-model seasonal forecasting system and our ground truth will be observations, represented by the ERAinterim reanalysis dataset.
+
+For more information about both datasets, see the following documentation:
+- [**ERAinterim**](https://www.ecmwf.int/en/forecasts/datasets/archive-datasets/reanalysis-datasets/era-interim).
+- [**EUROSIP system**](https://www.ecmwf.int/en/forecasts/documentation-and-support/long-range/seasonal-forecast-documentation/eurosip-user-guide/multi-model).
+
+Both datasets can be downloaded from the corresponding server. However, in this example we will use the BSC esarchive database. 
File path parameters for the Load() function are defined as follows:
+
+```r
+path_exp_EUROSIP <- list(name = 'system4_m1',
+                         path = file.path('/esarchive/exp/EUROSIP/ecmwf',
+                                          '$EXP_NAME$/monthly_mean',
+                                          '$VAR_NAME$_f6h/$VAR_NAME$_$START_DATE$.nc'))
+
+path_obs_eraI <- list(name = 'erainterim',
+                      path = file.path('/esarchive/recon/ecmwf',
+                                       '$OBS_NAME$/monthly_mean',
+                                       '$VAR_NAME$_f6h/$VAR_NAME$_$YEAR$$MONTH$.nc'))
+```
+
+Our case study will be predicting the North Atlantic Oscillation index ([**NAO**](https://en.wikipedia.org/wiki/North_Atlantic_oscillation)), usually defined as the difference in the sea level pressure anomaly between the permanent high pressure system over the Azores and the low pressure system over Iceland. Changes in the NAO index have a direct impact on European seasonal weather, since the NAO controls the intensity and pathways of the storms over the continent: positive NAO values are related to stronger storms and thus wet winters in Central and Northern Europe and its Atlantic facade, while negative NAO values correspond to reduced storm activity and rainfall in Northern Europe but increased rainfall in the South and the Mediterranean Sea. Especially during the months of November to April, the NAO is responsible for much of the variability of weather in the North Atlantic region, affecting wind speed and wind direction changes, changes in temperature and moisture distribution, and the intensity, number and track of storms.
+
+For this vignette we will use the whole North Atlantic as the geographical domain and we will select all forecasts starting in November for the 1981 - 1991 time period. The EUROSIP system provides 7-month-long forecasts including 51 different members.
+
+```r
+#North Atlantic domain definition
+lonNA <- c(-90, 40)
+latNA <- c(20, 80)
+
+#Selected time periods:
+startDates <- paste(1981:1991, '1101', sep = '')
+
+#Loading sea level pressure maps for the whole North Atlantic and for the specific period
+data <- Load(var = 'psl',
+             exp = list(path_exp_EUROSIP),
+             obs = list(path_obs_eraI),
+             sdates = startDates,
+             output = 'lonlat',
+             lonmin = lonNA[1], lonmax = lonNA[2],
+             latmin = latNA[1], latmax = latNA[2])
+
+
+#Retrieving observational and forecast data
+obs <- data$obs
+exp <- data$mod
+
+```
+
+
+### 3-NAO index, Forecast vs Observations
+
+As a first step we will compare the predicted intensity of the NAO index for observations and for the forecast system.
+
+In s2dv the NAO index is computed using a common alternative definition: the NAO is defined as the first mode of variability (computed using the Singular Value Decomposition method) of the sea level pressure in the North Atlantic region [20N-80N, 80W-40E]. The `NAO()` function requires as input anomalies that are previously computed using the `Ano_CrossValid()` function.
+
+```r
+#Computing anomalies along sdates dimension
+ano <- Ano_CrossValid(exp, obs)
+
+#Computing NAO index as the first mode of slp variability in the North Atlantic
+## Change lon from range [0, 360] to [-180, 180]
+nao <- NAO(ano$exp[, , , , , c(73:185, 1:57)], ano$obs[, , , , , c(73:185, 1:57)], lon = data$lon[c(73:185, 1:57)], lat = data$lat)
+
+#For clarification, we normalize the NAO index for each member dividing by the corresponding standard deviation
+nao_exp_sd <- apply(nao$exp, MARGIN = 1, sd, na.rm = T)
+nao_obs_sd <- sd(nao$obs)
+nao_exp_n <- nao$exp / nao_exp_sd
+nao_obs_n <- nao$obs / nao_obs_sd
+
+
+# Finally, plot the NAO index using box plots for all Decembers. 
+PlotBoxWhisker(t(nao_exp_n), nao_obs_n, toptitle = "NAO index, DJF",
+               monini = 12, yearini = 1981, freq = 1,
+               drawleg = F, fileout = NULL)
+legend(x = 3.8, y = 2.6, c('EUROSIP', 'EraInterim'), col = c(2, 4), pch = 15)
+```
+
+
+The figure above does not show good agreement between observations (blue line) and forecast (whisker boxes): the NAO signal is so diluted by the large dispersion among the 51 ensemble members that it almost disappears (values close to 0).
+
+
+### 4-Quantifying the skillfulness of the prediction. The RMSSS
+
+Let's quantify how good or bad this prediction is, while also looking for a better understanding of what is happening. To do so, we will use the Root Mean Square Error (RMSE), which is just the square root of the mean squared difference between the predicted and the true values.
+
+s2dv includes the `RMS()` function to directly calculate the RMSE. However, in order to get a better understanding of the output values we will use the `RMSSS()` function (Root Mean Square Error Skill Score). The RMSSS compares the RMSE with a reference RMSE, usually associated with the intrinsic variability of the system (for example, the standard deviation of the climatology). In s2dv the score is computed such that the best RMSSS score would be 1 (RMSE equals 0), while if RMSSS equals 0, the RMSE equals the variability reference of the system (i.e. the standard deviation). RMSSS can also be negative, meaning the RMSE is greater than the variability reference.
+
+```r
+#Defining the number of members
+Lmember <- length(exp[1, , 1, 1, 1, 1])
+
+#Calculating the RMSSS for the ensemble average, i.e., from the mean NAO time series of all members
+rmsss_m <- RMSSS(exp = MeanDims(nao_exp_n, 'member'),
+                 obs = nao_obs_n, dat_dim = NULL, pval = F)$rmsss
+
+#Computing the RMSSS for each member
+rmsss <- RMSSS(exp = nao_exp_n, obs = array(nao_obs_n, dim = c(sdate = 11, member = 1)), dat_dim = 'member', pval = F)$rmsss
+
+#Plotting.....
+layout(matrix(c(1, 2), 1, 2, byrow = TRUE))
+
+#Plotting RMSSS for all members
+plot(rmsss, type = 'h', lwd = 5, col = 'grey', xlab = 'Members', mgp = c(3,1,0),
+     main = sprintf('RMSSS for each member (RMSSS average %1.3f)', rmsss_m),
+     ylim = c(-1,1), cex.lab = 2, cex.axis = 2, cex.main = 1.9)
+grid(col = 'grey')
+lines(rep(0, Lmember), lwd = 1, col = 'black')
+
+#Plotting boxplot for selected members with higher RMSSS
+isel <- which(rmsss > 0.1)
+rmsss_m_sel <- RMSSS(exp = MeanDims(nao_exp_n[, isel], 'member'),
+                     obs = nao_obs_n,
+                     dat_dim = NULL, pval = F)$rmsss
+
+PlotBoxWhisker(t(nao_exp_n[, isel]), nao_obs_n,
+               toptitle = sprintf('NAO index, selected-members (RMSSS=%1.2f)', rmsss_m_sel),
+               monini = 12, yearini = 1981, freq = 1,
+               drawleg = F, fileout = NULL)
+legend(x = 4.95, y = 2.4, c('EUROSIP', 'EraInterim'),
+       col = c(2, 4), pch = 15, cex = 0.9, lty = 0)
+```
+
+
+The above figure shows very different RMSSS for different members (left plot). Most of them have an RMSSS close to 0, so the prediction error is close to the system variability. **The RMSSS for the whole ensemble is 0.091**, which means a not very useful ensemble prediction.
+
+However, we can select the members that present a better RMSSS, i.e. those closer to 1, and recompute the ensemble RMSSS. This is shown in the right plot, where only the 8th, 23rd, 40th and 45th members are used. Now most of the marked NAO events are correctly predicted, giving an **RMSSS of 0.66 for this selected-members ensemble**.
+
+
+### 5-Quantifying the skillfulness of the prediction. 
The Brier Score

+Another option to quantify the goodness of this prediction is using the `BrierScore()` function. The BS measures the accuracy of a probabilistic categorical prediction by calculating the mean squared difference between the predicted probability and the actual observation. A BS of 0 corresponds to a perfect prediction (no difference between prediction and observation), while the worst score is 1.
+
+Moreover, the BS can be decomposed into 3 terms: BS = Reliability - Resolution + Uncertainty. Reliability refers to how close the predicted probabilities are to the true probabilities (i.e. the lower the reliability term, the lower the difference between prediction and observation and the better the score). Resolution relates to the difference between the predicted observation and the climatology (i.e. the higher the resolution, the greater the ability of the forecast to predict events different from the climatology). Uncertainty represents the inherent uncertainty of the event occurring (i.e. the higher the uncertainty, the more difficult it is to correctly predict the event).
+
+In the case of the NAO index, we can discretize it into a categorical variable, defining a NAO event whenever the index is greater than 0 and associating the event with the value 1. Negative NAO index values then correspond to the non-occurrence of a NAO event and are thus assigned the value 0. For the forecast, we can convert the NAO index prediction of all ensemble members into a NAO prediction probability by computing the proportion of members with a positive NAO prediction. For comparison, we do the same for the well-scoring selected-members ensemble of the previous section.
+
+
+```r
+
+#Defining number of sdates
+Lsdates <- length(startDates)
+
+#For observations
+nao_event_obs <- rep(0, Lsdates)
+nao_event_obs[which(nao_obs_n > 0)] <- 1
+
+#For all members
+nao_event_exp <- array(0, dim = c(sdate = Lsdates, member = Lmember))
+nao_event_exp[which(nao_exp_n > 0)] <- 1
+nao_event_exp_prob <- MeanDims(nao_event_exp, 'member')
+
+BS <- BrierScore(obs = nao_event_obs, exp = nao_event_exp_prob)
+
+BSscore <- BS$bs
+BSrel <- BS$rel
+BSres <- BS$res
+BSunc <- BS$unc
+
+#For selected members only
+nao_event_exp_sel <- array(0, dim = c(sdate = Lsdates, member = length(isel)))
+nao_event_exp_sel[which(nao_exp_n[, isel] > 0)] <- 1
+nao_event_exp_sel_prob <- MeanDims(nao_event_exp_sel, 'member')
+
+BS_sel <- BrierScore(obs = nao_event_obs, exp = nao_event_exp_sel_prob)
+
+BSscore_sel <- BS_sel$bs
+BSrel_sel <- BS_sel$rel
+BSres_sel <- BS_sel$res
+BSunc_sel <- BS_sel$unc
+
+
+#Plotting NAO events and prediction probabilities
+layout(matrix(c(1, 2), 1, 2, byrow = TRUE))
+
+#For all ensemble members
+plot(nao_event_obs - 0.5, type = 'p', lwd = 5, col = 'red',
+     xlab = 'NAO events', yaxt = 'n', ylab = 'NAO probabilities', mgp = c(3,1,0))
+grid(col = 'grey')
+lines(nao_event_obs - 0.5, type = 'h', lwd = 5, col = 'red', yaxt = 'n')
+lines(nao_event_exp_prob - 0.5, type = 'h', lwd = 4, col = 'blue', yaxt = 'n')
+lines(nao_event_exp_prob - 0.5, type = 'p', lwd = 3, col = 'blue', yaxt = 'n')
+axis(2, at = seq(-0.5, 0.5, 0.25), labels = seq(0, 1, 0.25), mgp = c(3,1,0))
+lines(rep(0, Lmember), lwd = 2, col = 'black')
+title('Predictions for all-members ensemble')
+
+#For selected-member ensemble only
+plot(nao_event_obs - 0.5, type = 'p', lwd = 5, col = 'red',
+     xlab = 'NAO events', yaxt = 'n', ylab = 'NAO probabilities', mgp = c(3,1,0))
+grid(col = 'grey')
+lines(nao_event_obs - 0.5, type = 'h', lwd = 5, col = 'red', yaxt 
= 'n')
+lines(nao_event_exp_sel_prob - 0.5, type = 'h', lwd = 4, col = 'blue', yaxt = 'n')
+lines(nao_event_exp_sel_prob - 0.5, type = 'p', lwd = 3, col = 'blue', yaxt = 'n')
+axis(2, at = seq(-0.5, 0.5, 0.25), labels = seq(0, 1, 0.25), mgp = c(3, 1, 0))
+lines(rep(0, Lmember), lwd = 2, col = 'black')
+title('Predictions for selected-members ensemble')
+
+```
+
+
+
+For the all-members ensemble, the results are:
+**Total BS = 0.213,**
+Reliability = 0.138,
+Resolution = 0.172,
+Uncertainty = 0.248.
+
+For the selected-members ensemble, the results are:
+**Total BS = 0.136,**
+Reliability = 0.015,
+Resolution = 0.127,
+Uncertainty = 0.248.
+
+To put the total Brier Score values into context, note that the BS definition is such that, in general, a forecast that always gives a 50% probability prediction for all cases would have a BS of 0.25.
+
+Taking this into account, the all-members ensemble prediction, with a BS of 0.213, seems only a bit better than flipping a coin in terms of the Brier Score. The selected-members ensemble instead presents a clear improvement in the total Brier Score (a lower value). This is due to the better score (lower value) achieved by the reliability term in this ensemble, in agreement with the previous RMSSS results and also shown in the right plot of the figure above; the prediction probabilities are much closer to the NAO occurrence distribution. However, the resolution scores are not that different between the all-members and the selected-members ensembles. This is because, even with a notable difference in RMSSS and reliability, the two ensembles mostly predict the correct occurrence of positive/negative NAO events equally well (left plot). Finally, the uncertainty is obviously the same for the two cases, since the observations to compare with are the same.
+
+
+### 6-Appendix: Forecast scores ranges
+
+| SCORE | RMSSS | BS (total) | Reliability | Resolution | Uncertainty    |
+|:-----:|:-----:|:----------:|:-----------:|:----------:|:--------------:|
+| Best  | 1     | 0          | 0           | 1          | Obs. dependent |
+| Worst | -Inf  | 1          | 1           | 0          | Obs. dependent |
diff --git a/vignettes/data_retrieval.md b/vignettes/data_retrieval.md
new file mode 100644
index 0000000000000000000000000000000000000000..ec2d1273a50b2fd64f29ff34d22bc80684342682
--- /dev/null
+++ b/vignettes/data_retrieval.md
@@ -0,0 +1,668 @@
+---
+title: "Data Retrieval"
+author: "Nicolau Manubens"
+reviser: "An-Chi Ho"
+date: "`r Sys.Date()`"
+output: pdf_document
+vignette: >
+  %\VignetteEngine{R.rsp::md}
+  %\VignetteIndexEntry{Data Retrieval}
+  %\usepackage[utf8]{inputenc}
+---
+
+Data retrieval
+==============
+One of the strong points of s2dv is its data retrieval module.
+Its aim is to load and homogenize forecast or hindcast data generated by
+climate models, usually in multiple runs and multiple ensemble members per
+run, as well as the corresponding observational time series obtained from
+either pure observations or reanalyses.
+
+A climate model run simulates a time period starting at a **starting date**
+and generates data for a series of time steps regularly spaced along the
+**forecast time** that make up the simulated time period. A model run is
+also usually referred to as a starting date.
+The **lead-time** at each forecast time step is the amount of simulated time
+since the beginning of the simulation, e.g., the first value of a
+monthly-averaged forecast has a lead-time of 0 months, the second value has a
+lead-time of 1 month and so on. 
In the context of s2dv, however,
+the concept of lead-time is used to refer to the forecast time steps:
+the first lead-time of a forecast is the value with a lead-time of 0 time units
+(the first time step), the second lead-time is the one with a lead-time of 1
+time unit (the second time step) and so on.
+Data at one forecast time (or lead-time) from different runs or starting dates
+make up a time series defined along the **actual time**.
+
+`Load()` is the interface function to the data retrieval module. Through it
+you can request previously monthly-averaged or daily-averaged 2-dimensional
+variables or global mean variables from as many experimental and observational
+multi-member datasets as needed, at the selected starting dates and forecast
+time steps, over any longitude-latitude region of the Earth surface.
+
+`Load()` automatically fetches the selected starting dates of the requested
+experimental datasets and the date-corresponding observational data,
+homogenizes it and loads it into an R object ready to be processed.
+If done manually, this homogenization procedure can be time-consuming and
+become a hurdle in the verification process.
+
+The object returned by `Load()` contains, among others, two arrays with the
+requested data, one for the experimental data and the other for observational
+data, with a distinctive dimension structure of s2dv:
+```r
+c(n. of datasets, n. of members, n. of starting dates, n. of lead-times, n. of latitudes, n. of longitudes)
+```
+
+The homogenization process consists of the following steps:
+  1. Regridding data. `Load()` relies on CDO tools for this task (see
+[**Installation**](../README.md)).
+  2. Applying masks to disable certain grid cells.
+  3. Disabling values beyond certain thresholds.
+  4. Computing area-weighted averages.
+
+(The 1st, 2nd and 4th steps only apply if loading 2-dimensional variables.)
+
+Supported file formats
+----------------------
+There are some guidelines that the files of the datasets must follow so that
+`Load()` is able to recognize, fetch and process them:
+ - Data files must be local or remote NetCDF 3/4 files, or be indexed in NCML
+catalogs provided by THREDDS servers.
+ - Data files must contain monthly or daily time series. Data in other
+frequencies can also be loaded as experimental data, but the automatic fetching
+of date-corresponding observations will fail.
+ - The only supported source (and target) grids are regular and gaussian grids
+(those definable with the CDO grid names that follow the patterns 'r<NX>x<NY>'
+or 't<RES>grid').
+ - The variable of interest must be defined over the dimensions
+(time[, members]) if it is a global-mean time series, or (time[, members],
+latitudes, longitudes) if it is a 2-dimensional field. These dimensions
+can be named at will and can be in any order. The least time-consuming orders
+to load are, however, (time[, members]) or (time[, members], latitudes,
+longitudes). Here, the dimension time refers to forecast time.
+ - If the data is 2-dimensional, a variable with the grid latitudes and a
+variable with the grid longitudes must be defined, with the same name as their
+corresponding dimension.
+ - The files of an experimental dataset must be distributed in either of the
+following two ways:
+   - File per starting date: each file must contain the starting date\*
+somewhere in its path.
+   - File per starting date per member: each file must contain the
+starting date\* and the member number\*\* somewhere in its path. 
+ - The files of an observational dataset must be distributed in either of the
+following three ways:
+   - File per month: each file must contain the year and month\*\*\* somewhere
+in its path.
+   - File per month per member: each file must contain the member
+number\*\* and the year and month\*\*\* somewhere in its path.
+   - Single file: the time axis must be properly defined to work with
+`cdo showmon` and `cdo showyear`.
+
+\*: starting dates in the format YYYYMMDD.
+\*\*: member number, from 0 to total_number_of_members - 1, with as many
+leading 0s as needed to reach the number of digits needed to represent
+total_number_of_members - 1.
+\*\*\*: year in the format YYYY and month in the format MM.
+
+Load() options
+--------------
+
+`Load()` has 27 parameters.
+14 of them are for selecting the data of interest, 8 of them are for adjusting
+the homogenization steps and the other 5 are for general adjustments of
+`Load()`. See them explained in 3 sections:
+ - [**Selection parameters**](#selection)
+ - [**Homogenization parameters**](#homogenization)
+ - [**General parameters**](#general)
+
+### Selection parameters
+The selection parameters are `var`, `exp`, `obs`, `sdates`, `nmember`,
+`nmemberobs`, `nleadtime` (deprecated), `leadtimemin`, `leadtimemax`,
+`sampleperiod`, `lonmin`, `lonmax`, `latmin` and `latmax`. You can see most of
+them in use in the [**Example**](example.md) except for `nmember`,
+`nmemberobs`, `nleadtime` and `sampleperiod`. The first part of that example
+is reproduced in more detail below in this section.
+
+`var` must be provided with the short name of a 2-dimensional or global mean
+variable of interest, for example `var = 'tas'` (most usually the provided
+name must be the same as the variable name inside the NetCDF data files).
+`exp` and `obs` contain the information on the location of the requested
+datasets, `sdates` and `leadtimemin`/`leadtimemax` tell `Load()` which
+starting dates and lead-times to pick, and so on.
+Check the [**user manual**](../s2dv-manual.pdf) or `?Load` for a
+thorough explanation.
+
+Now, as an example, let's consider we have two experimental datasets and one
+observational dataset, all with data for the variable 'tas' and with the
+files distributed as depicted in the following directory tree:
+```
+  /path/to/
+  |--experiments/
+  |   |--experimentA/
+  |   |   |--monthly_mean/
+  |   |       |--tas/
+  |   |           |--tas_19911101.nc
+  |   |           |--tas_19921101.nc
+  |   |           |        ·
+  |   |           |        ·
+  |   |           |        ·
+  |   |           |--tas_20001101.nc
+  |   |--experimentB/
+  |       |--monthly_mean/
+  |           |--tas/
+  |               |--tas_19911101.nc
+  |               |--tas_19921101.nc
+  |               |        ·
+  |               |        ·
+  |               |        ·
+  |               |--tas_20001101.nc
+  |--observations/
+      |--observationX/
+          |--monthly_mean/
+              |--tas/
+                  |--tas_199101.nc
+                  |--tas_199102.nc
+                  |--tas_199103.nc
+                  |--tas_199104.nc
+                  |--tas_199105.nc
+                  |--tas_199106.nc
+                  |--tas_199107.nc
+                  |--tas_199108.nc
+                  |--tas_199109.nc
+                  |--tas_199110.nc
+                  |--tas_199111.nc
+                  |--tas_199112.nc
+                  |--tas_199201.nc
+                  |--tas_199202.nc
+                  |        ·
+                  |        ·
+                  |        ·
+                  |--tas_199401.nc
+                  |--tas_199402.nc
+                  |--tas_199403.nc
+```
+We assume these files are NetCDF 3/4 compliant and have the variable 'tas'
+defined inside, fulfilling the guidelines explained above in the previous
+section.
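+
+Before building the information blocks it can be handy to double-check from R
+that the expected files are actually reachable (a plain base-R sketch over the
+hypothetical paths of the tree above):
+
+```r
+# List the starting-date files of experiment A (hypothetical paths).
+Sys.glob('/path/to/experiments/experimentA/monthly_mean/tas/tas_*.nc')
+```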
+
+We can now build the information blocks of each dataset as follows:
+```r
+library(s2dv)
+
+expA <- list(
+         name = 'experimentA',
+         path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                          '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc')
+       )
+expB <- list(
+         name = 'experimentB',
+         path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                          '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc')
+       )
+obsX <- list(
+         name = 'observationX',
+         path = file.path('/path/to/observations/$OBS_NAME$/monthly_mean',
+                          '$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc')
+       )
+```
+`Load()` will automatically replace the wildcards (keywords starting and
+ending with '$') with the corresponding starting dates, variable name, etc.
+See help on parameters `exp` and `obs` in `?Load` for a full explanation.
+
+Now it is time to issue the `Load()` call: the target variable name will be
+`'tas'` and the parameters `exp` and `obs` can be built from the previous
+information blocks. Apart from that, we just need to specify a set of starting
+dates for `Load()` to work:
+
+```r
+sdates <- paste0(1991:2000, '1101')
+data <- Load(var = 'tas', exp = list(expA, expB), obs = list(obsX),
+             sdates = sdates)
+```
+
+This will pick by default as many members and lead-times of each experimental
+dataset as found in the first one ('expA'), and as many members and lead-times
+of each observational dataset (only 1 in this example) as found in the first
+one ('obsX').
+
+All the data will be returned in a named list with many components, in a
+format similar to that used in the packages 'downscaleR' and 'loadR' (see
+`?Load` for a full description of the returned metadata).
+The two components that contain the experimental and observational data are
+`$mod` and `$obs`. These are two arrays with the dimensions in the following
+order:
+```r
+c(n. of datasets, n. of members, n. of starting dates, n. of lead-times, n. of latitudes, n. of longitudes)
+```
+The presence of the last two dimensions will depend on whether the requested
+variable is 2-dimensional or global mean and on the selected `output` type
+(see below or in `?Load`). See
+[**Normalizing fields**](statistics.md#normalizing)
+for the array structure for observational data.
+
+In the case of the example, if 'expA' has 5 members but 'expB' has 8, the
+resulting size of the members dimension will be 5 and the last 3 members of
+'expB' will be discarded. Conversely, if 'expB' has only 2 members, the 3
+remaining members will be filled up with NA values. Exactly the same happens
+with the lead-times.
+To avoid this default behaviour you can specify the expected maximum number of
+members in `nmember` and `nmemberobs` and the expected maximum number of
+lead-times in `leadtimemax`.
+
+Also, if not interested in the first few lead-times of the starting dates, you
+can discard them by specifying the first lead-time of interest with the
+parameter `leadtimemin`, and even pick one lead-time out of every
+`sampleperiod` lead-times starting from `leadtimemin`.
+
+The following code will pick the lead-times corresponding to a time period of
+one year, starting from December, since the first month of the starting dates
+is November and `leadtimemin` is 2:
+
+```r
+sdates <- paste0(1991:2000, '1101')
+data <- Load(var = 'tas', exp = list(expA, expB), obs = list(obsX),
+             sdates = sdates, leadtimemin = 2, leadtimemax = 13)
+```
+
+It is possible to load experimental data only or observational data only by
+providing `NULL` to `exp` or `obs`.
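+For instance, a minimal sketch of an
+experiments-only call (using the hypothetical dataset blocks above; no
+date-corresponding observations are searched for in this case):
+
+```r
+# Loading experimental data only: obs = NULL skips the observational datasets.
+sdates <- paste0(1991:2000, '1101')
+data_exp_only <- Load(var = 'tas', exp = list(expA, expB), obs = NULL,
+                      sdates = sdates)
+```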
+
+In the examples above, the loaded variable is 'tas', a 2-dimensional
+variable. `Load()`, however, computes by default an area-weighted average,
+hence the component `data$mod` will have dimensions
+`c(2, 5, 10, 12)` and the component `data$obs` will have dimensions
+`c(1, 1, 10, 12)`. Read below in **Homogenization parameters** on how to
+change the `output` type of `Load()`.
+
+If you wish to work with averages over a specific region of the Earth surface
+you can specify the limits of its bounding box via the parameters `latmin`,
+`latmax`, `lonmin` and `lonmax`. For example, for the North Pacific region:
+
+```r
+sdates <- paste0(1991:2000, '1101')
+data <- Load(var = 'tas', exp = list(expA, expB), obs = list(obsX),
+             sdates = sdates, leadtimemin = 2, leadtimemax = 13,
+             latmin = -10, latmax = 60, lonmin = 100, lonmax = 250)
+```
+
+### Homogenization parameters
+This group comprises all the parameters that affect any of the homogenization
+steps mentioned in the introduction of
+[**Data retrieval**](data_retrieval.md), namely: `output`, `method`, `grid`,
+`maskmod`, `maskobs`, `varmin`, `varmax` and `remapcells`.
+
+When loading 2-dimensional variables, `Load()` calculates and returns by
+default area-weighted averages of the values in the grid cells that fall
+inside the requested region. The area averages are computed from the original
+datasets in their original grids. `Load()`, however, prior to computing the
+averages, can remap all the data to a common grid if you specify it via the
+`grid` argument. The grid specification must be a character string that follows
+one of the patterns 'r<NX>x<NY>' or 't<TRUNC>grid', where '<NX>' and '<NY>' are
+the number of longitudes and latitudes of a rectangular grid and '<TRUNC>' is
+the factor of truncation of spherical harmonics of a gaussian grid.
+E.g.: 'r320x160' or 't73grid'.
+
+Internally, `Load()` will dispatch CDO calls to interpolate the original files.
+One of four methods of interpolation can be chosen through the parameter
+`method`: 'conservative', 'bilinear', 'bicubic' or 'distance-weighted'. The
+parameter `remapcells` is related to the precision of grid interpolations; you
+can read more in `?Load`.
+
+Additionally, any grid cells of your choice can be disabled before computing
+the averages by providing a mask for each dataset in `exp` and `obs`. These
+masks must be provided with the arguments `maskmod` and `maskobs` and can
+simply be a pointer to a NetCDF mask or an R array with the proper size.
+Read more on the arguments `maskmod` and `maskobs` in `?Load` to see the
+guidelines the mask files must follow or how to provide a mask as an R array.
+
+It is also possible to work directly with 2-dimensional data without averaging
+by setting the parameter `output = 'lonlat'`. With this option the parameters
+`grid`, `method`, `maskmod` and `maskobs` keep applying: you can request your
+data to be in a common rectangular or gaussian grid of your choice and disable
+some values with a mask.
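+
+As an illustration of the masking mechanism, here is a minimal sketch of a
+mask built as an R array. The grid size assumes the 'r320x160' grid used in
+the call below; the exact conventions (dimension order and the meaning of ones
+and zeros) are the ones detailed in `?Load`:
+
+```r
+# Hypothetical in-memory mask for an 'r320x160' grid: 320 longitudes x
+# 160 latitudes. Ones are assumed to mark grid cells to keep and zeros
+# grid cells to disable (check ?Load for the exact conventions).
+mask_array <- array(1, dim = c(320, 160))
+mask_array[, 1:10] <- 0  # e.g. disable the 10 southernmost latitude rows
+# It could then be passed to Load() as:
+#   maskmod = list(mask_array, mask_array), maskobs = list(mask_array)
+```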
+To summarize the homogenization parameters mentioned so far, we could dispatch
+the following `Load()` call:
+
+```r
+mask <- list(path = '/path/to/masks/land_mask_r320x160.nc')
+sdates <- paste0(1991:2000, '1101')
+data <- Load(var = 'tas', exp = list(expA, expB), obs = list(obsX),
+             sdates = sdates, leadtimemin = 2, leadtimemax = 13,
+             latmin = -10, latmax = 60, lonmin = 100, lonmax = 250,
+             output = 'lonlat', grid = 'r320x160', method = 'bicubic',
+             maskmod = list(mask, mask), maskobs = list(mask))
+```
+
+Finally, extreme values can be disabled by setting a lower and an upper
+threshold via the parameters `varmin` and `varmax`.
+
+### General parameters
+The general parameters are `storefreq`, `configfile`, `silent`, `nprocs` and
+`dimnames`.
+
+#### Loading daily data
+It is possible to load not only monthly data but also daily data, with exactly
+the same functionalities available in both modes. `storefreq` tells `Load()`
+whether the experimental data to load is stored in a monthly frequency or in a
+daily frequency. This way it will be able to work out the dates of the
+corresponding observational data and fetch the corresponding files. The only
+supported values for this parameter currently are 'monthly' (the default) and
+'daily'.
+Note that in the case that only experimental datasets are requested
+(via `exp`) any storage frequency is supported, since `Load()` doesn't
+have to make date-correspondences. It is the user's responsibility in these
+cases to ensure that the experimental datasets are stored in a common time
+frequency.
+
+#### Adjusting dimension names
+Regarding the guidelines that the files of a dataset must follow, the variable
+of interest inside the NetCDF files must be defined over the dimensions
+(time[, members][, latitudes, longitudes]). These dimensions can be named at
+will but, by default, the expected names are 'ensemble', 'latitude'
+and 'longitude' (the time dimension can take any name). If the name of any of
+the dimensions in all the datasets to load is different from this default, it
+can be adjusted with the parameter `dimnames` as in the following example:
+```r
+dimnames = list(member = 'members', lat = 'j', lon = 'i')
+```
+However, if only one of the datasets breaks the rules or each dataset has its
+own naming convention, you can adjust them individually when building the
+information blocks of each dataset as in the following example:
+```r
+expA <- list(
+         name = 'experimentA',
+         path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                          '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc'),
+         dimnames = list(member = 'members')
+       )
+expB <- list(
+         name = 'experimentB',
+         path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                          '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc'),
+         dimnames = list(lat = 'j', lon = 'i')
+       )
+obsX <- list(
+         name = 'observationX',
+         path = file.path('/path/to/observations/$OBS_NAME$/monthly_mean',
+                          '$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc'),
+         dimnames = list(member = 'ens', lat = 'y', lon = 'x')
+       )
+```
+
+#### Workflow of `Load()` and parallelism
+The workflow of `Load()` consists of 3 steps:
+ 1. Building up work pieces
+Each work piece is a packet of information that describes the location of a
+**single** data file requested in a `Load()` call (e.g. 'tas_19901101.nc') and
+the homogenization and arranging operations to be applied to that file.
+ 2. Setting up a cluster
+A multi-core cluster is set up. As of now `Load()` can only make the most of
+multiple cores in a single processor but not of multiple processors.
+The number of nodes in the multi-core cluster can be adjusted via `nprocs`
+(number of processes, as each node is a full new R process). Note that the
+number of nodes does not necessarily have to coincide with the number of cores:
+if `nprocs` is lower than the number of cores, probably some of the available
+cores will remain unused and if `nprocs` is higher, a single core will have
+more than one task assigned at a time. Depending on the environment this may
+be convenient.
+Three aspects to take into account when selecting the appropriate `nprocs` are:
+  - Whether hyperthreading is activated in the system
+  - The fact that each node will dispatch parallel blocking calls to CDO
+during its execution
+  - A 'bigmemory' shared memory region will be set up for the multiple cores
+to arrange the resulting data of each work piece.
+Setting `nprocs = 1` will avoid creating a cluster and all the workload will
+be carried out in a single process (that may dispatch parallel blocking CDO
+calls).
+ 3. Dispatching the work pieces to the cluster
+Once the work pieces and the cluster are ready, `Load()` dispatches the work
+pieces with load balancing to the nodes of the cluster. Each node will keep
+applying the internal function `LoadDataFile()` to each work piece.
+
+#### Running `Load()` silently
+Throughout the process `Load()` keeps informing about the status and any
+detected potential issues. When run in batch mode it may be in your interest
+to turn the messages off by setting `silent = TRUE`.
+
+#### Using a configuration file
+When using `Load()` frequently, one may end up repeating many times the code
+to build the same information blocks to load certain specific datasets.
+Besides, when using `Load()` in a team, one of its members may be interested
+in using datasets that other members usually work with, or may want to make
+their own datasets accessible to the colleagues.
+
+To help in these two situations `Load()` has a built-in mechanism to read
+**configuration files** that contain, in a tabular and compact format, the
+information blocks of the datasets of interest. This way, by defining the
+information of a dataset only once in a configuration file, `Load()` will be
+able to read its information as needed. Also, the members of a team can share
+a configuration file to put the information of their datasets at others'
+disposal.
+
+A configuration file can be created by hand, following the specifications in
+`?ConfigFileOpen`, or can be created and edited through a set of functions
+included in s2dv: `ConfigFileCreate()`, `ConfigFileOpen()`,
+`ConfigAddEntry()`, `ConfigEditEntry()`, `ConfigRemoveEntry()`,
+`ConfigEditDefinition()`, `ConfigRemoveDefinition()`, `ConfigFileSave()`,
+`ConfigShowTable()`, `ConfigShowDefinitions()`, `ConfigShowSimilarEntries()`
+and `ConfigApplyMatchingEntries()`.
+
+The first option can be faster for experienced users but the second option is
+safer and useful to automatise the creation of configuration files. The
+second option will be used in the following examples but the raw file will
+also be displayed to show the internals of the configuration files and to
+give some hints on how to create them manually.
+
+A blank new configuration file can be created as follows:
+```r
+conf_path <- '/path/to/configfile.conf'
+ConfigFileCreate(conf_path)
+```
+This will generate a file at the specified path with all the boilerplate:
+```
+# s2dv configuration file
+#
+# Check ?ConfigFileOpen after loading s2dv for detailed
+# documentation on this configuration file.
+
+#############
+!!definitions
+#############
+DEFAULT_EXP_MAIN_PATH = $EXP_NAME$
+DEFAULT_EXP_FILE_PATH = $STORE_FREQ$/$VAR_NAME$_$START_DATE$.nc
+DEFAULT_NC_VAR_NAME = $VAR_NAME$
+DEFAULT_SUFFIX =
+DEFAULT_VAR_MIN =
+DEFAULT_VAR_MAX =
+DEFAULT_OBS_MAIN_PATH = $OBS_NAME$
+DEFAULT_OBS_FILE_PATH = $STORE_FREQ$/$VAR_NAME$_$YEAR$$MONTH$.nc
+DEFAULT_DIM_NAME_LONGITUDES = longitude
+DEFAULT_DIM_NAME_LATITUDES = latitude
+DEFAULT_DIM_NAME_MEMBERS = ensemble
+
+
+######################
+!!table of experiments
+######################
+#exp_name, var_name[, exp_main_path[, exp_file_path[, nc_var_name[, suffix[, var_min[, var_max]]]]]]
+
+#######################
+!!table of observations
+#######################
+#obs_name, var_name[, obs_main_path[, obs_file_path[, nc_var_name[, suffix[, var_min[, var_max]]]]]]
+```
+The lines starting with '#' are comments and the ones starting with '!' are
+key lines that can't be moved or removed.
+
+As just shown, the configuration file consists of a list of definitions and of
+two tables, one to keep information on experimental datasets and the other
+for observational datasets.
+
+The rows or entries of the tables will each be filled with information
+relative to a dataset. An entry will associate a pair of dataset name and
+variable name to a set of information pieces.
+Each entry keeps 8 columns, the pair of keys and 6 information pieces
+associated to a dataset: the dataset short name or identifier, the variable
+short name, the main path or path to the dataset main folder, the file path or
+path to the NetCDF files of the dataset, the actual variable name inside the
+files, a suffix (read further) and a lower and an upper threshold to disable
+values of the dataset beyond them.
+The list of definitions contains, initially, the default values for the columns
+of the tables. It can be extended with other definitions that can be accessed
+from throughout the configuration file, which is useful to avoid repeating
+certain paths, for example, in multiple entries. A definition follows the
+format `VARIABLE = value` and its value can be invoked with `$VARIABLE$`.
+Apart from these explicit definitions, there are some implicit key definitions:
+ - `$EXP_NAME$` to retrieve, from an experimental dataset entry, the
+identifier name of an experimental dataset, as requested to `Load()`.
+ - `$OBS_NAME$`, similar to `$EXP_NAME$` but for observational dataset
+entries.
+ - `$VAR_NAME$`, the short name of the variable, as requested to `Load()`.
+ - `$START_DATE$`, one of the starting dates, as requested to `Load()`.
+ - `$YEAR$`, `$MONTH$`, `$DAY$`, taken from the `$START_DATE$`.
+ - `$MEMBER_NUMBER$`, the number of the member in the loading process, from
+0 to total_number_of_members - 1 and with padding zeros.
+ - `$STORE_FREQ$`, the frequency of the experimental data on disk, as
+specified to `Load()` via `storefreq`.
+
+The next step is to add the entries of the datasets of interest.
+It can be
+achieved with a combination of the functions `ConfigFileOpen()`,
+`ConfigAddEntry()` and `ConfigFileSave()`:
+```r
+c <- ConfigFileOpen(conf_path)
+c <- ConfigAddEntry(c, 'experiments', dataset_name = 'expA', var_name = 'tas',
+                    main_path = '/path/to/experiments/$EXP_NAME$/',
+                    file_path = 'monthly_mean/$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc')
+c <- ConfigAddEntry(c, 'experiments', dataset_name = 'expB', var_name = 'tas',
+                    main_path = '/path/to/experiments/$EXP_NAME$/',
+                    file_path = 'monthly_mean/$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc')
+c <- ConfigAddEntry(c, 'observations', dataset_name = 'obsX', var_name = 'tas',
+                    main_path = '/path/to/observations/$OBS_NAME$/',
+                    file_path = 'monthly_mean/$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc')
+ConfigFileSave(c, conf_path)
+```
+
+The tables in the file, at this point, are shown next:
+```
+######################
+!!table of experiments
+######################
+#exp_name, var_name[, exp_main_path[, exp_file_path[, nc_var_name[, suffix[, var_min[, var_max]]]]]]
+expA, tas, /path/to/experiments/$EXP_NAME$/, monthly_mean/$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc, *, *, *, *
+expB, tas, /path/to/experiments/$EXP_NAME$/, monthly_mean/$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc, *, *, *, *
+
+#######################
+!!table of observations
+#######################
+#obs_name, var_name[, obs_main_path[, obs_file_path[, nc_var_name[, suffix[, var_min[, var_max]]]]]]
+obsX, tas, /path/to/observations/$OBS_NAME$/, monthly_mean/$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc, *, *, *, *
+```
+
+An asterisk ('*') means that the cell will take the default value defined in
+the defaults for the corresponding column. So the column 'nc_var_name' takes
+'$VAR_NAME$', which is then replaced by 'tas' in this case. The other default
+values are all empty.
+
+Once the file is ready it can be used by `Load()` by specifying its path via
+the parameter `configfile`. Then, `exp` and `obs` can be used in their simple
+format: a vector of character strings that give the names of the datasets of
+interest. For example:
+```r
+sdates <- paste0(1991:2000, '1101')
+data <- Load(var = 'tas', exp = c('expA', 'expB'), obs = c('obsX'),
+             sdates = sdates, configfile = '/path/to/configfile.conf')
+```
+
+From this point on, one could create files with entries for each pair of
+(dataset name, variable name) of interest.
+
+However, the first two columns of the tables in a configuration file can also
+take regular expressions to match more than one dataset name or variable name.
+This way, a more compact and generalized table can be obtained:
+```r
+c <- ConfigFileOpen(conf_path)
+c <- ConfigAddEntry(c, 'experiments', dataset_name = '.*', var_name = '.*',
+                    main_path = '/path/to/experiments/$EXP_NAME$/',
+                    file_path = 'monthly_mean/$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc')
+c <- ConfigAddEntry(c, 'observations', dataset_name = '.*', var_name = '.*',
+                    main_path = '/path/to/observations/$OBS_NAME$/',
+                    file_path = 'monthly_mean/$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc')
+ConfigFileSave(c, conf_path)
+```
+
+The corresponding tables are:
+```
+######################
+!!table of experiments
+######################
+#exp_name, var_name[, exp_main_path[, exp_file_path[, nc_var_name[, suffix[, var_min[, var_max]]]]]]
+.*, .*, /path/to/experiments/$EXP_NAME$/, monthly_mean/$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc, *, *, *, *
+
+#######################
+!!table of observations
+#######################
+#obs_name, var_name[, obs_main_path[, obs_file_path[, nc_var_name[, suffix[, var_min[, var_max]]]]]]
+.*, .*, /path/to/observations/$OBS_NAME$/, monthly_mean/$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc, *, *, *, *
+```
+
+The regular expression '.*' matches any sequence of characters, so these
+entries will assign to all the (experiment name, variable name) pairs a path
+with the pattern
+`/path/to/experiments/$EXP_NAME$/monthly_mean/$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc`
+or, to all the (observation name, variable name) pairs a path with the pattern
+`/path/to/observations/$OBS_NAME$/monthly_mean/$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc`.
+In other words, with only two entries in the configuration file you will be
+able to load many variables from many datasets, as long as they follow a
+common convention.
+
+If more than one entry matches a pair (dataset name, variable name), all of
+them will be applied, from more general to more specific, to obtain a value
+for each of the 6 information pieces that, in the end, will tell the location
+and information of the requested variable for the requested dataset.
+
+The configuration file, all in all, is a complex but powerful mechanism.
+Read `?ConfigFileOpen` for a complete explanation of the details of the tables
+in the configuration files and how multiple matches are applied to build up the
+final information. Also see there why the 'suffix' column can be useful.
+
+#### Re-arranging long runs
+Sometimes it can happen that one needs to compare data from two experimental
+datasets, one stored in multiple files (one per starting date) and the other
+stored in a single long run (one file with one starting date).
+
+In this case running one single `Load()` call won't be enough since the
+`sdates` argument has to take different values depending on the distribution
+of the starting dates of the dataset to load. Besides, running two separate
+`Load()` calls with different starting dates yields two non-comparable data
+sets.
+
+To solve this, it is possible to re-arrange the long run into multiple starting
+dates with `Histo2Hindcast()` by just specifying the original starting date,
+the series of target starting dates and the number of target lead-times:
+
+Loading the multiple run experiment:
+```r
+expA <- list(
+         name = 'experimentA',
+         path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                          '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc')
+       )
+sdatesA <- paste0(1991:2000, '1101')
+leadtimesA <- 12
+data <- Load(var = 'tas', exp = list(expA), obs = NULL, sdates = sdatesA,
+             leadtimemax = leadtimesA)
+```
+
+Loading the single run experiment:
+```r
+expC <- list(
+         name = 'experimentC',
+         path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                          '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc')
+       )
+sdatesC <- '19501101'
+leadtimemin <- (1991 - 1950) * 12 + 1
+leadtimemax <- leadtimemin + (2000 - 1991 + 1) * 12 - 1
+data <- Load(var = 'tas', exp = list(expC), obs = NULL, sdates = sdatesC,
+             leadtimemin = leadtimemin, leadtimemax = leadtimemax)
+data <- Histo2Hindcast(data, sdatesA[1], sdatesA, leadtimesA)
+```
diff --git a/vignettes/ex_ano_expA_obsX.png b/vignettes/ex_ano_expA_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..cbb43565f8d978f0b0957822422226f306005a74
Binary files /dev/null and b/vignettes/ex_ano_expA_obsX.png differ
diff --git a/vignettes/ex_ano_expB_obsX.png b/vignettes/ex_ano_expB_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..1726f6708be37f53a759bbaf3f0bb848212d2b29
Binary files /dev/null and b/vignettes/ex_ano_expB_obsX.png differ
diff --git a/vignettes/ex_clim_expA_expB_obsX.png b/vignettes/ex_clim_expA_expB_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..2cf4157d566399f02ba5c7cfbd76dc6cd34f2768
Binary files /dev/null and b/vignettes/ex_clim_expA_expB_obsX.png differ
diff --git a/vignettes/ex_corr_expA_expB_obsX.png b/vignettes/ex_corr_expA_expB_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..c5f8ce9c848acc3478cd1dcf83c9bea791837180
Binary files /dev/null and b/vignettes/ex_corr_expA_expB_obsX.png differ
diff --git a/vignettes/ex_raw_expA_obsX.png b/vignettes/ex_raw_expA_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..801abb0741d730960143b2b85bbb0483838c9aa0
Binary files /dev/null and b/vignettes/ex_raw_expA_obsX.png differ
diff --git a/vignettes/ex_raw_expB_obsX.png b/vignettes/ex_raw_expB_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..d8fe409d7e21f5b5d9b9d9398fc795e7b91abe65
Binary files /dev/null and b/vignettes/ex_raw_expB_obsX.png differ
diff --git a/vignettes/example.md b/vignettes/example.md
new file mode 100644
index 0000000000000000000000000000000000000000..9ac470da402a64a77690a8dccf52079fe3451e9d
--- /dev/null
+++ b/vignettes/example.md
@@ -0,0 +1,263 @@
+---
+author: "Nicolau Manubens"
+reviser: "An-Chi Ho"
+date: "`r Sys.Date()`"
+output: html_document
+vignette: >
+  %\VignetteEngine{R.rsp::md}
+  %\VignetteIndexEntry{Example}
+  %\usepackage[utf8]{inputenc}
+---
+Example
+=======
+
+Next you can see an example of usage of s2dv spanning its
+four modules ([**Data retrieval**](data_retrieval.md),
+[**Statistics**](statistics.md), [**Verification**](verification.md)
+and [**Visualisation**](visualisation.md)).
+
+The goal of the example is to load two sample forecasts of the near-surface
+air temperature over the North Pacific region and assess their skill against
+a reference reanalysis.
+The data used in this example has already gone through a first post-processing
+stage in which the monthly means have been computed.
+
+Loading data
+------------
+
+First the package is loaded and attached.
+
+The goal is to load data for two experimental datasets and one observational
+dataset to be used as reference. The distribution in the file system and
+details of these datasets are explained in
+[**Data retrieval**](data_retrieval.md).
+
+A list is built with information on the location of each dataset to load.
+A path pattern is specified for each dataset, in this case, with the wildcards
+'$VAR_NAME$', '$START_DATE$', '$YEAR$', '$MONTH$', '$EXP_NAME$' and
+'$OBS_NAME$'. These wildcards will be automatically replaced by `Load()` with
+the corresponding values (the requested variable name, the requested starting
+dates, years or months, the dataset names, ...). See the available wildcards in `?Load`.
+
+```r
+library(s2dv)
+
+expA <- list(name = 'experimentA',
+             path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                              '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc'))
+expB <- list(name = 'experimentB',
+             path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                              '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc'))
+obsX <- list(name = 'observationX',
+             path = file.path('/path/to/observations/$OBS_NAME$/monthly_mean',
+                              '$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc'))
+```
+
+Then the data is loaded with `Load()`, providing the previously built lists
+(to specify the desired datasets) and other parameters to select the Earth
+surface region of interest, starting dates and forecast time steps to load
+data from.
+In this example, the requested format is 2-dimensional: all the loaded data
+will be remapped onto the specified common 'grid' via CDO libraries.
+
+```r
+sdates <- paste0(1991:2000, '1101')
+data <- Load('tas', list(expA, expB), list(obsX),
+             sdates = sdates,
+             leadtimemin = 2, leadtimemax = 13,
+             latmin = -10, latmax = 60,
+             lonmin = 100, lonmax = 250,
+             output = 'lonlat', grid = 't106grid',
+             method = 'distance-weighted')
+## * The load call you issued is:
+## *   Load(var = "tas", exp = list(structure(list(name = "experimentA", path =
+## *   "/path/to/experiments/$EXP_NAME$/monthly_mean/$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc"),
+## *   .Names = c("name", "path")), structure(list(name =
+## *   "experimentB", path =
+## *   "/path/to/experiments/$EXP_NAME$/monthly_mean/$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc"),
+## *   .Names = c("name", "path"))), obs = list(structure(list(name =
+## *   "observationX", path =
+## *   "/path/to/observations/$OBS_NAME$/monthly_mean/$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc"),
+## *   .Names = c("name", "path"))), sdates = c("19911101",
+## *   "19921101", ..., "20001101"), grid = "t106grid", output =
+## *   "lonlat", storefreq = "monthly", ...)
+## * See the full call in '$load_parameters' after Load() finishes.
+## * Fetching first experimental files to work out 'var_exp' size...
+## * Exploring dimensions... /path/to/experiments/experimentA/monthly_mean/tas/tas_19911101.nc
+## * Success. Detected dimensions of experimental data: 2, 5, 10, 12, 63, 134
+## * Fetching first observational files to work out 'var_obs' size...
+## * Exploring dimensions... /path/to/observations/observationX/monthly_mean/tas/tas_199112.nc
+## * Success. Detected dimensions of observational data: 1, 1, 10, 12, 63, 134
+## * Will now proceed to read and process 140 data files:
+## *   The list of files is long. You can check it after Load() finishes in the output '$source_files'.
+## * Total size of requested data: 89147520 bytes.
+## *   - Experimental data: ( 2 x 5 x 10 x 12 x 63 x 134 ) x 8 bytes = 81043200 bytes.
+## *   - Observational data: ( 1 x 1 x 10 x 12 x 63 x 134 ) x 8 bytes = 8104320 bytes.
+## * If size of requested data is close to or above the free shared RAM memory, R will crash.
+## starting worker pid=3674 on localhost:11060 at 15:11:26.226
+## starting worker pid=3690 on localhost:11060 at 15:11:26.419
+## starting worker pid=3706 on localhost:11060 at 15:11:26.614
+## starting worker pid=3722 on localhost:11060 at 15:11:26.805
+## starting worker pid=3739 on localhost:11060 at 15:11:26.997
+## starting worker pid=3755 on localhost:11060 at 15:11:27.190
+## starting worker pid=3771 on localhost:11060 at 15:11:27.380
+## starting worker pid=3787 on localhost:11060 at 15:11:27.572
+## * Loading... This may take several minutes...
+## * Progress: 0% + 10% + 10% + 10% + 10% + 10% + 10% + 10% + 10% + 10% + 10%
+```
+
+The output consists of two arrays of data (experimental and observational
+data) with labelled dimensions, a list of loaded files, a list of not found
+files and a call stamp that allows reproducing the call exactly, among others.
+See [**Data
+retrieval**](data_retrieval.md)
+for a full explanation of the capabilities and outputs of `Load()`.
+
+Keep in mind that the two arrays obtained from `Load()` will have the
+following dimensions in the following order:
+```r
+  c(n. of datasets, n. of members, n. of starting dates, n. of lead-times, n. of latitudes, n. of longitudes)
+```
+
+Checking and summarizing data
+-----------------------------
+
+Next, the data of the first member, starting date and lead-time of each dataset
+can be visualised on a cylindrical equidistant projection with `PlotEquiMap()`
+to check that the loaded data is as expected.
+```r
+PlotEquiMap(data$mod[1, 1, 1, 1, , ], data$lon, data$lat)
+PlotEquiMap(data$mod[2, 1, 1, 1, , ], data$lon, data$lat)
+PlotEquiMap(data$obs[1, 1, 1, 1, , ], data$lon, data$lat)
+```
+
+See the full code used to obtain this figure in
+[**Snippet 1**](snippets.md#snippet1).
+
+The values for some starting dates at a single grid point of an experimental
+dataset can be plotted together with the observations with the function
+`PlotAno()`:
+```r
+mod <- data$mod[, , , , 30, 60, drop = FALSE]
+dim(mod) <- dim(mod)[1:4]
+obs <- data$obs[, , , , 30, 60, drop = FALSE]
+dim(obs) <- dim(obs)[1:4]
+PlotAno(mod, obs, gsub('1101', '1201', sdates),
+        toptitle = paste0("Experiment ", c("A", "B"),
+                          ": raw 'tas' at 166.5ºE, 27.47ºN."),
+        ytitle = "K",
+        fileout = c('ex_raw_expA_obsX.eps', 'ex_raw_expB_obsX.eps'))
+```
+
+Each coloured region represents data corresponding to a single starting date.
+The bold line represents the mean value and the thin lines represent the values
+of each ensemble member. The upper and lower limits of each region are
+defined, respectively, by the maximum and minimum values among the members.
+The black line represents the reference reanalysis values.
+
+We can see in the plot how 'tas' reaches a minimum in February/March.
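+
+This can also be checked numerically (a quick base-R sketch on the `obs`
+subset built above; the month labels assume the December start used in the
+plot):
+
+```r
+# Average the observations over the dataset, member and starting date
+# dimensions, keeping the 12 lead-times (December to November).
+obs_ltime_mean <- apply(obs, 4, mean, na.rm = TRUE)
+months <- format(seq(as.Date('1991-12-01'), by = 'month',
+                     length.out = 12), '%B')
+months[which.min(obs_ltime_mean)]  # expected to be February or March
+```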
+
+We can work out the exact coordinates of the selected grid cell as follows:
+```r
+data$lon[60]
+## 166.5
+data$lat[30]
+## 27.47649
+```
+
+At this point, additional numerical checks could be carried out to ensure the
+data retrieval stage has been successful.
+
+Bias correction
+---------------
+
+The next common step is to perform statistics on the raw data such as
+computing climatologies and subtracting them from the raw data to obtain
+anomalies that account for the inherent drift of the forecasts.
+This is straightforward with the functions `Clim()` and `Ano()` (or
+`Ano_CrossValid()`, see [**Statistics**](statistics.md)). Before that,
+however, it is usual to average the data over the region of interest instead
+of focusing on a single grid cell. This can be done by directly requesting the
+data from `Load()` with the 'areave' output type, which will apply area
+weights and calculate the total average:
+```r
+data <- Load('tas', list(expA, expB), list(obsX),
+             sdates = sdates,
+             leadtimemin = 2, leadtimemax = 13,
+             latmin = -10, latmax = 60,
+             lonmin = 100, lonmax = 250,
+             output = 'areave', grid = 't106grid',
+             method = 'distance-weighted')
+```
+
+Then we calculate and plot the per-pair climatologies with `Clim()` and
+`PlotClim()`:
+```r
+clim <- Clim(data$mod, data$obs)
+PlotClim(clim$clim_exp, clim$clim_obs, monini = 12,
+         toptitle = "Per-pair climatologies of 'tas' over North Pacific region.",
+         listexp = c('Experiment A', 'Experiment B'),
+         listobs = c('Observation X'),
+         ytitle = "K", fileout = 'ex_clim_expA_expB_obsX.eps')
+```
+
+Each line in this plot represents the climatology of each member of the
+corresponding dataset. A single climatology of the ensemble mean could be
+obtained by providing the parameter `memb = FALSE` to `Clim()`, if the
+distinction of members is irrelevant.
+
+We can obtain the anomalies with `Ano()`, which simply subtracts the
+climatologies from the corresponding starting dates and members of the raw
+data. These can be plotted with `PlotAno()`:
+```r
+ano_mod <- Ano(data$mod, clim$clim_exp)
+ano_obs <- Ano(data$obs, clim$clim_obs)
+PlotAno(ano_mod, ano_obs, gsub('1101', '1201', sdates),
+        toptitle = paste0("Experiment ", c("A", "B"),
+                          ": drift-corrected 'tas' anomalies over North Pacific region."),
+        ytitle = 'K', linezero = TRUE,
+        fileout = c('ex_ano_expA_obsX.eps', 'ex_ano_expB_obsX.eps'))
+```
+
+To fulfill the bias correction we would need to add the observed climatologies
+to these anomalies. The working units of the package, however, are the
+anomalies, so this step is not needed.
+
+Other statistics such as trends, detrended anomalies, frequency filtering,
+EOF/PCA and empirical model generation are implemented in the package.
+Check [**Statistics**](statistics.md) for a thorough explanation.
+Also see [**Visualisation**](visualisation.md) for details on visualisation
+tools.
+
+Assessing the quality
+---------------------
+
+As a measure of the skill of the two forecasts we will compute their
+correlation against the reanalysis.
+
+First we compute the ensemble mean, then calculate the Pearson correlation
+with `Corr()` and plot the results with `PlotVsLTime()`, which allows plotting
+any other computed variable with confidence intervals and significance values.
+```r
+corr <- Corr(Mean1Dim(ano_mod, 2), Mean1Dim(ano_obs, 2))
+PlotVsLTime(corr, toptitle = "Correlations with Observation X over North Pacific region.",
+            ytitle = 'corr', monini = 12,
+            listexp = c('Exp A', 'Exp B'),
+            fileout = 'ex_corr_expA_expB_obsX.eps')
+```
+
+See [**Verification**](verification.md) for a detailed explanation of the
+available deterministic and probabilistic scores or
+[**Plotting time series**](visualisation.md#time_series) for a quick reference
+on how each score can be plotted.
diff --git a/vignettes/s2dv_modules.png b/vignettes/s2dv_modules.png
new file mode 100644
index 0000000000000000000000000000000000000000..35465c8e0985fe4f08cf7dd0c578de0bd3e8fe04
Binary files /dev/null and b/vignettes/s2dv_modules.png differ
diff --git a/vignettes/snip1_equi_map_raw_all.png b/vignettes/snip1_equi_map_raw_all.png
new file mode 100644
index 0000000000000000000000000000000000000000..06e3c5ae4dcf10d37f080e80e5f5922fea06ea5a
Binary files /dev/null and b/vignettes/snip1_equi_map_raw_all.png differ
diff --git a/vignettes/snip2_anim_corr_expA_obsX.gif b/vignettes/snip2_anim_corr_expA_obsX.gif
new file mode 100644
index 0000000000000000000000000000000000000000..cdf646c43fe1bd86b0a2d58fdeddeddbdb2d4ef2
Binary files /dev/null and b/vignettes/snip2_anim_corr_expA_obsX.gif differ
diff --git a/vignettes/snip2_anim_corr_expB_obsX.gif b/vignettes/snip2_anim_corr_expB_obsX.gif
new file mode 100644
index 0000000000000000000000000000000000000000..40f6d1c2203edd2d19be336c16472d8205e6ad36
Binary files /dev/null and b/vignettes/snip2_anim_corr_expB_obsX.gif differ
diff --git a/vignettes/snip2_equimap_corr_raw_expA_obsX.png b/vignettes/snip2_equimap_corr_raw_expA_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..edfbb9ee61d024caa3995a49a953ae2c4f9d2b86
Binary files /dev/null and b/vignettes/snip2_equimap_corr_raw_expA_obsX.png differ
diff --git a/vignettes/snip2_equimap_corr_raw_expB_obsX.png b/vignettes/snip2_equimap_corr_raw_expB_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..ecfeb0a5b3f58d820cda25359125aa1725576473
Binary files /dev/null and b/vignettes/snip2_equimap_corr_raw_expB_obsX.png differ
diff --git a/vignettes/snippets.md b/vignettes/snippets.md
new file mode 100644
index 0000000000000000000000000000000000000000..40929c2bd62acc0aa0bbb2564abb4d4e040c50d0
--- /dev/null
+++ b/vignettes/snippets.md
@@ -0,0 +1,117 @@
+---
+author: "Nicolau Manubens"
+date: "`r Sys.Date()`"
+output: html_document
+vignette: >
+  %\VignetteEngine{R.rsp::md}
+  %\VignetteIndexEntry{Snippets}
+  %\usepackage[utf8]{inputenc}
+---
+Snippets
+========
+This page contains extended snippets of code used in some sections of the wiki
+page to obtain figures or results.
+
+The example data used is the same as that used in [**Example**](example.md) and
+throughout the vignettes referenced in [**Overview**](../README.md):
+```r
+library(s2dv)
+
+expA <- list(name = 'experimentA',
+             path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                              '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc'))
+expB <- list(name = 'experimentB',
+             path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                              '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc'))
+obsX <- list(name = 'observationX',
+             path = file.path('/path/to/observations/$OBS_NAME$/monthly_mean',
+                              '$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc'))
+```
+
+Snippet 1
+---------
+Plots 3 equidistant projection maps on a single figure with a single color bar.
+See in [context](example.md#snippet1).
+```r +sdates <- paste0(1991:2000, '1101') +data <- Load('tas', list(expA, expB), list(obsX), + sdates = sdates, + leadtimemin = 2, leadtimemax = 13, + latmin = -10, latmax = 60, + lonmin = 100, lonmax = 250, + output = 'lonlat', grid = 't106grid', + method = 'distance-weighted') +brks <- seq(min(data$mod, data$obs, na.rm = TRUE), + max(data$mod, data$obs, na.rm = TRUE), + length.out = 21) +cols <- clim.colors(20) +png('snip1_equi_map_raw_all.png', width = 800, height = 300, type = "Xlib") +layout(matrix(c(1, 1, 2, 2, 3, 3, + 0, 4, 4, 4, 4, 0), + 2, 6, byrow = TRUE), + heights = c(5, 1)) +PlotEquiMap(data$mod[1,1,1,1,,], data$lon, data$lat, brks = brks, cols = cols, drawleg = FALSE, toptitle = 'Experiment A: tas (K)', sizetit = 0.7) +PlotEquiMap(data$mod[2,1,1,1,,], data$lon, data$lat, brks = brks, cols = cols, drawleg = FALSE, toptitle = 'Experiment B: tas (K)', sizetit = 0.7) +PlotEquiMap(data$obs[1,1,1,1,,], data$lon, data$lat, brks = brks, cols = cols, drawleg = FALSE, toptitle = 'Observation X: tas (K)', sizetit = 0.7) +ColorBar(brks, cols, vert = FALSE, subsampleg = 5) +dev.off() +``` + + +Snippet 2 +--------- +Plots time correlations (along actual time) of Decembers of Experiment A and +Experiment B at each grid point over the Atlantic. The black dots mean the +correlation at that point exceeds the 95% significance level. See in +[context](visualisation.md#snippet2). +```r +map_data <- Load('tas', list(expA, expB), list(obsX), + sdates = sdates, + leadtimemin = 2, leadtimemax = 13, + latmin = -30, latmax = 60, + lonmin = -90, lonmax = 20, + output = 'lonlat', grid = 't106grid', + method = 'distance-weighted') +ano <- Ano_CrossValid(map_data$mod, map_data$obs) +corr <- Corr(Mean1Dim(ano$ano_exp, 2), Mean1Dim(ano$ano_obs, 2)) + +cols <- clim.colors(50) +corr_min <- min(corr, na.rm = TRUE) +corr_max <- max(corr, na.rm = TRUE) +corr_brks <- round(seq(corr_min, corr_max, length.out = length(cols) + 1), 2) + +png('snip2_equimap_corr_raw_expA_obsX.png', width = 800, height = 600) +PlotEquiMap(corr[1, 1, 2, 1, , ], map_data$lon, map_data$lat, + toptitle = "Exp. A: 'tas' correlation along Decembers with Obs. X.", + units = "K", brks = corr_brks, cols = cols, subsampleg = 5, + dots = t(corr[1, 1, 2, 1, , ] > corr[1, 1, 4, 1, , ])) +dev.off() + +png('snip2_equimap_corr_raw_expB_obsX.png', width = 800, height = 600) +PlotEquiMap(corr[2, 1, 2, 1, , ], map_data$lon, map_data$lat, + toptitle = "Exp. B: 'tas' correlation along Decembers with Obs. X.", + units = "K", brks = corr_brks, cols = cols, subsampleg = 5, + dots = t(corr[2, 1, 2, 1, , ] > corr[2, 1, 4, 1, , ])) +dev.off() +``` + + + +And generates the animations of the actual time correlations of Experiment A +and B against Observation X over the Atlantic, with black dots on values that +reach a 95% significance level: + +```r +AnimVsLTime(corr, map_data$lon, map_data$lat, monini = 12, + msk95lev = TRUE, + toptitle = c("Exp. A: actual time correlation with Obs. X.", + "Exp. B: actual time correlation with Obs. 
X."), + units = "K", + brk = corr_brks, col = cols, subsampleg = 5, + fileout = c("snip2_anim_corr_expA_obsX", + "snip2_anim_corr_expB_obsX")) +``` + + + + diff --git a/vignettes/stat_ano_expA_Y_obsX.png b/vignettes/stat_ano_expA_Y_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..2450ad7fb1e3da92db01d423f3fa266d6c0405c3 Binary files /dev/null and b/vignettes/stat_ano_expA_Y_obsX.png differ diff --git a/vignettes/stat_ano_expA_obsX.png b/vignettes/stat_ano_expA_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..3d37c05269ab1b0311e308a2cdd5858ff7a9f88a Binary files /dev/null and b/vignettes/stat_ano_expA_obsX.png differ diff --git a/vignettes/stat_ano_expB_obsX.png b/vignettes/stat_ano_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..de4abb19af655e29c0c1b3189abe90f620fdccb0 Binary files /dev/null and b/vignettes/stat_ano_expB_obsX.png differ diff --git a/vignettes/stat_clim_expA_expB_obsX.png b/vignettes/stat_clim_expA_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..b7a90d8b4495085719904b30982fa5deae95397d Binary files /dev/null and b/vignettes/stat_clim_expA_expB_obsX.png differ diff --git a/vignettes/stat_detr_ano_expA_obsX.png b/vignettes/stat_detr_ano_expA_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..576aaee5d8f9e853786346f9a99d154b18f26335 Binary files /dev/null and b/vignettes/stat_detr_ano_expA_obsX.png differ diff --git a/vignettes/stat_filter_ano_expA.png b/vignettes/stat_filter_ano_expA.png new file mode 100644 index 0000000000000000000000000000000000000000..21e57909725fe599e7a3974c47cd771cee0099e9 Binary files /dev/null and b/vignettes/stat_filter_ano_expA.png differ diff --git a/vignettes/stat_raw_expA_obsX.png b/vignettes/stat_raw_expA_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..6cd4ac41397daf51a628a6457054d147452d0730 Binary files /dev/null and b/vignettes/stat_raw_expA_obsX.png differ diff --git a/vignettes/stat_raw_expB_obsX.png b/vignettes/stat_raw_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..9a7914496c73debd8506f62ce5630c7eb43c997f Binary files /dev/null and b/vignettes/stat_raw_expB_obsX.png differ diff --git a/vignettes/stat_season_mam_expA.png b/vignettes/stat_season_mam_expA.png new file mode 100644 index 0000000000000000000000000000000000000000..af20a8851a81fd17deaba125d852d05c15868035 Binary files /dev/null and b/vignettes/stat_season_mam_expA.png differ diff --git a/vignettes/stat_season_mam_obsX.png b/vignettes/stat_season_mam_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..9a7acc9cd09d6da05953d9e2c0efd2d5cd437373 Binary files /dev/null and b/vignettes/stat_season_mam_obsX.png differ diff --git a/vignettes/stat_smooth_ano_expA_obsX.png b/vignettes/stat_smooth_ano_expA_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..86f150f5c7f9b77c397dd33c6292b955537abe1f Binary files /dev/null and b/vignettes/stat_smooth_ano_expA_obsX.png differ diff --git a/vignettes/stat_toy_forecast_ano.png b/vignettes/stat_toy_forecast_ano.png new file mode 100644 index 0000000000000000000000000000000000000000..eba3f1a7eff45c71be43a78bfd14aad88aa33ed7 Binary files /dev/null and b/vignettes/stat_toy_forecast_ano.png differ diff --git a/vignettes/stat_trend_expA_expB.png b/vignettes/stat_trend_expA_expB.png new file mode 100644 index 
0000000000000000000000000000000000000000..3e86f459f55c559ff9a7ee398ec840b6841f347d
Binary files /dev/null and b/vignettes/stat_trend_expA_expB.png differ
diff --git a/vignettes/statistics.md b/vignettes/statistics.md
new file mode 100644
index 0000000000000000000000000000000000000000..ff127c4c9fe5550da5a650954f06c8c4fe845063
--- /dev/null
+++ b/vignettes/statistics.md
@@ -0,0 +1,561 @@
+---
+author: "Nicolau Manubens"
+reviser: "An-Chi Ho"
+date: "`r Sys.Date()`"
+output: html_document
+vignette: >
+  %\VignetteEngine{R.rsp::md}
+  %\VignetteIndexEntry{Statistics}
+  %\usepackage[utf8]{inputenc}
+---
+Statistics
+==========
+The **Statistics** module consists of functions that are commonly used in the
+forecast verification process to modify, describe or generate **fields**, a
+field being a series of modelled or observed measurements of a certain
+physical variable of interest.
+
+These functions can be classified into 5 groups according to their purpose:
+
+ - [**Normalizing fields**](#normalizing): `Clim()`, `Ano()` and
+`Ano_CrossValid()`.
+ - [**Describing fields**](#describing): `Trend()`, `Consist_Trend()`,
+`Regression()`, `FitAcfCoef()`, `Alpha()`, `FitAutocor()`, `Spectrum()`, and
+`Eno()`.
+ - [**Filtering fields**](#filtering): `Trend()`, `Consist_Trend()`,
+`Regression()`, `Smoothing()` and `Filter()`.
+ - [**Generating derivative fields**](#generating_der): `Season()`,
+`StatSeasAtlHurr()` and `ProbBins()`.
+ - [**Generating synthetic fields**](#generating_synth): `Toymodel()`.
+
+To master these functions it is important to have in mind the concepts
+introduced in **Data retrieval** as well as the structure and meaning of the
+dimensions of the data arrays provided by the **Data retrieval** module, used
+throughout s2dv:
+```r
+  c(n. of datasets, n. of members, n. of starting dates, n. of lead-times, n. of latitudes, n. of longitudes)
+```
+the last two being present depending on the parameter `output`. Review
+[**Data retrieval**](data_retrieval.md) to settle these concepts.
+
+The explanations in the following sections will be based on the same data used
+in **Data retrieval**. See [**Data retrieval**](data_retrieval.md) for details.
+```r
+library(s2dv)
+expA <- list(name = 'experimentA',
+             path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                              '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc'))
+expB <- list(name = 'experimentB',
+             path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                              '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc'))
+obsX <- list(name = 'observationX',
+             path = file.path('/path/to/observations/$OBS_NAME$/monthly_mean',
+                              '$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc'))
+sdates <- paste0(1991:2000, '1101')
+data <- Load('tas', list(expA, expB), list(obsX),
+             sdates = sdates,
+             leadtimemin = 2, leadtimemax = 13,
+             latmin = -10, latmax = 60,
+             lonmin = 100, lonmax = 250,
+             output = 'areave', grid = 't106grid',
+             method = 'distance-weighted')
+```
+The dimensions of the data are:
+```r
+print(dim(data$mod))
+# [1] 2, 5, 10, 12
+print(dim(data$obs))
+# [1] 1, 1, 10, 12
+```
+
+Normalizing fields
+------------------
+Due to systematic errors in the models, their mean behaviour along the forecast
+time can differ or depart from the mean behaviour of the observed data over
+that forecast time period, a phenomenon also known as **drift**.
+It is hence a standard practice in climate forecast verification to compare
+only the deviations of the experimental and observational data from their mean
+behaviour along the forecast time.
+For this reason most of the functions in
+s2dv use series of deviations as inputs and provide series of
+deviations as outputs.
+
+The mean behaviour along the forecast time of a data set, also known as the
+**climatology**, can be computed with multiple methods. The deviation from
+the climatology is known as the **anomaly** or, more explicitly,
+drift-corrected anomaly, and it can be calculated by simply subtracting the
+climatology of a data set from its raw data.
+
+A general purpose climatology could be computed as follows (the 3rd dimension
+corresponds to the starting dates or model runs):
+```r
+clim_exp <- apply(data$mod, (1:length(dim(data$mod)))[-3], mean, na.rm = TRUE)
+```
+or with the function `MeanDims()` in s2dv:
+```r
+clim_exp <- MeanDims(data$mod, 3, na.rm = TRUE)
+```
+
+s2dv, however, builds upon the per-pair method to compute
+climatologies, which is implemented in the function `Clim()`.
+The `Clim()` parameters `kharin` and `NDV` allow selecting two other more
+refined methods: the trend method and the initial-conditions method. All of
+them are documented in
+```
+Fučkar N, Volpi D, Guemas V, Doblas-Reyes F, 2014, A posteriori adjustment of near-term climate predictions: Accounting for the drift dependence on the initial conditions. Geophysical Research Letters, 41 (14), 5200–5207, doi:10.1002/2014GL060815
+```
+Following the per-pair method, the climatology of an experimental dataset is
+obtained, as in the general purpose climatology, by calculating an average
+across the starting dates at each forecast time, yielding a time series
+defined along the forecast times of the experiment. The climatology of an
+observational dataset is obtained by computing, for each forecast time, the
+average of the observations that date-correspond to that lead-time of each
+starting date.
+To ease this process, `Load()` arranges both experimental and
+observational data in a forecast-like fashion.
+
+`Clim()` and the per-pair method only account, at each forecast time, for the
+starting dates for which data is available in all the experimental and
+observational datasets, hence both experimental and observational data must be
+provided as inputs:
+
+```r
+clim <- Clim(data$mod, data$obs)
+```
+
+The raw data and the climatologies can be visualized with `PlotAno()` and
+`PlotClim()`, respectively:
+```r
+selected_sdates <- gsub('1101', '1201', sdates)
+PlotAno(data$mod, data$obs, selected_sdates,
+        toptitle = paste0(c("Experiment A", "Experiment B"),
+                          ": Raw 'tas' over North Pacific region."),
+        ytitle = c("K", "K"),
+        fileout = paste0("stat_raw_exp", c("A", "B"), "_obsX.eps"))
+PlotClim(clim$clim_exp, clim$clim_obs, monini = 12,
+         toptitle = "Per-pair 'tas' climatologies over the North Pacific region.",
+         ytitle = "K",
+         listexp = c('Experiment A', 'Experiment B'),
+         listobs = c('Observation X'),
+         fileout = "stat_clim_expA_expB_obsX.eps")
+```
+
+Each coloured curve in the `PlotAno()` figures corresponds to a starting date,
+showing the individual ensemble members as thin lines and the ensemble mean in
+bold. The coloured area is delimited by the minimum and maximum ensemble
+values.
+
+Each plot in the `PlotClim()` figure corresponds to the climatology of a
+member of the corresponding experiment or observation.
+
+See [**Visualisation**](visualisation.md) for a complete explanation of the
+plotting functions.
+
+Usually, when working with atmospheric variables, it is not relevant to make a
+distinction among members when computing climatologies because they are
+considered to be equivalent.
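+One way to see whether the member climatologies are indeed interchangeable is
+to look at their spread (a minimal base-R sketch; the dimension order of
+`clim$clim_exp` is an assumption here, check it with `dim()`):
+
+```r
+# Spread of the member climatologies at each lead-time, assuming
+# clim$clim_exp has dimensions c(dataset, member, forecast time).
+member_spread <- apply(clim$clim_exp, c(1, 3), sd, na.rm = TRUE)
+summary(as.vector(member_spread))  # small spreads suggest equivalent members
+```
+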
+It is possible, after seeing they are indeed
+equivalent, to compute the ensemble mean climatology with
+```r
+exp_ens_clim <- MeanDims(clim$clim_exp, 2)
+obs_ens_clim <- MeanDims(clim$clim_obs, 2)
+```
+or, if sure a priori of their equivalence, by setting the parameter
+`memb = FALSE` in `Clim()`:
+```r
+clim <- Clim(data$mod, data$obs, memb = FALSE)
+```
+
+If working with oceanic variables this may not necessarily apply. See
+```
+Du, H., F.J. Doblas-Reyes, J. García-Serrano, V. Guemas, Y. Soufflet and B. Wouters (2012). Sensitivity of decadal predictions to the initial atmospheric and oceanic perturbations. Climate Dynamics, 39, 2013-2023, doi:10.1007/s00382-011-1285-9. http://link.springer.com/article/10.1007%2Fs00382-011-1285-9 Fig 2d is a very good example. If only one climatology is used for this variable, the skill is substantially lower.
+```
+
+Once the climatologies are computed, the anomalies can be obtained with
+`Ano()`, which subtracts the climatologies from each member, starting date,
+latitude and longitude of the corresponding dataset:
+```r
+ano_exp <- Ano(data$mod, clim$clim_exp)
+ano_obs <- Ano(data$obs, clim$clim_obs)
+```
+
+All the starting dates of an experiment contribute to the per-pair climatology
+and, when computing the anomaly of a given starting date, its own contribution
+to the climatology is subtracted, which can imply a loss of information
+[citation].
+To avoid this, the function `Ano_CrossValid()` automatically computes a
+climatology for each starting date without taking that starting date into
+account and subtracts it from the original data:
+```r
+ano <- Ano_CrossValid(data$mod, data$obs, memb = FALSE)
+```
+
+The anomalies can be plotted with `PlotAno()`:
+```r
+PlotAno(ano$exp, ano$obs, selected_sdates,
+        toptitle = paste0(c("Experiment A", "Experiment B"),
+                          ": c.v. 'tas' anomalies over North Pacific region."),
+        ytitle = c("K", "K"), linezero = TRUE,
+        fileout = paste0("stat_ano_exp", c("A", "B"), "_obsX.eps"))
+```
+
+To fulfill the bias correction of the forecasts, i.e. transforming the forecast
+data from the biased model mean state to the real observed mean state, the
+climatology of the observation has to be added to the anomalies of the
+forecasts. Since the score functions take anomalies as working units this step
+is not needed for verification.
+
+Describing fields
+-----------------
+While the drift correction acts on series along the forecast time, the
+description and filtering functions aim to analyse and manipulate data
+also along the actual time (or starting dates).
+The functions to describe fields allow you to compute the trend of a field,
+its regression with another field, its frequency spectrum and its
+autocorrelation.
+
+### Linear trend and regression
+`Trend()` fits first-degree linear models by least-square fitting, taking the
+field given to its argument `var` as dependent variable and the time steps
+(a line of slope 1) as independent variable. A linear model is fitted for each
+actual time series of each dataset, member, lead-time, latitude and longitude
+in `var`. The coefficient and intercept of the linear model, the confidence
+interval relying on a Student's t-distribution and the detrended data are
+provided as output.
+
+`Trend()` can be useful, for example, in global warming scenarios (frequent
+in seasonal and decadal prediction) to detect, describe and remove any
+constantly increasing or decreasing trend.
+The following example computes the
+trends of all the datasets at each lead time and plots the slopes of the trends
+of the experimental datasets with `PlotVsLTime()` (see
+[**Visualisation**](visualisation.md)):
+```r
+trend_exp <- Trend(MeanDims(ano$exp, 2))
+trend_obs <- Trend(MeanDims(ano$obs, 2))
+PlotVsLTime(trend_exp$trend,
+            toptitle = "Slopes of trends of 'tas' over North Pacific region",
+            ytitle = "K/year",
+            monini = 12, freq = 1,
+            listexp = c('Experiment A', 'Experiment B'),
+            fileout = 'stat_trend_expA_expB.eps')
+```
+
+In this case the slopes of the trends are nearly zero at all lead times. The
+raw anomalies of experiment A and the observations are plotted next, side by
+side with the detrended ones:
+```r
+PlotAno(plyr::take(ano$exp, 1, 1), ano$obs,
+        selected_sdates,
+        toptitle = "Experiment A: c.v. 'tas' anomalies over North Pacific region.",
+        ytitle = "K", linezero = TRUE,
+        fileout = paste0("stat_ano_expA_obsX.eps"))
+## PlotAno() requires that the members dimension is present in its inputs.
+## Since we squeezed the members dimension before computing the trend, we need
+## to put it back with InsertDim(): a dimension of length 1 at the 2nd place.
+PlotAno(InsertDim(plyr::take(trend_exp$detrended, 1, 1), 2, 1),
+        InsertDim(trend_obs$detrended, 2, 1),
+        selected_sdates,
+        toptitle = "Experiment A: detrended c.v. 'tas' anomalies over North Pacific region.",
+        ytitle = "K", linezero = TRUE,
+        fileout = paste0("stat_detr_ano_expA_obsX.eps"))
+```
+
+Since the anomaly members have been averaged to compute the trend, the
+detrended data provided by `Trend()` is also an ensemble average.
+
+The parameter `posTR` of `Trend()` specifies the position in `var` of
+the actual time dimension, which by default is expected to be in second place,
+as it is also expected that the input field has been averaged along members
+with `MeanDims()`. It also allows computing the trend along dimensions other
+than the actual time; for example, the forecast time.
+The parameter `interval` specifies the number of time steps between
+consecutive values along the `posTR`th dimension, `1` by default.
+
+The function `Consist_Trend()` only takes into account those lead times and
+starting dates at which observations are available (when used on
+2-dimensional fields, a lead time or starting date will only be discarded if
+all its grid values are not available).
+
+`Regression()`, in contrast to `Trend()`, takes a second field as parameter
+and uses it as independent variable to fit the linear model. It can be useful
+to detect, describe and filter any relationship between two fields, or to
+quantify how much of the variability of one field can be explained by another.
+As an example, let's suppose experiment A is under the effect of the
+phenomena X and Y but only phenomenon Y is relevant for the study case. It
+may therefore be needed to remove any variability of A related to X. Let's
+suppose that experiment B is only under the effect of phenomenon X.
+Then the variability of A related to Y can be isolated by subtracting its
+regression with B:
+```r
+ano_expA_Y <- Regression(plyr::take(ano$exp, 1, 1),
+                         plyr::take(ano$exp, 1, 2), posREG = 3)
+PlotAno(ano_expA_Y$filtered, ano$obs,
+        selected_sdates,
+        toptitle = "Experiment A: c.v. 'tas' anomalies over North Pacific region, X subtracted.",
+        ytitle = "K", linezero = TRUE,
+        fileout = "stat_ano_expA_Y_obsX.eps")
+```
+
+### Frequency spectrum
+The function `Spectrum()` estimates the frequency components of the input
+time series relying on the R base `spectrum()` function.
+In addition to the
+frequency and power of each component, it also computes the 95% and 99%
+significance levels for accepting each frequency as a true component of the
+provided series. These are estimated with a Monte Carlo method.
+
+The spectrum of the ensemble mean of the first starting date of experiment
+A is computed next and the frequency, power and significance levels of the
+most powerful component are printed:
+```r
+components <- Spectrum(MeanDims(plyr::take(ano$exp, c(1, 3), c(1, 1)), 2))
+print(components[1, ])
+# [1] 0.083333333 0.001797855 0.035540044 0.043555352
+```
+
+`Spectrum()` assumes the input data is evenly spaced in time.
+
+Check [**Filtering fields**](#filtering) for an example of filtering
+frequencies estimated with `Spectrum()`.
+
+### Autocorrelation and persistence
+Knowing whether a field is time-correlated with itself is a crucial step to
+determine whether the size of the sample is large enough to yield a robust
+validation. It is also useful to build synthetic models with the same time
+correlation behaviour as another. Knowing the best estimate of the
+autocorrelation at lag 1 is needed for both tasks: it determines the effective
+sample size (see the effective sample size section below) and it parametrizes
+such synthetic models.
+
+`FitAutocor()` estimates the autocorrelation from a first estimate of the
+autocorrelation function, which can be obtained with the R built-in function
+`acf()`, a window to search within by bisection and a precision factor:
+```r
+estacf <- acf(ano$obs[1, 1, , 1], plot = FALSE)$acf
+autocorr <- FitAutocor(estacf, c(-1, 1), 0.01)
+print(autocorr)
+# [1] 0
+```
+
+The obtained autocorrelation of 0 means that, at this precision, the observed
+anomalies show no persistence from one starting date to the next and can be
+treated as independent samples.
+
+`FitAcfCoef()` computes the best estimate of the
+persistence (the autocorrelation at lag 1) from first estimates of the
+autocorrelation at lags 1 and 2, which can be
+obtained by providing the original time series to the R built-in function `acf()`:
+```r
+estacf <- acf(ano$obs[1, 1, , 1], plot = FALSE)$acf
+persistence <- FitAcfCoef(max(estacf[2], 0), max(estacf[3], 0))
+```
+
+The function `Alpha()` automatically computes the persistence by calling
+`FitAcfCoef()` after computing the first estimates with `acf()`:
+```r
+persistence <- Alpha(ano$obs[1, 1, , 1])
+print(persistence)
+# [1] 0
+```
+
+`Alpha()`, additionally, has two parameters to enable linear detrending and/or
+filtering of frequency peaks prior to the computation of the persistence,
+convenient if a trend or strong periodic signals would otherwise inflate the
+persistence estimate.
+
+### Effective sample size
+`Eno()` and `EnoNew()` compute the effective number of independent data.
+While `Eno()` relies on the `eno` function from "rclim.txt" from Caio Coelho
+and operates on s2dv common formatted arrays, `EnoNew()` relies on
+the method described in
+```
+Guemas V., Auger L., Doblas-Reyes F., JAMC, 2013
+```
+and operates on time series, applying, if requested via the parameters
+`detrend` and `filter`, a detrending and a filtering of frequency peaks,
+similarly to `Alpha()`.
+```r
+eno <- Eno(data$mod, 3)
+print(unique(as.vector(eno)))
+#  [1] 10.000000  9.104402  6.805723  8.270278  9.492232  8.783534  7.756835
+#  [8]  9.270968  8.025609
+```
+This means that the 10 values along the starting date dimension are not all
+independent: depending on the dataset, member and lead time, they are worth
+between roughly 7 and 10 effectively independent values.
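+
+The relation between persistence and effective sample size can be sketched by
+hand (an illustration of the idea, not the exact implementation of `Eno()`):
+for an AR(1) process with lag-1 autocorrelation `rho`, `n` samples are worth
+roughly `n * (1 - rho) / (1 + rho)` independent ones.
+```r
+series <- ano$obs[1, 1, , 1]
+rho <- Alpha(series)  # persistence at lag 1
+n_eff <- length(series) * (1 - rho) / (1 + rho)
+```
+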
+
+Filtering fields
+----------------
+Depending on the source and nature of the field to study and on the study
+case, it is often needed to isolate, remove or modulate its components.
+
+As seen in [**Describing fields**](#describing), the functions `Trend()` and
+`Consist_Trend()` remove linear trends and provide detrended data in the
+component `detrended` of their output. Likewise, `Regression()` removes
+behaviours related to other fields and provides the filtered data in the
+output component `filtered`.
+
+### Running means
+
+`Smoothing()` calculates running means to remove high frequency variability.
+The parameter `runmeanlen` specifies the running window width and
+`numdimt` which dimension to smooth along (4 by default; along forecast time).
+In this deliberately simple example the anomalies are smoothed along the
+forecast time with a window of 3 months to remove the inter-seasonal
+variability. The first and last forecast months are lost, as can be seen in
+the figure:
+```r
+smoothed_ano_exp <- Smoothing(ano$exp, 3)
+smoothed_ano_obs <- Smoothing(ano$obs, 3)
+PlotAno(plyr::take(smoothed_ano_exp, 1, 1), smoothed_ano_obs,
+        selected_sdates,
+        toptitle = "Experiment A: smoothed c.v. 'tas' anomalies over North Pacific region.",
+        ytitle = "K", linezero = TRUE,
+        fileout = "stat_smooth_ano_expA_obsX.eps")
+```
+
+### Frequency filtering
+`Filter()` filters a specified frequency from the input data. The filtering is
+performed by dichotomy, seeking the frequency close to the specified one and
+the phase that maximize the signal to subtract from the data.
+The maximization of the signal to subtract relies on a minimization of the
+mean square differences between the data and a cosine of each candidate
+frequency and phase within the search range.
+
+The following example filters all the frequencies in experiment A with a
+power beyond the 99% significance level, with the purpose of removing its
+most significant periodic components:
+```r
+ens_mean_ano_expA <- MeanDims(plyr::take(ano$exp, 1, 1), 2)
+for (sdate in 1:length(sdates)) {
+  spectrum <- Spectrum(ens_mean_ano_expA[1, sdate, ])
+  for (freq in 1:dim(spectrum)[1]) {
+    if (spectrum[freq, 2] > spectrum[freq, 4]) {
+      ens_mean_ano_expA[1, sdate, ] <- Filter(ens_mean_ano_expA[1, sdate, ],
+                                              spectrum[freq, 1])
+    }
+  }
+}
+PlotAno(InsertDim(ens_mean_ano_expA, 2, 1), ano$obs,
+        selected_sdates,
+        toptitle = "Experiment A: filtered c.v. 'tas' anomalies over North Pacific region.",
+        ytitle = "K", linezero = TRUE,
+        fileout = "stat_filter_ano_expA.eps")
+```
+
+Generating derivative fields
+----------------------------
+Some of the functions in the package generate fields based on data from other
+fields, for example seasonal means from monthly time series or downscaled data.
+
+### Seasonal means
+`Season()` computes seasonal means from data objects in the common
+s2dv structure along the forecast time dimension, which is expected
+to be, by default, the 4th. The data is expected to be evenly spaced along
+that dimension.
+The initial month of the time series must be specified, as well as the initial
+and final months of the desired season. The parameter `posdim` specifies the
+dimension to compute the seasonal mean along (the 4th, forecast time, by
+default).
+
+The following example loads the data without area averaging, i.e.
gridded data,
+and computes and plots the MAM seasonal mean climatology of each grid cell over
+the North Pacific region:
+```r
+data_map <- Load('tas', list(expA, expB), list(obsX),
+                 sdates = sdates,
+                 leadtimemin = 2, leadtimemax = 13,
+                 latmin = -10, latmax = 60,
+                 lonmin = 100, lonmax = 250,
+                 output = 'lonlat', grid = 't106grid',
+                 method = 'distance-weighted')
+clim_map <- Clim(data_map$mod, data_map$obs, memb = FALSE)
+mam_clim_exp <- Season(clim_map$exp, posdim = 2,
+                       monini = 12, moninf = 3, monsup = 5)
+mam_clim_obs <- Season(clim_map$obs, posdim = 2,
+                       monini = 12, moninf = 3, monsup = 5)
+brks <- round(seq(min(mam_clim_exp, mam_clim_obs, na.rm = TRUE),
+                  max(mam_clim_exp, mam_clim_obs, na.rm = TRUE),
+                  length.out = 51), 2)
+cols <- clim.colors(50)
+png('stat_season_mam_expA.png', width = 800, height = 600, type = "Xlib")
+PlotEquiMap(mam_clim_exp[1, 1, , ], data_map$lon, data_map$lat,
+            toptitle = "Experiment A: Spring (MAM) 'tas' climatologies.",
+            units = "K", brks = brks, cols = cols, subsampleg = 10)
+dev.off()
+png('stat_season_mam_obsX.png', width = 800, height = 600, type = "Xlib")
+PlotEquiMap(mam_clim_obs[1, 1, , ], data_map$lon, data_map$lat,
+            toptitle = "Observation X: Spring (MAM) 'tas' climatologies.",
+            units = "K", brks = brks, cols = cols, subsampleg = 10)
+dev.off()
+```
+
+### Categorizing data
+`ProbBins()`
+
+### Statistical downscaling
+`StatSeasAtlHurr()`
+
+Generating synthetic fields
+---------------------------
+Performing statistical/inference tests often requires generating new data
+with empirical models that comply with a given set of distribution
+parameters.
+
+**NOTE:** `GenSeries()` is not included in s2dv. It can be found in s2dverification.
+If you need this function, contact the maintainer and we can include it in s2dv.
+
+`GenSeries()` generates a series with a specified length, autocorrelation at
+lag 1, mean and standard deviation. For example, a synthetic model can be
+parametrized to generate a forecast with the same parameters as
+experiment A:
+```r
+synth_exp_A <- apply(plyr::take(data$mod, 1, 1), c(1, 2, 3),
+                     function(x) {
+                       GenSeries(length(x), Alpha(x), mean(x, na.rm = TRUE),
+                                 sd(x, na.rm = TRUE))
+                     })
+```
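+
+Since `GenSeries()` is not shipped with s2dv, a minimal AR(1) stand-in (a
+sketch of the idea, not the original implementation) could look like:
+```r
+# Generate n values of a stationary AR(1) process with lag-1 autocorrelation
+# 'alpha', then rescale to the requested mean 'mu' and standard deviation
+# 'sigma'. The innovation variance 1 - alpha^2 keeps the process at unit
+# variance before rescaling.
+gen_ar1 <- function(n, alpha, mu, sigma) {
+  x <- numeric(n)
+  x[1] <- rnorm(1)
+  for (t in seq_len(n)[-1]) {
+    x[t] <- alpha * x[t - 1] + rnorm(1, sd = sqrt(1 - alpha^2))
+  }
+  mu + sigma * x
+}
+```
+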
+
+`Toymodel()` directly generates forecasts in the s2dv common data
+structure, imitating seasonal to decadal forecasts. The toymodel is based on
+the model presented in
+```
+Weigel et al. (2008) QJRMS
+```
+with an extension to consider non-stationary distributions prescribing a
+linear trend. The toymodel can generate an artificial forecast based on
+real observations provided by the input (from `Load()`) or on artificially
+generated observations based on the input parameters `sig` and `trend`. The
+forecast can be specified for any number of starting dates, lead times and
+ensemble members. It imitates the components of a forecast: (1)
+predictability, (2) forecast error, (3) non-stationarity and (4) ensemble
+generation.
+
+The following example generates a forecast for seasonal prediction with
+synthetic observations, and plots its anomalies:
+```r
+nstartd <- 30
+sdates_toy <- paste0(1981:(1980 + nstartd), '1101')
+toy_forecast <- Toymodel(alpha = 0.1, beta = 0.3, gamma = 1, sig = 1,
+                         trend = 0.02, nstartd = nstartd, nleadt = 12,
+                         nmemb = 10)
+ano_toy <- Ano_CrossValid(toy_forecast$mod, toy_forecast$obs)
+PlotAno(ano_toy$exp, ano_toy$obs, sdates_toy,
+        toptitle = "Toy forecast: anomalies.",
+        ytitle = "units", linezero = TRUE,
+        fileout = "stat_toy_forecast_ano.eps")
+```
+
+It is also possible to generate model data from observational data obtained
+with `Load()`. In that case the only required parameters are the
+predictability, the error standard deviation and the factor on the linear
+trend to sample uncertainty:
+
+```r
+toy_forecast_C <- Toymodel(alpha = 0.1, beta = 0.3, gamma = 1, nmemb = 10,
+                           obsini = data$obs)
+PlotAno(toy_forecast_C$mod, toy_forecast_C$obs, selected_sdates,
+        toptitle = "Toy forecast from observation X: anomalies.",
+        ytitle = "K", linezero = TRUE,
+        fileout = "stat_toy_expC_obsX_ano.eps")
+```
diff --git a/vignettes/tutorial.md b/vignettes/tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..5186797844c356c201aa79c17f631fb1be50da04
--- /dev/null
+++ b/vignettes/tutorial.md
@@ -0,0 +1,201 @@
+---
+author: "Nicolau Manubens"
+reviser: "An-Chi Ho"
+date: "`r Sys.Date()`"
+output: html_document
+vignette: >
+  %\VignetteEngine{R.rsp::md}
+  %\VignetteIndexEntry{Tutorial}
+  %\usepackage[utf8]{inputenc}
+---
+Tutorial
+========
+
+Seasonal forecast of extreme events
+-----------------------------------
+
+### Objectives
+- Basics of R language
+- Learning to use s2dv
+
+### 1- Load files, compute basic statistics, plot map and time series
+
+#### Exercise 1 - First steps with R and s2dv
+- Make sure you have the CDO tools installed in your system, or download and
+install them as described in:
+
+https://code.zmaw.de/projects/cdo/wiki/cdo
+
+- Open R (typing R in the terminal).
+- Install s2dv with:
+
+```r
+install.packages('s2dv')
+```
+
+- Load s2dv with the following command:
+
+```r
+library(s2dv)
+```
+
+- The documentation of s2dv is available online here:
+
+https://cran.r-project.org/web/packages/s2dv/s2dv.pdf
+
+- You can see the list of available functions in the package by typing:
+```r
+help(package = s2dv)
+```
+- To see the help of a specific function, you can type:
+
+```r
+help(FunctionName)
+```
+
+- Here I give you an example of how to use the `Load()` function. It assumes
+you have some *tas* data for the experiments from all the ENSEMBLES models
+(named *EnsCmccSeas*, *EnsIfmSeas*, *EnsEcmwfSeas*, *EnsMetfrSeas* and
+*EnsUkmoSeas* at BSC-ES) and for the ERA-interim reanalysis (*ERAint*) in the
+folders */example/experiments* and */example/observations*.
+See the [**Example**](example.md) of use or
+[**Data retrieval**](data_retrieval.md) for a full explanation of what is done
+here.
+
+```r
+exp_path_pattern <- file.path("/example/experiments/ENSEMBLES/$EXP_NAME$",
+                              "$STORE_FREQ$_mean/$VAR_NAME$",
+                              "$VAR_NAME$_$START_DATE$.nc")
+cmcc_seas <- list(name = "EnsCmccSeas", path = exp_path_pattern)
+ifm_seas <- list(name = "EnsIfmSeas", path = exp_path_pattern)
+ecmwf_seas <- list(name = "EnsEcmwfSeas", path = exp_path_pattern)
+metfr_seas <- list(name = "EnsMetfrSeas", path = exp_path_pattern)
+ukmo_seas <- list(name = "EnsUkmoSeas", path = exp_path_pattern)
+
+obs_path_pattern <- file.path("/example/observations/$OBS_NAME$",
+                              "$STORE_FREQ$_mean/$VAR_NAME$",
+                              "$VAR_NAME$_$YEAR$$MONTH$.nc")
+era_int <- list(name = "ERAint", path = obs_path_pattern)
+
+Data <- Load("tas",
+             exp = list(cmcc_seas, ifm_seas, ecmwf_seas, metfr_seas, ukmo_seas),
+             obs = list(era_int),
+             sdates = c("19930501", "19940501"),
+             leadtimemin = 1, leadtimemax = 7, storefreq = "monthly",
+             sampleperiod = 1, nmember = 9, output = "areave",
+             lonmin = 190, lonmax = 240, latmin = -5, latmax = 5)
+```
+
+- You can copy-paste it in R, run it and check that everything works fine. Once it is finished, have a look at the dimensions of `Data` by typing these commands in R:
+```r
+dim(Data$mod)
+dim(Data$obs)
+```
+
+- With the help of the `Load()` function documentation, try to understand what
+the different dimensions are.
+- Now run `Load()` again with the same parameters, but change
+`output = 'areave'` to `output = 'lonlat'`.
+- What are the dimensions of `Data` now?
+- What happens if you remove the `lonmin`, `lonmax`, `latmin`, `latmax`
+options?
+- What happens if you change the `leadtimemin` and `leadtimemax` options?
+
+#### Exercise 2 - First compute and plot 2m-temperature skill over Europe
+
+- In the directory named *handson* create a directory named *R* and browse into
+that directory.
+- Open a new file named *corrskill-europe.R*.
+- At the beginning of the file, load the s2dv package:
+
+```r
+library(s2dv)
+```
+
+- If you are using emacs and you don't have colour highlighting, just execute
+this line in a terminal and reopen the file:
+
+```
+/usr/local/bin/ictp-install ess
+```
+
+- Generate a list of starting dates to be used in the `sdates` argument of
+`Load()` which includes all May starting dates between 1979 and 2005 (you can
+have a look at the help of the functions `seq()` and `paste()` to do it; a
+minimal sketch is given at the end of this exercise).
+- Based on the example of the previous exercise, try to load the lead-times 2,
+3 and 4 of 2m-temperature (*tas*) of all ENSEMBLES models, over Europe
+(20W70E-25N75N) for all May starting dates between 1979 and 2005.
+- Calculate the June-July-August seasonal mean using the `MeanDims()` function
+for both models and observations.
+- With the same function, calculate the ensemble mean of each model.
+- With the `Corr()` function, calculate the time correlation in
+June-July-August between each model and the ERA-interim reanalysis.
+- Using `PlotEquiMap()`, plot a map of the correlation coefficient for each
+model and save it in a PostScript file (you can use the `postscript()` and
+`dev.off()` functions). You can choose the following interval and the following
+color bar:
+
+```r
+colors <- clim.colors(20)
+interval <- seq(-1, 1, length.out = length(colors) + 1)
+```
+
+or generate another one using the `brewer.pal()` function.
+- If you want to generate a multipanel plot you can use the `layout()`
+function (you can have a look at this document for more help:
+http://seananderson.ca/courses/11-multipanel/multipanel.pdf). Be careful,
+some features of `PlotEquiMap()` are not available with multipanel plots.
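+
+A minimal sketch of the first steps of this exercise (the variable names are
+our own, and the seasonal mean assumes the loaded lead-times 2 to 4 correspond
+to June, July and August):
+```r
+sdates <- paste0(seq(1979, 2005), '0501')  # "19790501" ... "20050501"
+# With output = 'lonlat', Data$mod has dimensions
+# c(dataset, member, sdate, ftime, lat, lon): average over the forecast time
+# (dimension 4) for the JJA mean, then over the members (dimension 2) for the
+# ensemble mean.
+jja_mod <- MeanDims(Data$mod, 4)
+jja_obs <- MeanDims(Data$obs, 4)
+ens_mean_mod <- MeanDims(jja_mod, 2)
+```
+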
+
+#### Exercise 3 - Compare the skill over land over the mediterranean region
+
+- In the directory named *handson/R*, open a new file named
+*Timeserie-skill-mediter.R*.
+- The mediterranean region 3E25E-36N44N covers both sea and land, but we would
+like to calculate the skill only over land. For this, you need to have the
+land-sea masks of ENSEMBLES and ERA-interim in NetCDF files, for
+example, */path/to/experiments/ENSEMBLES/masks/land_sea.nc* and
+*/path/to/observations/ERAint/masks/land_sea.nc*.
+- To create masks usable in the `maskobs` and `maskmod` arguments of the
+`Load()` function, you can use the following lines:
+
+```r
+library(ncdf4)  # provides nc_open(), ncvar_get() and nc_close()
+
+fnc <- nc_open('/path/to/experiments/ENSEMBLES/masks/land_sea.nc')
+maskENS <- ncvar_get(fnc, 'LSM')
+nc_close(fnc)
+maskENS[which(is.na(maskENS))] <- 1
+
+fnc <- nc_open('/path/to/observations/ERAint/masks/land_sea.nc')
+maskERAI <- ncvar_get(fnc, 'LSM')
+nc_close(fnc)
+maskERAI[maskERAI > 0.5] <- 1
+maskERAI[maskERAI <= 0.5] <- 0
+
+listmaskmod <- list(maskENS, maskENS, maskENS, maskENS, maskENS)
+```
+
+- Use the `Load()` function with the `maskobs` and `maskmod` arguments to load
+the data averaged over the land in the mediterranean region for all ENSEMBLES
+models and the ERA-interim reanalysis, for all May starting dates between 1979
+and 2005.
+- With the `Clim()` and `Ano()` functions of s2dv, calculate the
+anomalies of both models and observation.
+- Calculate the ensemble mean of the models.
+- Calculate the Root Mean Square Skill Score (RMSSS) for all lead-times, with
+the `RMSSS()` function of s2dv.
+- Use the `PlotVsLTime()` function to plot this score and save it in a
+PostScript file. `PlotVsLTime()` expects a third dimension of the score of
+size 4 (lower confidence interval, score, upper confidence interval and
+significance level), while with `RMSSS()` you will have only 2 values (score
+and p-value). To do so you can use the following commands to reshape your
+score matrix:
+
+```r
+skillreshape <- array(dim = c(5, 1, 4, 7))  # models, obs., statistics, lead-times
+skillreshape[, , c(2, 4), ] <- skill[, , c(1, 2), ]
+```
+
+- Do the same plot, but instead of plotting a line for the p-values, mark with
+a dot the significant values (p-values under 0.05).
To do this you can have a +look at the tutorial at + +http://www.harding.edu/fmccown/r/ diff --git a/vignettes/vis_acc_expA_expB_obsX.png b/vignettes/vis_acc_expA_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..f5f66774fb9c7c02d64ba453852249e20408db1d Binary files /dev/null and b/vignettes/vis_acc_expA_expB_obsX.png differ diff --git a/vignettes/vis_anim_clim_expA.gif b/vignettes/vis_anim_clim_expA.gif new file mode 100644 index 0000000000000000000000000000000000000000..7554148d5550fcb194effa41520b06b8a7624f4d Binary files /dev/null and b/vignettes/vis_anim_clim_expA.gif differ diff --git a/vignettes/vis_anim_clim_expA_world.gif b/vignettes/vis_anim_clim_expA_world.gif new file mode 100644 index 0000000000000000000000000000000000000000..f5980fc63ac320a6a99457e2e3c18fdc274d81c6 Binary files /dev/null and b/vignettes/vis_anim_clim_expA_world.gif differ diff --git a/vignettes/vis_anim_clim_expB.gif b/vignettes/vis_anim_clim_expB.gif new file mode 100644 index 0000000000000000000000000000000000000000..465a71528f933b7add55df5ab8d0c81572df1a94 Binary files /dev/null and b/vignettes/vis_anim_clim_expB.gif differ diff --git a/vignettes/vis_anim_clim_obsX.gif b/vignettes/vis_anim_clim_obsX.gif new file mode 100644 index 0000000000000000000000000000000000000000..c8d2aefe08a5b115e480434581210b44f5205da8 Binary files /dev/null and b/vignettes/vis_anim_clim_obsX.gif differ diff --git a/vignettes/vis_anim_clim_obsX_world.gif b/vignettes/vis_anim_clim_obsX_world.gif new file mode 100644 index 0000000000000000000000000000000000000000..1d94c28ce9ce9afa3c3c8f0a4c1b09a819fae6ea Binary files /dev/null and b/vignettes/vis_anim_clim_obsX_world.gif differ diff --git a/vignettes/vis_ano_exp_obs.png b/vignettes/vis_ano_exp_obs.png new file mode 100644 index 0000000000000000000000000000000000000000..2047c681ebdde2fcdd4c8d8e445f4a897a5f9df3 Binary files /dev/null and b/vignettes/vis_ano_exp_obs.png differ diff --git a/vignettes/vis_ano_exp_points.png b/vignettes/vis_ano_exp_points.png new file mode 100644 index 0000000000000000000000000000000000000000..cdcfd8e5c3c8ae2adea46ccb439adedda89f2f4e Binary files /dev/null and b/vignettes/vis_ano_exp_points.png differ diff --git a/vignettes/vis_clim_expA_expB_obsX.png b/vignettes/vis_clim_expA_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..48dbcfb1302fbea352432917b42846d7b8e1a702 Binary files /dev/null and b/vignettes/vis_clim_expA_expB_obsX.png differ diff --git a/vignettes/vis_conf_interval_exp.png b/vignettes/vis_conf_interval_exp.png new file mode 100644 index 0000000000000000000000000000000000000000..85cf56d5a37ff14720c42316ad30a853cef6c89a Binary files /dev/null and b/vignettes/vis_conf_interval_exp.png differ diff --git a/vignettes/vis_corr_expA_expB_obsX.png b/vignettes/vis_corr_expA_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..dcaa96aaefd82516cdc9ac8a0c18fd94b4b931eb Binary files /dev/null and b/vignettes/vis_corr_expA_expB_obsX.png differ diff --git a/vignettes/vis_corr_rms_expA_expB_obsX.png b/vignettes/vis_corr_rms_expA_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..ec8e5fee5429d883949815f8e5b8e9f59b44f7b6 Binary files /dev/null and b/vignettes/vis_corr_rms_expA_expB_obsX.png differ diff --git a/vignettes/vis_eno_expA_expB.png b/vignettes/vis_eno_expA_expB.png new file mode 100644 index 0000000000000000000000000000000000000000..e5fc71dc41ad5fec242029c3d3b6b452bff51b52 Binary files /dev/null and 
b/vignettes/vis_eno_expA_expB.png differ diff --git a/vignettes/vis_equimap_box_expA.png b/vignettes/vis_equimap_box_expA.png new file mode 100644 index 0000000000000000000000000000000000000000..7c5686ad31e46de4e976fbb9e78e6ce505bdc7b5 Binary files /dev/null and b/vignettes/vis_equimap_box_expA.png differ diff --git a/vignettes/vis_equimap_cols_raw_expA.png b/vignettes/vis_equimap_cols_raw_expA.png new file mode 100644 index 0000000000000000000000000000000000000000..afd7550eba4cba35630387787faa43d425cc37db Binary files /dev/null and b/vignettes/vis_equimap_cols_raw_expA.png differ diff --git a/vignettes/vis_equimap_cols_raw_obsX.png b/vignettes/vis_equimap_cols_raw_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..b3cf03532291470b231ce90e295c8c79f5fc87f9 Binary files /dev/null and b/vignettes/vis_equimap_cols_raw_obsX.png differ diff --git a/vignettes/vis_equimap_contour_raw_expA.png b/vignettes/vis_equimap_contour_raw_expA.png new file mode 100644 index 0000000000000000000000000000000000000000..15b3a630fa62b4e49ec63e7529affcdb804ffeb7 Binary files /dev/null and b/vignettes/vis_equimap_contour_raw_expA.png differ diff --git a/vignettes/vis_equimap_contour_raw_obsX.png b/vignettes/vis_equimap_contour_raw_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..a526700e7c12c690de0b37fc943e5d58a48f82ec Binary files /dev/null and b/vignettes/vis_equimap_contour_raw_obsX.png differ diff --git a/vignettes/vis_equimap_raw_expA.png b/vignettes/vis_equimap_raw_expA.png new file mode 100644 index 0000000000000000000000000000000000000000..7f8441331ec7047b9e787f794abbe9a1b9077d36 Binary files /dev/null and b/vignettes/vis_equimap_raw_expA.png differ diff --git a/vignettes/vis_equimap_raw_obsX.png b/vignettes/vis_equimap_raw_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..f99ae26dd438ff43698d3d66872fcdbf5a5fb1dd Binary files /dev/null and b/vignettes/vis_equimap_raw_obsX.png differ diff --git a/vignettes/vis_error_bar.png b/vignettes/vis_error_bar.png new file mode 100644 index 0000000000000000000000000000000000000000..87718e965faac97ba5f1f866016372ea57c37719 Binary files /dev/null and b/vignettes/vis_error_bar.png differ diff --git a/vignettes/vis_iqr_expA_expB.png b/vignettes/vis_iqr_expA_expB.png new file mode 100644 index 0000000000000000000000000000000000000000..111893da0341bb854cfea68c2c63db2c7a80f565 Binary files /dev/null and b/vignettes/vis_iqr_expA_expB.png differ diff --git a/vignettes/vis_layout_complex.png b/vignettes/vis_layout_complex.png new file mode 100644 index 0000000000000000000000000000000000000000..dfbfc96b356901a0360b25512bd8b31da9947019 Binary files /dev/null and b/vignettes/vis_layout_complex.png differ diff --git a/vignettes/vis_layout_equimap_expA.png b/vignettes/vis_layout_equimap_expA.png new file mode 100644 index 0000000000000000000000000000000000000000..6804590a36d0c071f241c65519618f708d2fe4c1 Binary files /dev/null and b/vignettes/vis_layout_equimap_expA.png differ diff --git a/vignettes/vis_mad_expA_expB.png b/vignettes/vis_mad_expA_expB.png new file mode 100644 index 0000000000000000000000000000000000000000..23ff8a7fa62a9f234ee2e7555f41bec03c43db1b Binary files /dev/null and b/vignettes/vis_mad_expA_expB.png differ diff --git a/vignettes/vis_maxmin_expA_expB.png b/vignettes/vis_maxmin_expA_expB.png new file mode 100644 index 0000000000000000000000000000000000000000..00996ecc8093d15d915450c0ada52c25d912927b Binary files /dev/null and b/vignettes/vis_maxmin_expA_expB.png differ diff --git 
a/vignettes/vis_ratiorms_expA_expB_obsX.png b/vignettes/vis_ratiorms_expA_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..3b5bafcbf3dd02c8d4d3e9279ff005b835733d1c Binary files /dev/null and b/vignettes/vis_ratiorms_expA_expB_obsX.png differ diff --git a/vignettes/vis_ratiosdrms_expA_expB_obsX.png b/vignettes/vis_ratiosdrms_expA_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..47d65fdb78aa7c8e4015b80c685e7564f1b7b18d Binary files /dev/null and b/vignettes/vis_ratiosdrms_expA_expB_obsX.png differ diff --git a/vignettes/vis_ratiosdrms_expA_obsX_obsXrnorm.png b/vignettes/vis_ratiosdrms_expA_obsX_obsXrnorm.png new file mode 100644 index 0000000000000000000000000000000000000000..36dc1f03f737fae63fa3e0c0a9dd70c74575d713 Binary files /dev/null and b/vignettes/vis_ratiosdrms_expA_obsX_obsXrnorm.png differ diff --git a/vignettes/vis_raw_expA_obsX.png b/vignettes/vis_raw_expA_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..bec40259be7a2d4c0aa93f74bae39e6ce18f4437 Binary files /dev/null and b/vignettes/vis_raw_expA_obsX.png differ diff --git a/vignettes/vis_raw_expB_obsX.png b/vignettes/vis_raw_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..de1e17e23cabf5136fef8c4cc65ad760a5704b96 Binary files /dev/null and b/vignettes/vis_raw_expB_obsX.png differ diff --git a/vignettes/vis_regression_expA_expB.png b/vignettes/vis_regression_expA_expB.png new file mode 100644 index 0000000000000000000000000000000000000000..41481789122710bbd4663c296f3f7e63dae53616 Binary files /dev/null and b/vignettes/vis_regression_expA_expB.png differ diff --git a/vignettes/vis_rms_expA_expB_obsX.png b/vignettes/vis_rms_expA_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..cbc57cebe04135094f6100aa5b04181be5a3bb4b Binary files /dev/null and b/vignettes/vis_rms_expA_expB_obsX.png differ diff --git a/vignettes/vis_rmsss_expA_expB_obsX.png b/vignettes/vis_rmsss_expA_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..5f2aa5a0450ebf163c6029131f00bfb5962472df Binary files /dev/null and b/vignettes/vis_rmsss_expA_expB_obsX.png differ diff --git a/vignettes/vis_sd_expA_expB.png b/vignettes/vis_sd_expA_expB.png new file mode 100644 index 0000000000000000000000000000000000000000..f5ec2917a256d135e922fbc798cb3b77f608ad53 Binary files /dev/null and b/vignettes/vis_sd_expA_expB.png differ diff --git a/vignettes/vis_stereomap_raw_expA.png b/vignettes/vis_stereomap_raw_expA.png new file mode 100644 index 0000000000000000000000000000000000000000..fb7b265f17f3208599f60be708cd619be2179b21 Binary files /dev/null and b/vignettes/vis_stereomap_raw_expA.png differ diff --git a/vignettes/vis_stereomap_raw_obsX.png b/vignettes/vis_stereomap_raw_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..4d1ffe56ed243fc73a7d136bbadf341347d5b58f Binary files /dev/null and b/vignettes/vis_stereomap_raw_obsX.png differ diff --git a/vignettes/vis_trend_expA_expB.png b/vignettes/vis_trend_expA_expB.png new file mode 100644 index 0000000000000000000000000000000000000000..1ec1cb998f2ac9f016cb1ee11c800019507f6320 Binary files /dev/null and b/vignettes/vis_trend_expA_expB.png differ diff --git a/vignettes/visualisation.md b/vignettes/visualisation.md new file mode 100644 index 0000000000000000000000000000000000000000..2c526ca0406f9b1300d5a14ebf2b573d080c9992 --- /dev/null +++ b/vignettes/visualisation.md @@ -0,0 +1,772 @@ +--- +author: "Nicolau 
Manubens" +date: "`r Sys.Date()`" +output: html_document +vignette: > + %\VignetteEngine{R.rsp::md} + %\VignetteIndexEntry{Visualisation} + %\usepackage[utf8]{inputenc} +--- +Visualisation +============= + +s2dverification contains a set of functions to plot data at every stage of the +verification process, most based directly on R graphics plotting tools. +These functions are essential to: + - Quickly inspect the results of a newly produced experiment, i.e. to check +the physical consistency of the results. + - Assess the added value of a new prediction system, i.e. to compare new +results with a reference (observation, reconstruction or other experiment). + - Assess visually the significance of results, i.e. to display in a +user-friendly way confidence intervals and other statistics. + +The visualisation functions, most with a name following the pattern +`Plot*()`, can be grouped as a function of the kind of plot they provide: + - [**Plotting time series**](#time_series): `PlotClim()`, `PlotAno()`, +`PlotVsLtime()`, `Plot2VarsVsLTime()`, `PlotBoxWhisker()` and `PlotACC()`. + - [**Plotting maps**](#maps): `PlotEquiMap()`, +`PlotStereoMap()`, `AnimateMap()`, `PlotLayout()` and `PlotSection()`. + +To master these functions it is convenient to have in mind the common array +dimension structure used throughout in s2dverification and how it evolves as +the data objects go through the statistics and verification stages. For that +you can review the introduction in [**Data retrieval**](data_retrieval.md) and +the sections [**Statistics**](statistics.md) and +[**Verification**](verification.md). + +Next an explanation of which situations they fit the best, details of the +options they provide and short examples of usage. The data used hereunder will +be the same as in [**Data retrieval**](data_retrieval.md): +```r +library(s2dverification) +expA <- list(name = 'experimentA', + path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean', + '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc')) +expB <- list(name = 'experimentB', + path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean', + '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc')) +obsX <- list(name = 'observationX', + path = file.path('/path/to/observations/$OBS_NAME$/monthly_mean', + '$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc')) +sdates <- paste0(1991:2000, '1101') +data <- Load('tas', list(expA, expB), list(obsX), + sdates = sdates, + leadtimemin = 2, leadtimemax = 13, + latmin = -10, latmax = 60, + lonmin = 100, lonmax = 250, + output = 'areave', grid = 't106grid', + method = 'distance-weighted') +ano <- Ano_CrossValid(data$mod, data$obs, memb = FALSE) +``` + +Plotting time series +-------------------- + +All the functions devoted to plotting time series have some common traits and +parameters: + - Aimed to plot monthly, seasonal or yearly time series, adjustable via the +parameter `freq`. + - All of them generate plots and save them to PostScript files the path and +file name of which you have to provide to the `fileout` parameter. Can plot in +a presentation oriented style or in a paper oriented style, adjustable with the +parameter `biglab`. + - Can plot a legend automatically, adjustable via the parameters `leg`, +`listexp` and `listobs`/`listvar` in `PlotVsLTime()`, `Plot2VarVsLtime()` and +`PlotClim()` or via the parameter `legends` in `PlotAno()` and `PlotACC()`. + - Accept any additional parameters via the parameter `...` to be sent to +the underlying R graphics `plot()` function for a fine tuning. 
+
+**Note:** A general purpose time-series plotting function, `PlotTimeSeries()`,
+is currently being developed. This function will agglomerate all the
+functionality required to generate the plots resulting from all the currently
+available functions in `s2dverification` and will be based on the `ggplot2`
+package. The current functions will be kept as they are but will simply be an
+interface to `PlotTimeSeries()`. See
+[**this report**](https://earth.bsc.es/gitlab/es/s2dverification/blob/develop-PlotTimeSeries/inst/doc/PlotTimeSeries/PlotTimeSeries.pdf)
+for an extended explanation and examples of the new
+time-series plotting function.
+
+### Plotting climatologies
+
+`PlotClim()` takes as inputs an experimental and an observational array of 2
+to 3 dimensions each:
+```r
+  c(n. of datasets, n. of members, n. of forecast times)
+```
+the second one being optional. This means that the latitude and longitude
+dimensions must have been removed, either because you are working with area
+averages or with a single grid point, and that the starting dates dimension
+must also have been removed, either because you want to plot data of a single
+starting date or because you are working with averages across starting dates,
+i.e. climatologies (see [**Statistics**](statistics.md)).
+
+A curve is plotted for each member of each experiment or observation. The
+members of a same experiment or observation are equally coloured whereas the
+different experiments and observations are plotted with different colours.
+`PlotClim()` assumes by default that the provided climatologies start in
+January. Otherwise the initial month can be specified with the parameter
+`monini`:
+
+```r
+clim <- Clim(data$mod, data$obs)
+PlotClim(clim$exp, clim$obs, monini = 12,
+         toptitle = "Per-pair 'tas' climatologies, North Pacific",
+         ytitle = "K",
+         listexp = c('Experiment A', 'Experiment B'),
+         listobs = c('Observation X'),
+         fileout = "vis_clim_expA_expB_obsX.png")
+```
+
+### Plotting multi-member raw data or anomalies
+
+`PlotAno()` also takes two data arrays as inputs, with 4 dimensions each:
+```r
+  c(n. of datasets, n. of members, n. of starting dates, n. of forecast times)
+```
+all of them mandatory. This means that if you want to plot single-member data
+or data from a single starting date, the corresponding dimensions still have
+to be present in the provided arrays; `InsertDim()` helps with that (a short
+sketch follows this subsection).
+
+The data in the provided arrays can be raw data, anomalies or any other kind
+of data, as long as it is arranged with the required dimensions.
+For each run of a model (i.e. for each group of members of a same starting
+date from one experiment) a thick curve with the ensemble mean is plotted
+together with a finer curve for each ensemble member, unless any of these are
+disabled with `ensmean` or `memb`, respectively. The area delimited by the
+minimum and maximum ensemble values at each forecast time is filled in with a
+different colour for each starting date, unless disabled with `fill`.
+A fine black curve is plotted with the observational data alongside each
+starting date.
+A separate new plot is created for each experimental dataset provided in the
+experimental data array.
+
+The starting dates the provided data corresponds to must be given via the
+parameter `sdates` so that they can be placed accordingly along the x axis.
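+
+A one-line sketch of the `InsertDim()` trick mentioned above (our example:
+re-adding a member dimension of length 1 at the 2nd position so that an
+ensemble-averaged array recovers the 4 dimensions `PlotAno()` expects):
+```r
+ano_ens_mean <- MeanDims(ano$exp, 2)              # c(dataset, sdate, ftime)
+ano_ens_mean_4d <- InsertDim(ano_ens_mean, 2, 1)  # c(dataset, member, sdate, ftime)
+```
+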
+
+`linezero` draws a line at y = 0 to help perceive whether the anomaly is
+positive or negative, `vlines` draws vertical lines at any specified abscissae
+and `points` will draw all the curves in pointed style.
+
+```r
+selected_sdates <- gsub('1101', '1201', sdates)
+PlotAno(data$mod, data$obs, selected_sdates,
+        toptitle = paste0(c("Experiment A", "Experiment B"),
+                          ": Raw 'tas' over North Pacific"),
+        ytitle = c("K", "K"),
+        fileout = paste0("vis_raw_exp", c("A", "B"), "_obsX.png"))
+```
+
+### Plotting statistics and scores
+
+The functions `PlotVsLTime()`, `Plot2VarsVsLTime()` and `PlotACC()` serve to
+plot variables obtained with the [**Statistics**](statistics.md) or
+[**Verification**](verification.md) modules: forecast time series of indices
+or scores that usually come together with confidence intervals and
+significance levels.
+
+`PlotVsLTime()` takes as main input an array of data with the following
+dimensions:
+```r
+  c(n. of experiments, n. of observational datasets, 3 or 4, n. of forecast times)
+```
+the second one being optional and the third one corresponding to the lower
+limit of a confidence interval, the measurement or index or score, the upper
+limit of a confidence interval and, optionally, a significance level. This
+dimension format requires that the index or score to be plotted is computed
+along the actual time (starting dates) and over the average of the ensemble
+members, either at a single grid point or from an area average.
+
+A curve is drawn for each pair of (experiment, observation) with a different
+colour and line style. The scores or indices that correspond to a same
+experiment are coloured equally but a different line style is used for each
+observation that experiment has been compared to when computing the index or
+score. The confidence intervals are plotted with a finer line of the same
+colour and line style, unless disabled with `show_conf`. To plot the
+significance level instead, the parameter `siglev` can be set to `TRUE`.
+
+As in `PlotClim()`, each curve is defined along the forecast times and is
+assumed to start in January. Otherwise it can be adjusted via `monini`.
+Additionally the number of ticks along the x axis can be adjusted with
+`nticks` and any number of horizontal lines can be drawn by specifying the
+target ordinates to `hlines`.
+
+`PlotVsLTime()` can be useful in the following situations:
+ - To plot `Trend()` or `Regression()` fitted coefficients of ensemble averages
+of data at a single grid cell or averaged over a region:
+
+```r
+trend_exp <- Trend(MeanDims(ano$exp, 2))
+trend_obs <- Trend(MeanDims(ano$obs, 2))
+PlotVsLTime(trend_exp$trend,
+            toptitle = "Slopes of trends of 'tas' over North Pacific",
+            ytitle = "K/year",
+            monini = 12, freq = 1,
+            listexp = c('Experiment A', 'Experiment B'),
+            fileout = 'vis_trend_expA_expB.png')
+```
+```r
+ano_expA_Y <- Regression(MeanDims(plyr::take(ano$exp, 1, 1), 2),
+                         MeanDims(plyr::take(ano$exp, 1, 2), 2))
+PlotVsLTime(ano_expA_Y$regression,
+            toptitle = "Regr. slopes of exp. A with exp. B, N. Pacific",
Pacific", + ytitle = "K/year", + monini = 12, freq = 1, leg = FALSE, + fileout = 'vis_regression_expA_expB.png') +``` + + + - To plot the `Spread()` across ensemble members and starting dates of area +averaged data (interquartile range, maximum minus minimum, standard deviation +or median absolute deviation): + +```r +spread <- Spread(ano$exp, posdim = c(2, 3)) +PlotVsLTime(spread$iqr, + toptitle = "IQR across members and starting dates over North Pacific", + ytitle = "K", sizetit = 0.7, + monini = 12, freq = 1, + listexp = c('Experiment A', 'Experiment B'), + fileout = 'vis_iqr_expA_expB.png') +PlotVsLTime(spread$maxmin, + toptitle = "Max.-min. across members and s. dates over North Pacific", + ytitle = "K", sizetit = 0.7, + monini = 12, freq = 1, + listexp = c('Experiment A', 'Experiment B'), + fileout = 'vis_maxmin_expA_expB.png') +PlotVsLTime(spread$sd, + toptitle = "S. deviation across members and s. dates over North Pacific", + ytitle = "K", sizetit = 0.7, + monini = 12, freq = 1, + listexp = c('Experiment A', 'Experiment B'), + fileout = 'vis_sd_expA_expB.png') +PlotVsLTime(spread$iqr, + toptitle = "MAD across members and starting dates over North Pacific", + ytitle = "K", sizetit = 0.7, + monini = 12, freq = 1, + listexp = c('Experiment A', 'Experiment B'), + fileout = 'vis_mad_expA_expB.png') +``` + + + + + - To plot the correlation (`Corr()`) and RMSE (`RMS()`) between experiments +(averaged across ensemble members) and observations: + +```r +corr <- Corr(MeanDims(ano$exp, 2), MeanDims(ano$obs, 2)) +PlotVsLTime(corr, + toptitle = "Time correlation with observation X, over North Pacific", + ytitle = "K", sizetit = 0.7, + monini = 12, freq = 1, + listexp = c('Experiment A', 'Experiment B'), + fileout = 'vis_corr_expA_expB_obsX.png') +``` +```r +rms <- RMS(MeanDims(ano$exp, 2), MeanDims(ano$obs, 2)) +PlotVsLTime(rms, + toptitle = "RMSE with observation X, over North Pacific", + ytitle = "K", sizetit = 0.7, + monini = 12, freq = 1, + listexp = c('Experiment A', 'Experiment B'), + fileout = 'vis_rms_expA_expB_obsX.png') +``` + + + - To plot the ratio between the RMSE of the ensemble mean of two experiments +with a same observation at a single grid point or area averaged: + +```r +ratio_rms <- RatioRMS(MeanDims(ano$exp[1, , , ], 1), + MeanDims(ano$exp[2, , , ], 1), + ano$obs[1, 1, , ]) +ratio_rms2 <- array(dim = c(1, 4, dim(ratio_rms)[2])) +ratio_rms2[1, c(2, 4), ] <- ratio_rms +PlotVsLTime(ratio_rms2, + toptitle = "RMSE exp. A / RMSE exp. B (against obs. X), over N. Pacific", + ytitle = "K", sizetit = 0.7, + monini = 12, freq = 1, siglev = TRUE, leg = FALSE, + fileout = 'vis_ratiorms_expA_expB_obsX.png') +``` + + - To plot the ratio between the ensemble spread of the experiments and their +RMSE against the observations (`RatioSDRMS()`) at a single grid point or area +averaged: + +```r +ratio_sdrms <- RatioSDRMS(ano$exp, ano$obs) +ratio_sdrms2 <- array(dim = c(dim(ratio_sdrms)[1:2], 4, dim(ratio_sdrms)[4])) +ratio_sdrms2[, , c(2, 4), ] <- ratio_sdrms +PlotVsLTime(ratio_sdrms2, + toptitle = "S. dev. over members and s. 
+            ytitle = "K", sizetit = 0.7,
+            monini = 12, freq = 1, siglev = TRUE,
+            listexp = c('Experiment A', 'Experiment B'),
+            fileout = 'vis_ratiosdrms_expA_expB_obsX.png')
+```
+
+In this example, the ratio SD / RMS is calculated for experiment A only
+but against two observational datasets:
+
+```r
+ratio_sdrms <- RatioSDRMS(plyr::take(ano$exp, 1, 1),
+                          abind::abind(ano$obs,
+                                       ano$obs + rnorm(length(ano$obs), 0, 0.1),
+                                       along = 1))
+ratio_sdrms2 <- array(dim = c(dim(ratio_sdrms)[1:2], 4, dim(ratio_sdrms)[4]))
+ratio_sdrms2[, , c(2, 4), ] <- ratio_sdrms
+PlotVsLTime(ratio_sdrms2,
+            toptitle = "S. dev. over members and s. dates / RMSE, over North Pacific",
+            ytitle = "K", sizetit = 0.7,
+            monini = 12, freq = 1, siglev = TRUE,
+            listexp = c('Experiment A'),
+            listobs = c('Observation X', 'Observation X + rnorm(n, 0, 0.1)'),
+            fileout = 'vis_ratiosdrms_expA_obsX_obsXrnorm.png')
+```
+
+ - To plot `RMSSS()` of the ensemble mean at a single grid point or area
+averaged:
+
+```r
+rmsss <- RMSSS(MeanDims(ano$exp, 2), MeanDims(ano$obs, 2))
+rmsss2 <- array(dim = c(dim(rmsss)[1:2], 4, dim(rmsss)[4]))
+rmsss2[, , c(2, 4), ] <- rmsss
+PlotVsLTime(rmsss2,
+            toptitle = "RMSSS with observation X, over North Pacific",
+            ytitle = "K", sizetit = 0.7,
+            monini = 12, freq = 1, siglev = TRUE,
+            listexp = c('Experiment A', 'Experiment B'),
+            fileout = 'vis_rmsss_expA_expB_obsX.png')
+```
+
+ - To plot the effective number of independent data (`Eno()`):
+
+```r
+eno <- Eno(MeanDims(ano$exp, 2), posdim = 2)
+eno2 <- array(dim = c(dim(eno)[1], 4, dim(eno)[2]))
+eno2[, 2, ] <- eno
+PlotVsLTime(eno2,
+            toptitle = "Effective n. of independent data, over North Pacific",
+            sizetit = 0.7,
+            monini = 12, freq = 1, siglev = FALSE, show_conf = FALSE,
+            listexp = c('Experiment A', 'Experiment B'),
+            fileout = 'vis_eno_expA_expB.png')
+```
+
+`Plot2VarsVsLTime()` allows plotting two indices or scores at a time on the
+same plot, each with its confidence intervals. It accepts as inputs arrays of
+only 3 dimensions:
+```r
+  c(n. of experiments, 3/4, n. of forecast times)
+```
+i.e., it accepts only indices or scores from a set of experimental datasets
+against a single observation.
+Its options are identical to the ones of `PlotVsLTime()` except that the
+parameter for specifying legends for observations (`listobs`) is replaced by
+`listvars`, a parameter to legend the plotted variables.
+```r
+Plot2VarsVsLTime(corr[, 1, 1:3, ], rms[, 1, , ],
+                 toptitle = "Time correlation and RMSE with obs. X, over North Pacific",
+                 ytitle = "K", sizetit = 0.7,
+                 monini = 12, freq = 1,
+                 listexp = c('Experiment A', 'Experiment B'),
+                 listvars = c('Corr', 'RMSE'),
+                 fileout = 'vis_corr_rms_expA_expB_obsX.png')
+```
+
+`PlotACC()`, in contrast to `PlotVsLTime()`, accepts an additional dimension
+for the starting dates in the input, and the dimension of the confidence
+intervals is positioned at the end in place of the latitudes and longitudes,
+as provided by `ACC()`:
+```r
+  c(n. of experiments, n. of observational datasets, n. of starting dates, n. of forecast times, 4)
+```
+The ACCs and confidence intervals of all forecast times and starting dates of
+all experiments are plotted. In contrast to `PlotAno()`, the consecutive
+starting dates are overlapped so as to make it easy to compare their behaviour
+overall.
+
+All the starting dates of an experiment are painted with the same colour.
+There are two possible styles, switchable via the parameter `points`: + - Drawing curves along forecast times, as in `PlotVsLTime()`. In that case +`fill` can enable filling the area limited by the confidence intervals. + - Drawing a point for each forecast time, with vertical lines joined to the +extremes of the confidence interval, limited with notches (default). +Besides, `linezero` draws a line at ordinate 0 and `vlines` draws vertical +lines at any specified set of abscissae. + +```r +map_data <- Load('tas', list(expA, expB), list(obsX), + sdates = sdates, + leadtimemin = 2, leadtimemax = 13, + latmin = -30, latmax = 60, + lonmin = -90, lonmax = 20, + output = 'lonlat', grid = 't106grid', + method = 'distance-weighted') +ano <- Ano_CrossValid(map_data$mod, map_data$obs, memb = FALSE) +acc <- ACC(ano$exp, ano$obs) +PlotACC(acc$ACC, selected_sdates, + toptitle = "Spatial anomaly corr. coeff. with obs X over Atlantic", + ytitle = "K", sizetit = 0.7, freq = 1, + legends = c('Experiment A', 'Experiment B'), + fileout = 'vis_acc_expA_expB_obsX.png') +``` + + +`PlotBoxWhisker()` + +PlotTimeSeries +------------- +**Note:** PlotTimeSeries is currently still under development, but a beta-version can be downloaded from the s2dverification gitlab page, from the develop-PlotTimeSeries branch. +**Note_29122022:** PlotTimeSeries development is paused now. It doesn't exist in s2dv. + +The function, `PlotTimeSeries`, is an agglomerate of the time series plotting functions from previous versions of `s2dverification`, with enhanced functionality. The function is still evolving, to ensure that it is compatible with the common R data structure which is currently being discussed within the QA4Seas project. + +In brief, the `PlotTimeSeries` function takes common data model (CDM) objects or arrays of any number of (named) dimensions and creates scatter plots or draws curves along them, as well as plotting their means and max/mins. The inputted data can include metadata that is automatically incorporated into the plot, such as the name of the plotted variable, the start dates of a set of provided seasonal forecasts, the date at each forecast time step or the names of the members. The user can manually specify which dimensions to colour or style along, but the function is designed to anticipate the user's requirements. The advantages of the new function are its increased ease-of-use and versatility, as well as the improved quality of the final plots due to the use of the `ggplot2` package. + +### Plotting anomalies and climatologies + +`PlotTimeSeries` takes CDM objects or arrays as input, with the following dimensions: + +```r +c(dataset, member, start dates, forecast times). +``` +Objects with different numbers of dimensions, or different orderings can also be handled by the function, by appropriate specification of the `curves_along`, `mean_along`, and `colour_along` parameters. When a CDM is inputted, the default setting is for the function to calculate the multi-member means for each dataset and start date and plot them in different colours, and shade between the min/max. + +```r +PlotTimeSeries(ano_exp) + PlotTimeSeries(ano_obs, add = T) +``` + + + +-------------------- + +The `PlotTimeSeries` function has read the x- and y- axis labels, the title and the legend from the metadata automatically. By default, a horizontal line is plotted along `y = 0`, and this line can be shifted or removed with `intercept`. 
Layers can be added by selecting `add = TRUE`, as in the above example, where
+the dataset of observations has been added to the plot. The user can plot the
+geometric objects (the mean, confidence intervals, curves etc.) along any of
+the dimensions, as well as adding points, changing the linestyles and removing
+any of the objects. For example, the curves for the individual members can be
+replaced with points, with different shapes for the different members, and the
+shading between the minimum and maximum can be removed as follows:
+
+```r
+PlotTimeSeries(ano_exp, minmax_along = NA, points = T, shape_along = 2, curves_along = NA)
+```
+
+### Plotting scores and sample statistics
+
+If the input to the function is a CDM object containing sample statistics, e.g. the correlation, then a plot of these statistics and their associated confidence intervals (if available) will automatically be plotted as error bars. Alternatively, the user can specify the confidence intervals to be plotted as dashed lines (and the sample statistic as a solid line) with `interval_type`.
+
+```r
+PlotTimeSeries(Corr)
+PlotTimeSeries(Corr, interval_type = "line")
+```
+
+### Conclusions
+
+The use of the R package `ggplot2` in `PlotTimeSeries` results in high quality time series plots, as well as simplifying the creation of sophisticated graphics. Another benefit of the `ggplot2` package is that the user can easily modify the output of `PlotTimeSeries` to configure the plot title, axis labels and legends.
+
+The function is being refined to accommodate a wide range of user requirements. This work is being conducted alongside changes currently being made to the `s2dverification` and `downscaleR` packages to ensure a common R data structure.
+
+
+Plotting maps
+-------------
+
+This group of functions allows plotting gridded data (i.e. defined over
+latitudes and longitudes) on a rectangular equidistant projection or on a
+stereographic projection (as of s2dverification v2.5.0), as well as depth
+sections (i.e. defined over latitudes/longitudes and depth levels).
+
+Regarding the functions to plot maps, by default each grid point is drawn on a
+world map with a colour as a function of the magnitude of the provided field,
+index or score at that point. While `PlotEquiMap()` and `PlotStereoMap()` plot
+data of a single forecast time, `AnimateMap()` creates an animation with the
+plotted maps of all forecast times, obtained with either of the previous
+functions.
+
+`PlotEquiMap()` and `PlotStereoMap()` share some traits:
+ - Both expect a data matrix in the parameter `var` of dimensions
+`c(n. of latitudes, n. of longitudes)`, `lat` with a vector with the latitudes
+of the centers of the grid cells and `lon`, a vector with the longitudes.
+`Load()` already provides these in the correct format, through the components
+`$lon` and `$lat`.
+ - The N colours to paint the grid cells with can be specified via `cols`,
+together with the N + 1 threshold magnitudes (via `brks`) that assign each
+grid cell value to one of the N colours. The colour for any missing values can
+be adjusted with `colNA`.
+ - Whether to fill the map continents or only draw the coastlines can be
+adjusted with `filled.continents`.
+ - A matrix of `dots` can be plotted over the drawn map to, e.g., highlight
+cells where a skill has been greatest or most significant (a short sketch
+follows this list).
+ - A legend can be drawn (unless disabled with `drawleg`) and the amount of
+ticks on that legend can be adjusted with `subsampleg`.
+ - They accept any additional parameters via the parameter `...`, to be sent
+to the underlying R graphics `image()` function for fine tuning.
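+
+A sketch of the `dots` overlay (our example: the highlighted cells are simply
+the top decile of the plotted field, standing in for e.g. a significance
+mask):
+```r
+field <- MeanDims(map_data$mod, 2)[1, 1, 1, , ]  # dims c(lat, lon)
+PlotEquiMap(field, map_data$lon, map_data$lat,
+            dots = field > quantile(field, 0.9, na.rm = TRUE),
+            units = "K")
+```
+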
+
+### PlotEquiMap()
+
+Next some examples of maps obtained with `PlotEquiMap()`:
+```r
+PlotEquiMap(MeanDims(map_data$mod, 2)[1, 1, 1, , ], map_data$lon, map_data$lat,
+            toptitle = "Exp. A 'tas', s. date: 1990-11-01, f. time: 1 month",
+            units = "K", fileout = 'vis_equimap_raw_expA.png')
+
+PlotEquiMap(MeanDims(map_data$obs, 2)[1, 1, 1, , ], map_data$lon, map_data$lat,
+            toptitle = "Obs. X: 'tas', 1990-12-01",
+            units = "K", filled.continents = FALSE, fileout = 'vis_equimap_raw_obsX.png')
+```
+
+```r
+PlotEquiMap(MeanDims(map_data$mod, 2)[1, 1, 1, , ], map_data$lon, map_data$lat,
+            toptitle = "Exp. A 'tas', s. date: 1990-11-01, f. time: 1 month",
+            units = "K", brks = 100, bar_limits = c(255, 300),
+            filled.continents = 'black',
+            fileout = 'vis_equimap_cols_raw_expA.png')
+
+PlotEquiMap(MeanDims(map_data$obs, 2)[1, 1, 1, , ], map_data$lon, map_data$lat,
+            toptitle = "Obs. X 'tas', 1990-12-01",
+            units = "K", brks = 100, bar_limits = c(255, 300),
+            filled.continents = FALSE,
+            fileout = 'vis_equimap_cols_raw_obsX.png')
+```
+
+Or, as seen in the example from [**Snippet 2**](snippets.md#snippet2):
+
+`PlotEquiMap()` has some other additional features:
+ - Drawing contour lines: setting `square = FALSE` smooths the borders of each
+coloured region and draws contour lines and figures. The contours can be
+adjusted by providing a matrix to `contours` with dimensions
+`c(n. of longitudes, n. of latitudes)`. The contours of the data in that
+matrix will be drawn over the map, using the thresholds provided in `brks2`.
+This is useful to plot the contours of the original data without border
+smoothing, to plot only some of the default contours or to plot other contours
+defined by another field.
+
+```r
+PlotEquiMap(MeanDims(map_data$mod, 2)[1, 1, 1, , ], map_data$lon, map_data$lat,
+            toptitle = "Exp. A 'tas', s. date: 1990-11-01, f. time: 1 month",
+            units = "K", brks = 100, bar_limits = c(255, 300), square = FALSE,
+            fileout = 'vis_equimap_contour_raw_expA.png')
+
+PlotEquiMap(MeanDims(map_data$obs, 2)[1, 1, 1, , ], map_data$lon, map_data$lat,
+            toptitle = "Obs. X 'tas', 1990-12-01",
+            units = "K", brks = 100, bar_limits = c(255, 300),
+            square = FALSE, contour_lty = 2,
+            fileout = 'vis_equimap_contour_raw_obsX.png')
+```
+
+ - Drawing boxes on the map: `boxlim`, `boxcol` and `boxlwd` allow specifying
+the positions of the corners, the colour and the thickness of one or more
+boxes to be drawn on the map.
+
+```r
+PlotEquiMap(MeanDims(map_data$obs, 2)[1, 1, 1, , ], map_data$lon, map_data$lat,
+            toptitle = "Obs. X 'tas', 1990-12-01",
+            units = "K", brks = 100, bar_limits = c(255, 300),
+            boxlim = list(c(-62, -22, -23, 15), c(-30, 10, 0, 32)),
+            boxcol = c('purple', 'blue'), boxlwd = c(8, 4),
+            coast_color = 'black',
+            fileout = 'vis_equimap_box_expA.png')
+```
+
+ - Ticks on the longitude/latitude axes can be adjusted with `axelab`, `labW`,
+`intylat` and `intxlon` (a short sketch follows this list).
+ - `numbfig` allows specifying the number of figures in the final layout in
+order to adjust some spacing parameters.
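+
+A sketch of the axis-tick controls (our choice of intervals: a latitude label
+every 20 degrees and a longitude label every 40):
+```r
+PlotEquiMap(MeanDims(map_data$mod, 2)[1, 1, 1, , ], map_data$lon, map_data$lat,
+            intylat = 20, intxlon = 40, units = "K")
+```
+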
+
+### PlotStereoMap()
+
+`PlotStereoMap()` can be called almost identically to `PlotEquiMap()`, except
+that it requires the `latlims` parameter to specify the range of latitudes to
+be plotted on the stereographic projection map.
+The only special parameter is `intlat`, which sets the interval in degrees
+between consecutive latitude circles on the plot.
+
+Next, a couple of examples:
+
+```r
+## Loading data over the entire globe
+world_data <- Load('tas', list(expA, expB), list(obsX),
+                   sdates = sdates[1],
+                   leadtimemin = 2, leadtimemax = 13,
+                   latmin = -90, latmax = 90,
+                   lonmin = 0, lonmax = 360,
+                   output = 'lonlat', grid = 't106grid',
+                   method = 'distance-weighted')
+PlotStereoMap(MeanDims(world_data$mod, 2)[1, 1, 1, , ],
+              world_data$lon, world_data$lat, c(40, 90),
+              brks = 100, bar_limits = c(240, 290),
+              toptitle = "Exp. A 'tas', s. date: 1990-11-01, f. time: 1 month",
+              title_scale = 0.5,
+              units = "K", filled.continents = 'black',
+              fileout = 'vis_stereomap_raw_expA.png')
+
+PlotStereoMap(MeanDims(world_data$obs, 2)[1, 1, 10, , ],
+              world_data$lon, world_data$lat, c(-90, -60), intlat = 10,
+              brks = 100, bar_limits = c(240, 290),
+              boxlim = list(c(0, -85, 135, -70)),
+              toptitle = "Obs. X 'tas', 1990-12-01",
+              title_scale = 0.5,
+              units = "K", fileout = 'vis_stereomap_raw_obsX.png')
+```
+
+### AnimateMap()
+
+`AnimateMap()` generates animations of maps from `PlotEquiMap()` or
+`PlotStereoMap()` and saves them in GIF files at the paths specified in the
+parameter `fileout`. Similarly to `PlotVsLTimes()`, it receives as its main
+parameter an array with the dimensions:
+```r
+  c(n. of experiments/observations, n. of observations, 3, n. of forecast times, n. of latitudes, n. of longitudes)
+```
+the first three being optional and the 3rd dimension corresponding to the
+lower confidence interval, the actual value and the upper confidence interval.
+This means it can plot maps of a field from one or multiple experimental or
+observational datasets, averaged across ensemble members and starting dates
+(climatologies), or of indices or scores computed across starting dates
+against one or more observations. The confidence intervals, if provided, are
+used to draw black dots on grid cells that reach a 95% significance level, if
+requested via `msk95lev`.
+
+The provided data is assumed to be at a monthly frequency and to start in
+January. Otherwise this can be adjusted with `monini` and `freq`.
+
+Whether to plot with an equidistant rectangular projection or a stereographic
+projection can be chosen with the parameter `equi`. The plot can be limited to
+a sub-region of the provided data via the parameters `lonmin`, `lonmax`,
+`latmin` and `latmax`.
+
+`AnimateMap()` allows for the typical adjustments of the map-plotting
+functions: specifying the palette colours and breaks (`cols` and `brks`),
+displaying or not a legend (`drawleg`, `subsampleg`), selecting a colour for
+the NA values (`colNA`) and filling or not the continents
+(`filled.continents`).
+
+Further title and axis tick adjustments can be achieved with `toptitle`,
+`sizetit`, `units`, `intlon` and `intlat`.
+
+Next, a few examples:
+
+```r
+map_clim <- Clim(map_data$mod, map_data$obs, memb = FALSE)
+
+cols <- clim.colors(50)
+
+data_min <- min(map_data$mod, map_data$obs, na.rm = TRUE)
+data_max <- max(map_data$mod, map_data$obs, na.rm = TRUE)
+brks <- round(seq(data_min, data_max, length.out = length(cols) + 1), 2)
+
+AnimateMap(Subset(map_clim$clim_exp, 'dataset', 1),
+           map_data$lon, map_data$lat, monini = 12,
+           toptitle = "Exp. A climatologies.",
+           units = "K", brks = brks, cols = cols,
+           fileout = "vis_anim_clim_expA.gif")
+```
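+
+Following the dimension layout described above, the sketch below animates a
+synthetic score with confidence intervals and requests dots on the cells that
+reach the 95% significance level via `msk95lev`. The `score` array and the
+output file name are placeholders; in practice the array would come from a
+verification function:
+
+```r
+# Array of dimensions c(n. exp, n. obs, 3, n. ftimes, n. lat, n. lon),
+# the 3rd dimension holding the lower bound, the value and the upper bound
+n_ftime <- dim(map_data$mod)['ftime']
+n_lat <- length(map_data$lat)
+n_lon <- length(map_data$lon)
+score <- array(rnorm(3 * n_ftime * n_lat * n_lon),
+               dim = c(1, 1, 3, n_ftime, n_lat, n_lon))
+# Make the bounds consistent: lower <= value <= upper
+score[, , 1, , , ] <- score[, , 2, , , ] - 0.5
+score[, , 3, , , ] <- score[, , 2, , , ] + 0.5
+
+AnimateMap(score, map_data$lon, map_data$lat,
+           monini = 12, msk95lev = TRUE,
+           toptitle = "Synthetic score with significance dots",
+           fileout = "vis_anim_synthetic_score.gif")
+```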
+
+And, as seen in [**Snippet 2**](snippets.md#snippet2), animations of the
+actual time correlations of Experiments A and B against Observation X over
+the Atlantic can be generated, with black dots on values that reach a 95%
+significance level.
+
+Maps of the entire globe and stereographic projection maps can also be
+animated:
+
+```r
+world_clim <- Clim(world_data$mod, world_data$obs, memb = FALSE)
+AnimateMap(Subset(world_clim$clim_exp, 'dataset', 1),
+           world_data$lon, world_data$lat,
+           filled.continents = FALSE, monini = 12,
+           toptitle = "Exp. A climatologies.", units = "K",
+           brks = brks, cols = cols,
+           fileout = 'vis_anim_clim_expA_world.gif')
+AnimateMap(world_clim$clim_obs,
+           world_data$lon, world_data$lat, equi = FALSE,
+           latmin = 60, latmax = 90, monini = 12,
+           toptitle = "Obs. X climatologies.",
+           units = "K", brks = brks, cols = cols,
+           fileout = "vis_anim_clim_obsX_world.gif")
+```
+
+### PlotLayout()
+
+This function makes it easy to combine plots from any R plot function into a
+layout. The input data can be provided in multiple data arrays, and a common
+colour bar can be set for the multi-panel.
+
+A common use case is to plot the ensemble members at a given time step:
+
+```r
+PlotLayout(PlotEquiMap, c('lat', 'lon'),
+           Subset(map_data$mod,
+                  list('dataset', 'sdate', 'ftime'),
+                  list(1, 1, 1)),
+           map_data$lon, map_data$lat, units = 'K',
+           toptitle = "Exp. A 'tas', s. date: 1990-11-01, f.time: 1 month",
+           titles = paste("Member", 1:dim(map_data$mod)[2]),
+           brks = 50, bar_limits = c(250, 300), coast_color = 'black',
+           fileout = "vis_layout_equimap_expA.png")
+```
+
+But really complex layouts can be achieved thanks to the great number of
+available parameters:
+
+```r
+ens_mean <- Subset(world_data$mod,
+                   list('dataset', 'sdate', 'ftime'),
+                   list(1, 1, 1))
+layout <- PlotLayout(fun = c('PlotEquiMap', 'plot', 'plot', 'PlotStereoMap'),
+                     plot_dims = c('lat', 'lon'),
+                     var = list(ens_mean, array(1:10),
+                                array(10:1), ens_mean),
+                     lon = world_data$lon, lat = world_data$lat,
+                     fill = 'black', sizetit = 0.5, axes_label_scale = 0.6,
+                     coast_color = 'yellow',
+                     titles = paste('Fig.', 1:12), toptitle = 'Multipanel',
+                     row_titles = paste('Row', 1:3), col_titles = paste('Col', 1:4),
+                     drawleg = 'E', units = 'K', units_scale = 2,
+                     bar_limits = c(275, 300), brks = 26,
+                     bar_extra_labels = 281:284,
+                     width = 12, height = 10, res = 200,
+                     fileout = 'vis_layout_complex.png')
+```
+
+### PlotSection()
+