diff --git a/README.md b/README.md index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e540516abb7c8fc62f42f8db72f9a766cc11cba3 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,21 @@ + +ESS Verification Suite +====================== + +This is the Git project for the ESS Verification Suite, which will serve as a tool for research projects and operational workflows involving subseasonal to seasonal to decadal forecast verification. + +The main developers of the tool are Victòria Agudetse (@vagudets), An-Chi Ho (@aho), Lluís Palma (@lpalma) and Núria Pérez-Zanón (@nperez). + +Resources +--------- + +Here you can access a presentation containing information relevant to the tool: +[ESS Verification Suite](https://docs.google.com/presentation/d/1R8Gcz5R_NTgcBQvXBkCPG3jY31BVPDur/edit#slide=id.p1?target=_blank) + +Branching strategy +------------------ + +Branches containing developments that are to be merged into the tool must contain "dev-" at the beginning of the name, followed by a short, meaningful description of the development in question. E.g. "dev-loading-subseasonal" for the branch containing developments related to the loading of subseasonal datasets. + +Users may have their own branches for personal use. These branches should start with "user-\", which can optionally be followed by a brief description. E.g. "user-vagudets-FOCUS". + diff --git a/modules/Calibration/Calibration.R b/modules/Calibration/Calibration.R index 2463fcbd1e22ae49883bcf290aee93d70635ede9..1478ecd5f6a4e606b771fa4b326e7d545e60d0e6 100644 --- a/modules/Calibration/Calibration.R +++ b/modules/Calibration/Calibration.R @@ -1,238 +1,112 @@ -## Code to apply any correction method: -## simple bias adjustment -## variance inflation -## quantile mapping -# -### Which parameter are required? -#if (!("obs" %in% ls()) || is.null(obs)) { -# error(logger, -# "There is no object 'obs' in the global environment or it is NULL") -# stop("EXECUTION FAILED") -#} -#if (stream == "fcst" && (!("fcst" %in% ls()) || is.null(fcst))) { -# error(logger, -# "There is no object 'fcst' in the global environment or it is NULL") -# stop("EXECUTION FAILED") -#} -#if (!("hcst" %in% ls()) || is.null(hcst)) { -# error(logger, -# "There is no object 'hcst' in the global environment or it is NULL") -# stop("EXECUTION FAILED") -#} -#if (!("method" %in% ls()) || is.null(method)) { -# warn(logger, -# "Calibration method not found and it is set as 'SBC'.") -# method <- 'SBC' -#} -#if (method %in% c('SBC', 'SimpleBiasCorrection')) { -# cal_fun <- "CSTools::Calibration" -# cal_method <- "bias" -#} else if (method %in% c("Inflation", "VarianceInflation")) { -# cal_fun <- "CSTools::Calibration" -# cal_method <- "evmos" -#} else if (method %in% c("QM", "QuantileMapping")) { -# cal_fun <- "CSTools::QuantileMapping" -#} else { -# error(logger, "Unknown calibration method definde in the recipe.") -# stop("EXECUTION FAILED") -#} -#info(logger, paste("#-------------------------- ", "\n", -# "running Calibration module", "\n", -# "it can call", cal_fun )) -#' Function that bias-adjust the S2S hindcast -# -#' this function bias adjust the hindcast data using -#' the leave-one-out method and the specified fun -# -#'@param obs A numeric array with named dimensions, -#' representing the observational data used -#' in the bias-adjustment method. -#' 'sday','syear','ensemble' are mandatory dims - -#'@param hcst A numeric array with named dimensions, -#' representing the system hindcast data used -#' in the bias-adjustment method. -#' 'sday','syear','ensemble' are mandatory dims - -#'@param fun bias adjustment function - -#'@param mm TRUE if the experiment data is conformed by -#' multiple systems (dat > 1) - -#'@param ncores An integer indicating the number of cores to -#' use for parallel computation. The default value is NULL. - -#'@param target_dims Dims needed to do the calibration -#'@param output_dims Output dims from the calib fun - -#'@return A numeric array with the bias adjusted hindcast, -#' - -CST_CALIB_METHODS <- c("bias", "evmos", "mse_min", "crps_min", "rpc-based") - -hcst_calib <- function(obs, hcst, method, mm=F, na.rm=T, - split_factor=1, ncores=32) -{ - - # Replicates observations for bias adjusting each - # system of the multi-model separately - ## TODO: Adapt code for multi-model case - if(mm){ - obs.mm <- obs - for(dat in 1:(dim(hcst)['dat'][[1]]-1)){ - obs.mm <- abind(obs2, obs, - along = which(names(dim(obs)) == 'dat')) +## TODO: Remove once Alba's fun is merged in CSTools +source("tools/tmp/CST_Calibration.R") + +## Entry params data and recipe? +calibrate_datasets <- function(data, recipe) { + # Function that calibrates the hindcast using the method stated in the + # recipe. If the forecast is not null, it calibrates it as well. + # + # data: list of s2dv_cube objects containing the hcst, obs and fcst. + # recipe: object obtained when passing the .yml recipe file to read_yaml() + hcst <- data$hcst + obs <- data$obs + fcst <- data$fcst + + # Calibration function params + method <- recipe$Analysis$Workflow$Calibration$method + mm <- recipe$Analysis$Datasets$Multimodel + ncores <- 4 + na.rm <- T + + # Replicate observation array for the multi-model case + if (mm) { + obs.mm <- obs$data + for(dat in 1:(dim(hcst$data)['dat'][[1]]-1)) { + obs.mm <- abind(obs.mm, obs$data, + along=which(names(dim(obs$data)) == 'dat')) } - names(dim(obs.mm)) <- names(dim(obs)) - obs <- obs.mm + names(dim(obs.mm)) <- names(dim(obs$data)) + obs$data <- obs.mm remove(obs.mm) } - - if (method %in% CST_CALIB_METHODS) { - # Hcst calibration - hcst <- CSTools::CST_Calibration(hcst, obs, - cal.method = method, - eval.method = "leave-one-out", - multi.model = mm, - na.fill = TRUE, - na.rm = na.rm, - apply_to = NULL, - alpha = NULL, - memb_dim = "ensemble", - sdate_dim = "syear", - ncores = ncores) - - hcst$data[!is.finite(hcst$data)] <- NA - - } else if (method %in% c("s2s4e")) { - hcst <- Apply(data=list(obs = obs$data, hcst = hcst$data), - extra_info = list(na.rm = na.rm), - target_dims = c('sday', 'syear', 'ensemble'), - output_dims = c('sday', 'syear', 'ensemble'), - na.rm = na.rm, - ncores = ncores, - fun = .cal)[[1]] - - hcst[!is.finite(hcst)] <- NA - } else { - ## TODO: Implement other calibration methods, implement logger - stop("Calibration method is not implemented in CSTools") - # error(logger, - # "Calibration method is not implemented in CSTools") - } - - # hcst$data[!is.finite(hcst$data)] <- NA - ## TODO: Is this line necessary? - # remove(hcst) - - # Merges all the ensembles from the different systems into - # one single ensemble - if(mm){ - hcst <- MergeDims(hcst, - merge_dims = c('dat', 'ensemble'), - rename_dim = 'ensemble') - hcst <- drop_na_dim(hcst, 'ensemble') - } - -# # removes dat and var dimensions if needed -# try(hcst<-Subset(hcst, -# c('var'), list(1), drop='selected')) -# try(hcst<-Subset(hcst, -# c('dat'), list(1), drop='selected')) - - return(hcst) -} - -#' Function that bias-adjust the S2S forecast -# -#' this function bias adjust the forecast data using -#' hindcast and observational data -# -#'@param obs A numeric array with named dimensions, -#' representing the observational data used -#' in the bias-adjustment method. -#' 'sday','syear','ensemble' are mandatory dims - -#'@param hcst A numeric array with named dimensions, -#' representing the system hindcast data used -#' in the bias-adjustment method. -#' 'sday','syear','ensemble' are mandatory dims - -#'@param fcst A numeric array with named dimensions, -#' representing the system forecast to be -#' bias-adjusted. -#' 'sday','syear','ensemble' are mandatory dims - -#'@param fun bias adjustment function - -#'@param mm TRUE if the experiment data is conformed by -#' multiple systems (dat > 1) - -#'@param ncores An integer indicating the number of cores to -#' use for parallel computation. The default value is NULL. - -#'@param target_dims Dims needed to do the calibration -#'@param output_dims Output dims from the calib fun - -#'@return A numeric array with the bias adjusted forecast, -#' - -## TODO: -## needs and update: look at hcst version -fcst_calib <- function(obs, hcst, fcst, fun, mm=F, - na.rm=T, split_factor=1, ncores=32, - ## TODO: establish target dims and output dims - target_dims=c('sday', 'syear', 'ensemble'), - output_dims=c('sday', 'syear', 'ensemble')) -{ - - # Replicates observations for bias adjusting each - # system of the multi-model separately - ## TODO: Adapt multi-model case - if(mm){ - obs.mm <- obs - for(dat in 1:(dim(hcst)['dat'][[1]]-1)){ - obs.mm <- abind(obs2,obs, - along=which(names(dim(obs)) == 'dat')) + if (recipe$Analysis$Variables$freq == "monthly_mean") { + + CST_CALIB_METHODS <- c("bias", "evmos", "mse_min", "crps_min", "rpc-based") + ## TODO: implement other calibration methods + ## TODO: Restructure the code? + if (!(method %in% CST_CALIB_METHODS)) { + stop("Calibration method in the recipe is not available for monthly", + " data.") + } else { + ## Alba's version of CST_Calibration (pending merge) is being used + # Calibrate the hindcast + hcst_calibrated <- CST_Calibration(hcst, obs, + cal.method = method, + eval.method = "leave-one-out", + multi.model = mm, + na.fill = TRUE, + na.rm = na.rm, + apply_to = NULL, + alpha = NULL, + memb_dim = "ensemble", + sdate_dim = "syear", + ncores = ncores) + if (!is.null(fcst)) { + # Calibrate the forecast + fcst_calibrated <- CST_Calibration(hcst, obs, fcst, + cal.method = method, + eval.method = "leave-one-out", + multi.model = mm, + na.fill = TRUE, + na.rm = na.rm, + apply_to = NULL, + alpha = NULL, + memb_dim = "ensemble", + sdate_dim = "syear", + ncores = ncores) + } else { + fcst_calibrated <- NULL + } + + } + + } else if (recipe$Analysis$Variables$freq == "daily_mean") { + # Daily data calibration using Quantile Mapping + if (!(method %in% c("qmap"))) { + stop("Calibration method in the recipe is not available at daily ", + "frequency. Only quantile mapping 'qmap' is implemented.") + } + # Calibrate the hindcast + hcst_calibrated <- CST_QuantileMapping(hcst, obs, + exp_cor = NULL, + sample_dims = c("syear", + "time", + "ensemble"), + sample_length = NULL, + method = "QUANT", + ncores = ncores, + na.rm = na.rm) + + if (!is.null(fcst)) { + # Calibrate the forecast + fcst_calibrated <- CST_QuantileMapping(hcst, obs, + exp_cor = fcst, + sample_dims = c("syear", + "time", + "ensemble"), + sample_length = NULL, + method = "QUANT", + ncores = ncores, + na.rm = na.rm) + } else { + fcst_calibrated <- NULL } - names(dim(obs.mm)) <- names(dim(obs)) - obs <- obs.mm - remove(obs.mm) } - - # Fcst Calibration - ## TODO: Return s2dv_cube - calibrated_fcst <- Apply(data=list(obs = obs$data, hcst = hcst$data, fcst = fcst$data), - extra_info = list(na.rm = na.rm), - target_dims = target_dims, - output_dims = output_dims, - na.rm = na.rm, - ncores = ncores, - fun = .fcstcal)[[1]] - - ## TODO: Restore this line - # calibrated_fcst$data[!is.finite(calibrated_fcst$data)] <- NA - calibrated_fcst[!is.finite(calibrated_fcst)] <- NA - # Merges all the ensembles from the different systems into - # one single ensemble - ## TODO: Adapt multi-model case - if(mm){ - calibrated_fcst <- MergeDims(calibrated_fcst, - merge_dims=c('dat', 'ensemble'), - rename_dim = 'ensemble') - calibrated_fcst <- drop_na_dim(calibrated_fcst, 'ensemble') - } - -# # removes dat and var dimensions if needed -# try(calibrated_fcst<-Subset(calibrated_fcst, -# c('var'),list(1),drop='selected')) -# try(calibrated_fcst<-Subset(calibrated_fcst, -# c('dat'),list(1),drop='selected')) - - return(calibrated_fcst) - - + print("##### CALIBRATION COMPLETE #####") + ## TODO: Return observations too? + ## TODO: Change naming convention? + return(list(hcst = hcst_calibrated, fcst = fcst_calibrated)) } diff --git a/modules/Calibration/Calibration_fcst3.R b/modules/Calibration/Calibration_fcst3.R deleted file mode 100644 index 3d24226be716b015f994ae4830612606206489f0..0000000000000000000000000000000000000000 --- a/modules/Calibration/Calibration_fcst3.R +++ /dev/null @@ -1,186 +0,0 @@ - -# Performs the hcst calibration for a centered time -# window of sdays. Objects must have the dimensions; -# sday, ensemble & syear. -# If sday dim = 1; no time window calibration is performed. -.cal <- function(var_obs, var_exp, na.rm=F) { - - ntime <- dim(var_exp)[which(names(dim(var_exp)) == 'syear')][] - nmembers <- dim(var_exp)[which(names(dim(var_exp)) == 'ensemble')][] - ndays <- dim(var_exp)[which(names(dim(var_exp)) == 'sday')][] - - # selects only the centered day of time window - if ( ndays == 1 ){ - fcst.index <- 1 - } else { - fcst.index <- (ndays + 1)/2 - } - - ind <- 1:ntime - - climObs <- sapply(ind, function(x) {mean( - Subset(var_obs, along='syear', indices=-x), na.rm=na.rm)}) - - climObs2 <- sapply(ind, function(x) {mean(var_obs, na.rm=na.rm)}) - - climPred <- sapply(ind, function(x) {mean( - Subset(var_exp, along='syear', indices=-x), na.rm=na.rm)}) - - climPred2 <- sapply(ind, function(x) {mean(var_exp, na.rm=na.rm)}) - - var_obs.ano <- var_obs - climObs2 - var_fcst.ano <- var_exp - climPred - var_exp.ano <- var_exp - climPred2 - - calib.hcst <- NA * Subset(var_exp, along=c('sday', 'syear'), - c(fcst.index, 0)) - # loop over the hindcast years - for (t in 1 : ntime) { - - # defining forecast,hindcast in cross-validation - fcst.ano <- Subset(var_fcst.ano, along=c('syear', 'sday'), - indices=list(t, fcst.index)) - - hcst.ano <- Subset(var_exp.ano, along=c('syear'), indices=list(-t)) - - obs.ano <- Subset(var_obs.ano, along='syear', indices=-t) - - # computes ensemble mean of the fcst & hcst - em_fcst <- mean(fcst.ano, na.rm=na.rm) - em_hcst <- Apply(hcst.ano, target_dims=c('ensemble'), fun=mean, - na.rm=na.rm)$output1 - - # Changes the correlation method in case of NAs - if (na.rm){ - corr.method <- "pairwise.complete.obs" - } else { - corr.method <- "everything" - } - corr <- cor(as.vector(em_hcst), obs.ano, use = corr.method) - sd_obs <- sd(obs.ano, na.rm=na.rm) - sd_em_hcst <- sd(as.vector(em_hcst), na.rm=na.rm) - - # Computes difference between fcst anomaly and ensemble mean - fcst_diff <- fcst.ano - em_fcst - hcst_diff <- array(numeric(), c(ndays,ntime-1,0)) - names(dim(hcst_diff)) <- names(dim(hcst.ano)) - for (n in 1 : nmembers) { - diff <- Subset(hcst.ano, along='ensemble', indices=n, - drop='selected') - em_hcst - hcst_diff <- abind(hcst_diff, diff, - along=which(names(dim(var_exp)) == 'ensemble'), - hier.names=F) - } - - sd_hcst_diff <- sd(hcst_diff, na.rm=na.rm) - - a <- abs(corr) * (sd_obs / sd_em_hcst) - b <- (sd_obs / sd_hcst_diff) * sqrt(1 - (corr ^ 2)) - # print(paste0("climObs: ", climObs[t])) # - # print(paste0("sd_obs :", sd_obs)) # - # print(paste0("sd_hcst_diff :", sd_hcst_diff)) - # print(paste0("sd_em_hcst :", sd_em_hcst)) # - # print(paste0("em_hcst :", em_hcst)) - # print(paste0("Alpha - Lluís:", a)) # - # print(paste0("Beta - Lluís:", b)) # - # print(paste0("fcst_diff: ", fcst_diff)) - # print(paste0("climPred: ", climPred)) - # print(paste0("climCross: ", climCross[t])) - # print(paste0("em_fcst :", em_fcst)) - - # Calibrates hcst sdate - calib.syear <- ((a * em_fcst) + (b * fcst_diff) + climObs[t]) - # Appends calibrated sdate to the hcst array - calib.hcst <- abind(calib.hcst, calib.syear, - along=which(names(dim(var_exp)) == 'syear'), - hier.names=F) - } - - names(dim(calib.hcst)) <- names(dim(calib.syear)) - # removes NAs added in array init - calib.hcst <- Subset(calib.hcst, along='syear', indices=-1) - - return(calib.hcst) - -} - -# Performs the fcst calibration for a centered time -# window of sdays. Objects must have the dimensions; -# sday, ensemble & syear. -# If sday dim = 1; no time window calibration is performed. -.fcstcal <- function(obs, hcst, fcst, na.rm=F) { - - ntime <- dim(hcst)['syear'] - nmembers <- dim(hcst)['ensemble'] - fcst.nmembers <- dim(fcst)['ensemble'] - ndays <- dim(hcst)['sday'] - - # selects only the centered day of time window - if ( ndays == 1 ){ - fcst.index <- 1 - } else { - fcst.index <- (ndays + 1)/2 - } - - ind <- 1:ntime - climObs <- mean(obs, na.rm=na.rm) - climPred <- mean(hcst, na.rm=na.rm) - - obs.ano <- obs - climObs - hcst.ano <- hcst - climPred - fcst.ano <- fcst - climPred - - calib.fcst <- NA * fcst - - # computes ensemble mean of the fcst & hcst - em_fcst <- mean(fcst.ano, na.rm=na.rm) - em_hcst <- Apply(hcst.ano, target_dims=c('ensemble'), fun=mean, - na.rm=na.rm)$output1 - - # Changes the correlation method in case of NAs allowed - if (na.rm){ - corr.method <- "pairwise.complete.obs" - } else { - corr.method <- "everything" - } - - corr <- cor(as.vector(em_hcst), obs.ano, use = corr.method) - sd_obs <- sd(obs.ano, na.rm=na.rm) - sd_em_hcst <- sd(as.vector(em_hcst), na.rm=na.rm) - - # Computes difference between fcst anomaly and ensemble mean - fcst_diff <- fcst.ano - em_fcst - hcst_diff <- array(numeric(), c(ndays,ntime,0)) - names(dim(hcst_diff)) <- names(dim(hcst.ano)) - for (n in 1 : nmembers) { - diff <- Subset(hcst.ano, along='ensemble', indices=n, - drop='selected') - em_hcst - hcst_diff <- abind(hcst_diff, diff, - along=which(names(dim(hcst)) == 'ensemble'), - hier.names=F) - } - - sd_hcst_diff <- sd(hcst_diff, na.rm) - # print(paste0("Sigma e:", sd_hcst_diff)) - a <- abs(corr) * (sd_obs / sd_em_hcst) - b <- (sd_obs / sd_hcst_diff) * sqrt(1 - (corr ^ 2)) - ## print(paste0("climObs: ", climObs)) - ## print(paste0("sd_obs :", sd_obs)) - # print(paste0("sd_hcst_diff :", sd_hcst_diff)) - # print(paste0("sd_em_hcst :", sd_em_hcst)) - # print(paste0("em_hcst :", em_hcst)) - ## print(paste0("Alpha - Lluís:", a)) - ## print(paste0("Beta - Lluís:", b)) - # print(paste0("climObs: ", climObs)) - # print(paste0("fcst_diff: ", fcst_diff)) - # print(paste0("em_fcst :", em_fcst)) - - # Calibrates hcst sdate - fcst.calib <- ((a * em_fcst) + (b * fcst_diff) + climObs) - # fcst_cube$data <- fcst.calib - return(fcst.calib) - # return(Subset(fcst.calib, c('sday', 'syear'), list(1,1), drop='selected')) - -} - - diff --git a/modules/Calibration/Calibration_fcst4.R b/modules/Calibration/Calibration_fcst4.R deleted file mode 100644 index 9359c6e81ff94aaf83c3ed716fc5f9c672579241..0000000000000000000000000000000000000000 --- a/modules/Calibration/Calibration_fcst4.R +++ /dev/null @@ -1,157 +0,0 @@ - -# Performs the hcst calibration for a centered time -# window of sdays. Objects must have the dimensions; -# sday, ensemble & syear. -# If sday dim = 1; no time window calibration is performed. -.cal <- function(var_obs, var_exp, na.rm=F) { - - ntime <- dim(var_exp)[which(names(dim(var_exp)) == 'syear')][] - nmembers <- dim(var_exp)[which(names(dim(var_exp)) == 'ensemble')][] - ndays <- dim(var_exp)[which(names(dim(var_exp)) == 'sday')][] - - # selects only the centered day of time window - if ( ndays == 1 ){ - fcst.index <- 1 - } else { - fcst.index <- (ndays + 1)/2 - } - - ind <- 1:ntime - - climObs <- sapply(ind, function(x) {mean(var_obs, na.rm=na.rm)}) - climPred <- sapply(ind, function(x) {mean( - Subset(var_exp, along='syear', indices=-x), na.rm=na.rm)}) - - var_obs.ano <- var_obs - climObs - var_exp.ano <- var_exp - climPred - - calib.hcst <- NA * Subset(var_exp, along=c('sday', 'syear'), - c(fcst.index, 0)) - # loop over the hindcast years - for (t in 1 : ntime) { - - # defining forecast,hindcast in cross-validation - fcst.ano <- Subset(var_exp.ano, along=c('syear', 'sday'), - indices=list(t, fcst.index)) - hcst.ano <- Subset(var_exp.ano, along=c('syear'), indices=list(-t)) - - obs.ano <- Subset(var_obs.ano, along='syear', indices=-t) - - # computes ensemble mean of the fcst & hcst - em_fcst <- mean(fcst.ano, na.rm=na.rm) - em_hcst <- Apply(hcst.ano, target_dims=c('ensemble'), fun=mean, - na.rm=na.rm)$output1 - - # Changes the correlation method in case of NAs - if (na.rm){ - corr.method <- "pairwise.complete.obs" - } else { - corr.method <- "everything" - } - corr <- cor(as.vector(em_hcst), obs.ano, use = corr.method) - sd_obs <- sd(obs.ano, na.rm=na.rm) - sd_em_hcst <- sd(as.vector(em_hcst), na.rm=na.rm) - - # Computes difference between fcst anomaly and ensemble mean - fcst_diff <- fcst.ano - em_fcst - hcst_diff <- array(numeric(), c(ndays,ntime-1,0)) - names(dim(hcst_diff)) <- names(dim(hcst.ano)) - for (n in 1 : nmembers) { - diff <- Subset(hcst.ano, along='ensemble', indices=n, - drop='selected') - em_hcst - hcst_diff <- abind(hcst_diff, diff, - along=which(names(dim(var_exp)) == 'ensemble'), - hier.names=F) - } - - sd_hcst_diff <- sd(hcst_diff, na.rm=na.rm) - - a <- abs(corr) * (sd_obs / sd_em_hcst) - b <- (sd_obs / sd_hcst_diff) * sqrt(1 - (corr ^ 2)) - - # Calibrates hcst sdate - calib.syear <- ((a * em_fcst) + (b * fcst_diff) + climObs[t]) - # Appends calibrated sdate to the hcst array - calib.hcst <- abind(calib.hcst, calib.syear, - along=which(names(dim(var_exp)) == 'syear'), - hier.names=F) - } - - names(dim(calib.hcst)) <- names(dim(calib.syear)) - # removes NAs added in array init - calib.hcst <- Subset(calib.hcst, along='syear', indices=-1) - - return(calib.hcst) - -} - -# Performs the fcst calibration for a centered time -# window of sdays. Objects must have the dimensions; -# sday, ensemble & syear. -# If sday dim = 1; no time window calibration is performed. -.fcstcal <- function(obs, hcst, fcst, na.rm=F) { - - ntime <- dim(hcst)['syear'] - nmembers <- dim(hcst)['ensemble'] - fcst.nmembers <- dim(fcst)['ensemble'] - ndays <- dim(hcst)['sday'] - - # selects only the centered day of time window - if ( ndays == 1 ){ - fcst.index <- 1 - } else { - fcst.index <- (ndays + 1)/2 - } - - ind <- 1:ntime - climObs <- mean(obs, na.rm=na.rm) - climPred <- mean(hcst, na.rm=na.rm) - - obs.ano <- obs - climObs - hcst.ano <- hcst - climPred - fcst.ano <- fcst - climPred - - calib.fcst <- NA * fcst - - # computes ensemble mean of the fcst & hcst - em_fcst <- mean(fcst.ano, na.rm=na.rm) - em_hcst <- Apply(hcst.ano, target_dims=c('ensemble'), fun=mean, - na.rm=na.rm)$output1 - - # Changes the correlation method in case of NAs allowed - if (na.rm){ - corr.method <- "pairwise.complete.obs" - } else { - corr.method <- "everything" - } - - corr <- cor(as.vector(em_hcst), obs.ano, use = corr.method) - sd_obs <- sd(obs.ano, na.rm=na.rm) - sd_em_hcst <- sd(as.vector(em_hcst), na.rm=na.rm) - - # Computes difference between fcst anomaly and ensemble mean - fcst_diff <- fcst.ano - em_fcst - hcst_diff <- array(numeric(), c(ndays,ntime,0)) - names(dim(hcst_diff)) <- names(dim(hcst.ano)) - for (n in 1 : nmembers) { - diff <- Subset(hcst.ano, along='ensemble', indices=n, - drop='selected') - em_hcst - hcst_diff <- abind(hcst_diff, diff, - along=which(names(dim(hcst)) == 'ensemble'), - hier.names=F) - } - - sd_hcst_diff <- sd(hcst_diff, na.rm) - - a <- abs(corr) * (sd_obs / sd_em_hcst) - b <- (sd_obs / sd_hcst_diff) * sqrt(1 - (corr ^ 2)) - - # Calibrates hcst sdate - fcst.calib <- ((a * em_fcst) + (b * fcst_diff) + climObs) - # fcst_cube$data <- fcst.calib - return(fcst.calib) - # return(Subset(fcst.calib, c('sday', 'syear'), list(1,1), drop='selected')) - -} - - diff --git a/modules/Calibration/test_calib.R b/modules/Calibration/test_calib.R deleted file mode 100644 index 8c6600f3e60bf4986c56597265531030cd4e8b16..0000000000000000000000000000000000000000 --- a/modules/Calibration/test_calib.R +++ /dev/null @@ -1,112 +0,0 @@ - -## TODO: Remove once Alba's fun is merged in CSTools -source("https://earth.bsc.es/gitlab/external/cstools/-/raw/develop-CalibrationForecast/R/CST_Calibration.R") - -## Entry params data and recipe? -calibrate_datasets <- function(data, recipe) { - # Function that calibrates the hindcast using the method stated in the - # recipe. If the forecast is not null, it calibrates it as well. - # - # data: list of s2dv_cube objects containing the hcst, obs and fcst. - # recipe: object obtained when passing the .yml recipe file to read_yaml() - hcst <- data$hcst - obs <- data$obs - fcst <- data$fcst - - # Calibration function params - method <- recipe$Analysis$Workflow$Calibration$method - mm <- recipe$Analysis$Datasets$Multimodel - ncores <- 4 - na.rm <- T - - # Replicate observation array for the multi-model case - if (mm) { - obs.mm <- obs$data - for(dat in 1:(dim(hcst$data)['dat'][[1]]-1)) { - obs.mm <- abind(obs.mm, obs$data, - along=which(names(dim(obs$data)) == 'dat')) - } - names(dim(obs.mm)) <- names(dim(obs$data)) - obs$data <- obs.mm - remove(obs.mm) - } - - if (recipe$Analysis$Variables$freq == "monthly_mean") { - - CST_CALIB_METHODS <- c("bias", "evmos", "mse_min", "crps_min", "rpc-based") - ## TODO: implement other calibration methods - ## TODO: Restructure the code? - if (!(method %in% CST_CALIB_METHODS)) { - stop("Calibration method in the recipe is not available for monthly", - " data.") - } else { - ## Alba's version of CST_Calibration (pending merge) is being used - # Calibrate the hindcast - hcst_calibrated <- CST_Calibration(hcst, obs, - cal.method = method, - eval.method = "leave-one-out", - multi.model = mm, - na.fill = TRUE, - na.rm = na.rm, - apply_to = NULL, - alpha = NULL, - memb_dim = "ensemble", - sdate_dim = "syear", - ncores = ncores) - if (!is.null(fcst)) { - # Calibrate the forecast - fcst_calibrated <- CST_Calibration(hcst, obs, fcst, - cal.method = method, - eval.method = "leave-one-out", - multi.model = mm, - na.fill = TRUE, - na.rm = na.rm, - apply_to = NULL, - alpha = NULL, - memb_dim = "ensemble", - sdate_dim = "syear", - ncores = ncores) - } else { - fcst_calibrated <- NULL - } - - } - - } else if (recipe$Analysis$Variables$freq == "daily_mean") { - # Daily data calibration using Quantile Mapping - if (!(method %in% c("qmap"))) { - stop("Calibration method in the recipe is not available at daily ", - "frequency. Only quantile mapping 'qmap' is implemented.") - } - # Calibrate the hindcast - hcst_calibrated <- CST_QuantileMapping(hcst, obs, - exp_cor = NULL, - sample_dims = c("syear", - "time", - "ensemble"), - sample_length = NULL, - method = "QUANT", - ncores = ncores, - na.rm = na.rm) - - if (!is.null(fcst)) { - # Calibrate the forecast - fcst_calibrated <- CST_QuantileMapping(hcst, obs, - exp_cor = fcst, - sample_dims = c("syear", - "time", - "ensemble"), - sample_length = NULL, - method = "QUANT", - ncores = ncores, - na.rm = na.rm) - } else { - fcst_calibrated <- NULL - } - } - - print("##### CALIBRATION COMPLETE #####") - ## TODO: Return observations too? - ## TODO: Change naming convention? - return(list(hcst = hcst_calibrated, fcst = fcst_calibrated)) -} diff --git a/modules/Saving/Save_data.R b/modules/Saving/Save_data.R index ad6906f1eaf90228518bde5f80eaf4e15f32c57d..3ace35b85eff85e8dc01b3c722e5fb61c3834244 100644 --- a/modules/Saving/Save_data.R +++ b/modules/Saving/Save_data.R @@ -1,21 +1,19 @@ -library(easyNCDF) +## TODO: Change names of functions to make sure they don't clash +## with others in the tool - -get_times <- function(fcst.type, leadtimes, sdate) { +get_times <- function(fcst.horizon, leadtimes, sdate) { ## TODO: Adapt to Auto-S2S recipe format - ## TODO: Is all of this necessary? Is some of it already computed by - ## dates2load? - # fcst.type <- recipe$Analysis$Horizon - switch(tolower(fcst.type), + switch(fcst.horizon, ## TODO: Remove "sub_obs"? + ## TODO: daily case? "seasonal" = {len <- length(leadtimes); ref <- 'months since '; stdname <- paste(strtoi(leadtimes), collapse=", ")}, - "sub_obs" = {len <- 52; ref <- 'week of the year '; + "sub_obs" = {len <- 52; ref <- 'week of the year '; stdname <- paste(strtoi(leadtimes), collapse=", ")}, "subseasonal" = {len <- 4; ref <- 'weeks since '; stdname <- ''} - ) + ) time <- 1:len dim(time) <- length(time) @@ -42,34 +40,53 @@ get_times <- function(fcst.type, leadtimes, sdate) { } -get_latlon <- function(lat, lon) { - - longitude <- lon - dim(longitude) <- length(longitude) - metadata <- list(longitude = list(units = 'degrees_east')) - attr(longitude, 'variables') <- metadata - names(dim(longitude)) <- 'longitude' - - latitude <- lat - dim(latitude) <- length(latitude) - metadata <- list(latitude = list(units = 'degrees_north')) - attr(latitude, 'variables') <- metadata - names(dim(latitude)) <- 'latitude' +get_latlon <- function(latitude, longitude) { + ## TODO: Verify if this is needed or if it needs a change now that we are + ## using s2dv_cube objects + # Adds dimensions and metadata to lat and lon + # latitude: array containing the latitude values + # longitude: array containing the longitude values + ## the inputs are simply the arrays with the values? + dim(longitude) <- length(longitude) + ## TODO: Extract metadata from s2dv_cube + metadata <- list(longitude = list(units = 'degrees_east')) + # Add metadata to attributes + attr(longitude, 'variables') <- metadata + # Add longitude dimension + names(dim(longitude)) <- 'longitude' + + dim(latitude) <- length(latitude) + ## TODO: Extract metadata from s2dv_cube + metadata <- list(latitude = list(units = 'degrees_north')) + # Add metadata to attributes + attr(latitude, 'variables') <- metadata + # Add latitude dimension + names(dim(latitude)) <- 'latitude' - return(list(lat=latitude, lon=longitude)) + return(list(lat=latitude, lon=longitude)) } -save_metrics <- function(variable, - skill, - fcst.sdate, +## TODO: Place inside a function somewhere +# if (tolower(agg) == "country"){ +# load(mask.path) +# grid <- europe.countries.iso +# } else { +# grid <- list(lon=attr(var.obs, 'Variables')$dat1$longitude, +# lat=attr(var.obs, 'Variables')$dat1$latitude) +# } + +save_metrics <- function(skill, + recipe, grid, outfile, - monthnames, - fcst.type, - agg) { + leadtimes, + agg="global") { + + fcst.horizon <- tolower(recipe$Analysis$Horizon) + fcst.sdate <- paste0(recipe$Analysis$Time$sdate$fcst_syear, + recipe$Analysis$Time$sdate$fcst_sday) - ## TODO: Remove variable as entry parameter lalo <- c('longitude', 'latitude') @@ -95,21 +112,19 @@ save_metrics <- function(variable, names(dim(skill[[i]])) <- dims } - times <- get_times(fcst.type, monthnames, fcst.sdate) + times <- get_times(fcst.horizon, leadtimes, fcst.sdate) time <- times$time time_step <- times$time_step - if (tolower(agg) == "country") { - - country <- get_countries(grid) - ArrayToNc(append(country, time, skill, time_step), outfile) + # if (tolower(agg) == "country") { + # country <- get_countries(grid) + # ArrayToNc(append(country, time, skill, time_step), outfile) + # } else { - } else { + latlon <- get_latlon(grid$lat, grid$lon) + vars <- list(latlon$lat, latlon$lon, time) + vars <- c(vars, skill, list(time_step)) + ArrayToNc(vars, outfile) - latlon <- get_latlon(grid$lat, grid$lon) - vars <- list(latlon$lat, latlon$lon, time) - vars <- c(vars, skill, list(time_step)) - ArrayToNc(vars, outfile) - } } diff --git a/modules/Skill/Skill.R b/modules/Skill/Skill.R index f641fa6a57edc3281f5fd256736f07f1b996c32a..caf4939d117dc89a313deaed2c15b81f96ac2dec 100644 --- a/modules/Skill/Skill.R +++ b/modules/Skill/Skill.R @@ -9,9 +9,9 @@ source("modules/Skill/s2s.metrics.R") ## TODO: Remove once the new version of s2dv is released -source("https://earth.bsc.es/gitlab/es/s2dv/-/raw/master/R/RandomWalkTest.R") -source("https://earth.bsc.es/gitlab/es/s2dv/-/raw/develop-RPSS/R/RPS.R") -source("https://earth.bsc.es/gitlab/es/s2dv/-/raw/develop-RPSS/R/RPSS.R") +source("tools/tmp/RandomWalkTest.R") +source("tools/tmp/RPS.R") +source("tools/tmp/RPSS.R") ## TODO: Implement this in the future ## Which parameter are required? diff --git a/modules/data_load/load.R b/modules/data_load/load.R index 570788eedf404ebe2c680e8aad003ffd3c113a36..aaadf5532f58dd530e2144745c497891fbedcbdb 100755 --- a/modules/data_load/load.R +++ b/modules/data_load/load.R @@ -1,6 +1,5 @@ ## TODO: remove paths to personal scratchs source("/esarchive/scratch/vagudets/repos/csoperational/R/get_regrid_params.R") -source("/esarchive/scratch/vagudets/repos/cstools/R/s2dv_cube.R") # Load required libraries/funs source("modules/data_load/dates2load.R") source("tools/libs.R") @@ -278,31 +277,10 @@ load_datasets <- function(recipe_file) { obs <- as.s2dv_cube(obs) # Print a summary of the loaded data for the user, for each object - # (hcst, obs and fcst). - ## TODO: Incorporate into logger - ## TODO: Adapt to daily/subseasonal cases - ## TODO: Add check for missing files/NAs - data_summary <- function(object) { - # Get name and start dates - object_name <- deparse(substitute(object)) - sdate_min <- format(min(as.Date(object$Dates[[1]])), format = '%b %Y') - sdate_max <- format(max(as.Date(object$Dates[[1]])), format = '%b %Y') - - print("DATA SUMMARY:") - print(paste0(object_name, " range: ", sdate_min, " to ", sdate_max)) - print(paste0(object_name, " dimensions: ")) - print(dim(object$data)) - print(paste0("Statistical summary of the data in ", object_name, ":")) - print(summary(object$data)) - print("---------------------------------------------") - - } - - - data_summary(hcst) - data_summary(obs) + data_summary(hcst, store.freq) + data_summary(obs, store.freq) if (!is.null(fcst)) { - data_summary(fcst) + data_summary(fcst, store.freq) } print("##### DATA LOADING COMPLETED SUCCESSFULLY #####") diff --git a/modules/data_load/testing_recipes/recipe_circular-sort-test.yml b/modules/data_load/testing_recipes/recipe_circular-sort-test.yml new file mode 100644 index 0000000000000000000000000000000000000000..3c49f8074ffa050b94e986b454059da439a97006 --- /dev/null +++ b/modules/data_load/testing_recipes/recipe_circular-sort-test.yml @@ -0,0 +1,44 @@ +Description: + Author: V. Agudetse + Info: For testing the behavior of the loading module when loading data + that crosses the date line or the Greenwich meridian. +Analysis: + Horizon: Seasonal + Variables: + name: tas + freq: monthly_mean + Datasets: + System: + name: system7c3s + Multimodel: False + Reference: + name: era5 + Time: + sdate: + fcst_syear: + fcst_sday: '1101' + hcst_start: '1993' + hcst_end: '2003' + leadtimemin: 1 + leadtimemax: 1 + Region: + latmin: -10 + latmax: 10 + lonmin: 320 + lonmax: 350 + Regrid: + method: bilinear + type: to_system + Workflow: + Calibration: + method: mse_min + Skill: + metric: BSS90 + Indicators: + index: no + Output_format: S2S4E +Run: + Loglevel: INFO + Terminal: yes + output_dir: /esarchive/scratch/vagudets/repos/auto-s2s/out-logs/ + code_dir: /esarchive/scratch/vagudets/repos/auto-s2s/ diff --git a/modules/test_victoria.R b/modules/test_victoria.R index 58d9be6aee6503870ad78dda11c390833a4e8c74..b5e026a3f36242691e405e8c0f5b54ec30f3d976 100644 --- a/modules/test_victoria.R +++ b/modules/test_victoria.R @@ -3,7 +3,7 @@ recipe_file <- "modules/data_load/testing_recipes/recipe_1.yml" source("modules/data_load/load.R") -source("modules/Calibration/test_calib.R") +source("modules/Calibration/Calibration.R") source("modules/Skill/Skill.R") # Load datasets diff --git a/tools/data_summary.R b/tools/data_summary.R new file mode 100644 index 0000000000000000000000000000000000000000..625bbef13a77183907857519346db8cd9b10cdd8 --- /dev/null +++ b/tools/data_summary.R @@ -0,0 +1,27 @@ +# Print a summary of the loaded data for the user, for each object. +# object: hindcast, forecast or reference data in s2dv_cube format. +## TODO: Incorporate into logger +## TODO: Adapt to daily/subseasonal cases +## TODO: Add check for missing files/NAs by dimension + +data_summary <- function(object, frequency) { + # Get name and start dates + object_name <- deparse(substitute(object)) + if (tolower(frequency) == "monthly_mean") { + date_format <- '%b %Y' + } else if (tolower(frequency) == "daily_mean") { + date_format <- '%b %d %Y' + } + sdate_min <- format(min(as.Date(object$Dates[[1]])), format = date_format) + sdate_max <- format(max(as.Date(object$Dates[[1]])), format = date_format) + + print("DATA SUMMARY:") + print(paste0(object_name, " range: ", sdate_min, " to ", sdate_max)) + print(paste0(object_name, " dimensions: ")) + print(dim(object$data)) + print(paste0("Statistical summary of the data in ", object_name, ":")) + print(summary(object$data)) + print("---------------------------------------------") + +} + diff --git a/tools/libs.R b/tools/libs.R index 2470225fb4450a39e5654b6ac9f01bdf0d9af3de..317286508f6929bb3258443ebf5eec29867dbd51 100644 --- a/tools/libs.R +++ b/tools/libs.R @@ -26,5 +26,6 @@ library(CSTools) # # To be removed when new package is done by library(CSOperational) source("tools/check_recipe.R") source("tools/prepare_outputs.R") -source("tools/divide_recipe.R") +source("tools/divide_recipe.R") +source("tools/data_summary.R") # source("tools/add_dims.R") # Not sure if necessary yet diff --git a/tools/tmp/CST_Calibration.R b/tools/tmp/CST_Calibration.R new file mode 100644 index 0000000000000000000000000000000000000000..79b41951e13ed3ec2ba5f187165ef9c675b44ffa --- /dev/null +++ b/tools/tmp/CST_Calibration.R @@ -0,0 +1,563 @@ +#'Forecast Calibration +#' +#'@author Verónica Torralba, \email{veronica.torralba@bsc.es} +#'@author Bert Van Schaeybroeck, \email{bertvs@meteo.be} +#'@description Equivalent to function \code{Calibration} but for objects of class \code{s2dv_cube}. +#' +#'@param exp an object of class \code{s2dv_cube} as returned by \code{CST_Load} function, containing the seasonal hindcast experiment data in the element named \code{$data}. The hindcast is used to calibrate the forecast in case the forecast is provided; if not, the same hindcast will be calibrated instead. +#'@param obs an object of class \code{s2dv_cube} as returned by \code{CST_Load} function, containing the observed data in the element named \code{$data}. +#'@param exp_cor an optional object of class \code{s2dv_cube} as returned by \code{CST_Load} function, containing the seasonal forecast experiment data in the element named \code{$data}. If the forecast is provided, it will be calibrated using the hindcast and observations; if not, the hindcast will be calibrated instead. +#'@param cal.method is the calibration method used, can be either \code{bias}, \code{evmos}, \code{mse_min}, \code{crps_min} or \code{rpc-based}. Default value is \code{mse_min}. +#'@param eval.method is the sampling method used, can be either \code{in-sample} or \code{leave-one-out}. Default value is the \code{leave-one-out} cross validation. In case the forecast is provided, any chosen eval.method is over-ruled and a third option is used. +#'@param multi.model is a boolean that is used only for the \code{mse_min} method. If multi-model ensembles or ensembles of different sizes are used, it must be set to \code{TRUE}. By default it is \code{FALSE}. Differences between the two approaches are generally small but may become large when using small ensemble sizes. Using multi.model when the calibration method is \code{bias}, \code{evmos} or \code{crps_min} will not affect the result. +#'@param na.fill is a boolean that indicates what happens in case calibration is not possible or will yield unreliable results. This happens when three or less forecasts-observation pairs are available to perform the training phase of the calibration. By default \code{na.fill} is set to true such that NA values will be returned. If \code{na.fill} is set to false, the uncorrected data will be returned. +#'@param na.rm is a boolean that indicates whether to remove the NA values or not. The default value is \code{TRUE}. See Details section for further information about its use and compatibility with \code{na.fill}. +#'@param apply_to is a character string that indicates whether to apply the calibration to all the forecast (\code{"all"}) or only to those where the correlation between the ensemble mean and the observations is statistically significant (\code{"sign"}). Only useful if \code{cal.method == "rpc-based"}. +#'@param alpha is a numeric value indicating the significance level for the correlation test. Only useful if \code{cal.method == "rpc-based" & apply_to == "sign"}. +#'@param memb_dim is a character string indicating the name of the member dimension. By default, it is set to 'member'. +#'@param sdate_dim is a character string indicating the name of the start date dimension. By default, it is set to 'sdate'. +#'@param ncores is an integer that indicates the number of cores for parallel computations using multiApply function. The default value is one. +#'@return an object of class \code{s2dv_cube} containing the calibrated forecasts in the element \code{$data} with the same dimensions as the one in the exp object. +#' +#'@importFrom s2dv InsertDim +#'@import abind +#' +#'@seealso \code{\link{CST_Load}} +#' +#'@examples +#'# Example 1: +#'mod1 <- 1 : (1 * 3 * 4 * 5 * 6 * 7) +#'dim(mod1) <- c(dataset = 1, member = 3, sdate = 4, ftime = 5, lat = 6, lon = 7) +#'obs1 <- 1 : (1 * 1 * 4 * 5 * 6 * 7) +#'dim(obs1) <- c(dataset = 1, member = 1, sdate = 4, ftime = 5, lat = 6, lon = 7) +#'lon <- seq(0, 30, 5) +#'lat <- seq(0, 25, 5) +#'exp <- list(data = mod1, lat = lat, lon = lon) +#'obs <- list(data = obs1, lat = lat, lon = lon) +#'attr(exp, 'class') <- 's2dv_cube' +#'attr(obs, 'class') <- 's2dv_cube' +#'a <- CST_Calibration(exp = exp, obs = obs, cal.method = "mse_min", eval.method = "in-sample") +#'str(a) +#' +#'# Example 2: +#'mod1 <- 1 : (1 * 3 * 4 * 5 * 6 * 7) +#'mod2 <- 1 : (1 * 3 * 1 * 5 * 6 * 7) +#'dim(mod1) <- c(dataset = 1, member = 3, sdate = 4, ftime = 5, lat = 6, lon = 7) +#'dim(mod2) <- c(dataset = 1, member = 3, sdate = 1, ftime = 5, lat = 6, lon = 7) +#'obs1 <- 1 : (1 * 1 * 4 * 5 * 6 * 7) +#'dim(obs1) <- c(dataset = 1, member = 1, sdate = 4, ftime = 5, lat = 6, lon = 7) +#'lon <- seq(0, 30, 5) +#'lat <- seq(0, 25, 5) +#'exp <- list(data = mod1, lat = lat, lon = lon) +#'obs <- list(data = obs1, lat = lat, lon = lon) +#'exp_cor <- list(data = mod2, lat = lat, lon = lon) +#'attr(exp, 'class') <- 's2dv_cube' +#'attr(obs, 'class') <- 's2dv_cube' +#'attr(exp_cor, 'class') <- 's2dv_cube' +#'a <- CST_Calibration(exp = exp, obs = obs, exp_cor = exp_cor, cal.method = "evmos") +#'str(a) +#'@export + +CST_Calibration <- function(exp, obs, exp_cor = NULL, cal.method = "mse_min", + eval.method = "leave-one-out", multi.model = FALSE, + na.fill = TRUE, na.rm = TRUE, apply_to = NULL, alpha = NULL, + memb_dim = 'member', sdate_dim = 'sdate', ncores = 1) { + + if(!missing(multi.model) & !(cal.method == "mse_min")){ + warning(paste0("The multi.model parameter is ignored when using the calibration method ", cal.method)) + } + + if(is.null(exp_cor)){ #exp will be used to calibrate and will also be calibrated: "calibrate hindcast" + if (!inherits(exp, "s2dv_cube") || !inherits(obs, "s2dv_cube")) { + stop("Parameter 'exp' and 'obs' must be of the class 's2dv_cube', ", + "as output by CSTools::CST_Load.") + } + exp$data <- Calibration(exp = exp$data, obs = obs$data, exp_cor = NULL, + cal.method = cal.method, + eval.method = eval.method, + multi.model = multi.model, + na.fill = na.fill, na.rm = na.rm, + apply_to = apply_to, alpha = alpha, + memb_dim = memb_dim, sdate_dim = sdate_dim, + ncores = ncores) + exp$Datasets <- c(exp$Datasets, obs$Datasets) + exp$source_files <- c(exp$source_files, obs$source_files) + + return(exp) + + }else{ #if exp_cor is provided, it will be calibrated: "calibrate forecast instead of hindcast" + eval.method = "hindcast-vs-forecast" #if exp_cor is provided, eval.method is overrruled (because if exp_cor is provided, the train data will be all data of "exp" and the evalutaion data will be all data of "exp_cor"; no need for "leave-one-out" or "in-sample") + if (!inherits(exp, "s2dv_cube") || !inherits(obs, "s2dv_cube") || !inherits(exp_cor, "s2dv_cube")) { + stop("Parameter 'exp', 'obs' and 'exp_cor' must be of the class 's2dv_cube', ", + "as output by CSTools::CST_Load.") + } + exp_cor$data <- Calibration(exp = exp$data, obs = obs$data, exp_cor = exp_cor$data, + cal.method = cal.method, + eval.method = eval.method, + multi.model = multi.model, + na.fill = na.fill, na.rm = na.rm, + apply_to = apply_to, alpha = alpha, + memb_dim = memb_dim, sdate_dim = sdate_dim, + ncores = ncores) + exp_cor$Datasets <- c(exp_cor$Datasets, obs$Datasets) + exp_cor$source_files <- c(exp_cor$source_files, exp$source_files, obs$source_files) + + return(exp_cor) + + } +} + + + +#'Forecast Calibration +#' +#'@author Verónica Torralba, \email{veronica.torralba@bsc.es} +#'@author Bert Van Schaeybroeck, \email{bertvs@meteo.be} +#'@description Five types of member-by-member bias correction can be performed. The \code{"bias"} method corrects the bias only, the \code{"evmos"} method applies a variance inflation technique to ensure the correction of the bias and the correspondence of variance between forecast and observation (Van Schaeybroeck and Vannitsem, 2011). The ensemble calibration methods \code{"mse_min"} and \code{"crps_min"} correct the bias, the overall forecast variance and the ensemble spread as described in Doblas-Reyes et al. (2005) and Van Schaeybroeck and Vannitsem (2015), respectively. While the \code{"mse_min"} method minimizes a constrained mean-squared error using three parameters, the \code{"crps_min"} method features four parameters and minimizes the Continuous Ranked Probability Score (CRPS). The \code{"rpc-based"} method adjusts the forecast variance ensuring that the ratio of predictable components (RPC) is equal to one, as in Eade et al. (2014). +#'@description Both in-sample or our out-of-sample (leave-one-out cross validation) calibration are possible. +#'@references Doblas-Reyes F.J, Hagedorn R, Palmer T.N. The rationale behind the success of multi-model ensembles in seasonal forecasting-II calibration and combination. Tellus A. 2005;57:234-252. doi:10.1111/j.1600-0870.2005.00104.x +#'@references Eade, R., Smith, D., Scaife, A., Wallace, E., Dunstone, N., Hermanson, L., & Robinson, N. (2014). Do seasonal-to-decadal climate predictions underestimate the predictability of the read world? Geophysical Research Letters, 41(15), 5620-5628. doi: 10.1002/2014GL061146 +#'@references Van Schaeybroeck, B., & Vannitsem, S. (2011). Post-processing through linear regression. Nonlinear Processes in Geophysics, 18(2), 147. doi:10.5194/npg-18-147-2011 +#'@references Van Schaeybroeck, B., & Vannitsem, S. (2015). Ensemble post-processing using member-by-member approaches: theoretical aspects. Quarterly Journal of the Royal Meteorological Society, 141(688), 807-818. doi:10.1002/qj.2397 +#' +#'@param exp a multidimensional array with named dimensions (at least 'sdate' and 'member') containing the seasonal hindcast experiment data. The hindcast is used to calibrate the forecast in case the forecast is provided; if not, the same hindcast will be calibrated instead. +#'@param obs a multidimensional array with named dimensions (at least 'sdate') containing the observed data. +#'@param exp_cor an optional multidimensional array with named dimensions (at least 'sdate' and 'member') containing the seasonal forecast experiment data. If the forecast is provided, it will be calibrated using the hindcast and observations; if not, the hindcast will be calibrated instead. +#'@param cal.method is the calibration method used, can be either \code{bias}, \code{evmos}, \code{mse_min}, \code{crps_min} or \code{rpc-based}. Default value is \code{mse_min}. +#'@param eval.method is the sampling method used, can be either \code{in-sample} or \code{leave-one-out}. Default value is the \code{leave-one-out} cross validation. In case the forecast is provided, any chosen eval.method is over-ruled and a third option is used. +#'@param multi.model is a boolean that is used only for the \code{mse_min} method. If multi-model ensembles or ensembles of different sizes are used, it must be set to \code{TRUE}. By default it is \code{FALSE}. Differences between the two approaches are generally small but may become large when using small ensemble sizes. Using multi.model when the calibration method is \code{bias}, \code{evmos} or \code{crps_min} will not affect the result. +#'@param na.fill is a boolean that indicates what happens in case calibration is not possible or will yield unreliable results. This happens when three or less forecasts-observation pairs are available to perform the training phase of the calibration. By default \code{na.fill} is set to true such that NA values will be returned. If \code{na.fill} is set to false, the uncorrected data will be returned. +#'@param na.rm is a boolean that indicates whether to remove the NA values or not. The default value is \code{TRUE}. +#'@param apply_to is a character string that indicates whether to apply the calibration to all the forecast (\code{"all"}) or only to those where the correlation between the ensemble mean and the observations is statistically significant (\code{"sign"}). Only useful if \code{cal.method == "rpc-based"}. +#'@param alpha is a numeric value indicating the significance level for the correlation test. Only useful if \code{cal.method == "rpc-based" & apply_to == "sign"}. +#'@param memb_dim is a character string indicating the name of the member dimension. By default, it is set to 'member'. +#'@param sdate_dim is a character string indicating the name of the start date dimension. By default, it is set to 'sdate'. +#'@param ncores is an integer that indicates the number of cores for parallel computations using multiApply function. The default value is one. +#'@return an array containing the calibrated forecasts with the same dimensions as the \code{exp} array. +#' +#'@importFrom s2dv InsertDim MeanDims Reorder +#'@import abind +#'@import multiApply +#'@importFrom s2dverification Subset +#' +#'@seealso \code{\link{CST_Load}} +#' +#'@details +#'Both the \code{na.fill} and \code{na.rm} parameters can be used to indicate how the function has to handle the NA values. The \code{na.fill} parameter checks whether there are more than three forecast-observations pairs to perform the computation. In case there are three or less pairs, the computation is not carried out, and the value returned by the function depends on the value of this parameter (either NA if \code{na.fill == TRUE} or the uncorrected value if \code{na.fill == TRUE}). On the other hand, \code{na.rm} is used to indicate the function whether to remove the missing values during the computation of the parameters needed to perform the calibration. +#' +#'@examples +#'mod1 <- 1 : (1 * 3 * 4 * 5 * 6 * 7) +#'dim(mod1) <- c(dataset = 1, member = 3, sdate = 4, ftime = 5, lat = 6, lon = 7) +#'obs1 <- 1 : (1 * 1 * 4 * 5 * 6 * 7) +#'dim(obs1) <- c(dataset = 1, member = 1, sdate = 4, ftime = 5, lat = 6, lon = 7) +#'a <- Calibration(exp = mod1, obs = obs1) +#'str(a) +#'@export +Calibration <- function(exp, obs, exp_cor=NULL, cal.method = "mse_min", + eval.method = "leave-one-out", + multi.model = FALSE, na.fill = TRUE, + na.rm = TRUE, apply_to = NULL, alpha = NULL, + memb_dim = 'member', sdate_dim = 'sdate', ncores = 1) { + + dim.exp <- dim(exp) + amt.dims.exp <- length(dim.exp) + dim.obs <- dim(obs) + amt.dims.obs <- length(dim.obs) + dim.names.exp <- names(dim.exp) + dim.names.obs <- names(dim.obs) + if(!is.null(exp_cor)){ + dim.exp_cor <- dim(exp_cor) + amt.dims.exp_cor <- length(dim.exp_cor) + dim.names.exp_cor <- names(dim.exp_cor) + } + if (is.null(memb_dim) || !is.character(memb_dim)) { + stop("Parameter 'memb_dim' should be a character string indicating the", + "name of the dimension where members are stored in 'exp'.") + } + if (length(memb_dim) > 1) { + memb_dim <- memb_dim[1] + warning("Parameter 'memb_dim' has length greater than 1 and only", + " the first element will be used.") + } + + if (is.null(sdate_dim) || !is.character(sdate_dim)) { + stop("Parameter 'sdate_dim' should be a character string indicating the", + "name of the dimension where start dates are stored in 'exp'.") + } + if (length(sdate_dim) > 1) { + sdate_dim <- sdate_dim[1] + warning("Parameter 'sdate_dim' has length greater than 1 and only", + " the first element will be used.") + } + target.dim.names.exp <- c(memb_dim, sdate_dim) + target.dim.names.obs <- sdate_dim + + if (!all(target.dim.names.exp %in% dim.names.exp)) { + stop("Parameter 'exp' must have the dimensions defined in memb_dim ", + "and sdate_dim.") + } + + if(!is.null(exp_cor)){ + if (!all(target.dim.names.exp %in% dim.names.exp_cor)) { + stop("Parameter 'exp_cor' must have the dimensions defined in memb_dim ", + "and sdate_dim.") + } + } + + if (!all(c(sdate_dim) %in% dim.names.obs)) { + stop("Parameter 'obs' must have the dimension defined in sdate_dim ", + "parameter.") + } + + if (any(is.na(exp))) { + warning("Parameter 'exp' contains NA values.") + } + + if(!is.null(exp_cor)){ + if (any(is.na(exp_cor))) { + warning("Parameter 'exp_cor' contains NA values.") + } + } + + if (any(is.na(obs))) { + warning("Parameter 'obs' contains NA values.") + } + + if (memb_dim %in% names(dim(obs))) { + obs <- Subset(obs, along = memb_dim, indices = 1, drop = "selected") + } + data.set.sufficiently.large.out <- + Apply(data = list(exp = exp, obs = obs), + target_dims = list(exp = target.dim.names.exp, obs = target.dim.names.obs), + ncores = ncores, + fun = .data.set.sufficiently.large)$output1 + + if(!all(data.set.sufficiently.large.out)){ + if(na.fill){ + warning("Some forecast data could not be corrected due to data lack", + " and is replaced with NA values") + } else { + warning("Some forecast data could not be corrected due to data lack", + " and is replaced with uncorrected values") + } + } + + if (!na.rm %in% c(TRUE,FALSE)) { + stop("Parameter 'na.rm' must be TRUE or FALSE.") + } + if (cal.method == 'rpc-based') { + if (is.null(apply_to)) { + apply_to <- 'sign' + warning("'apply_to' cannot be NULL for 'rpc-based' method so it has been set to 'sign', as in Eade et al. (2014).") + } else if (!apply_to %in% c('all','sign')) { + stop("'apply_to' must be either 'all' or 'sign' when 'rpc-based' method is used.") + } + if (apply_to == 'sign') { + if (is.null(alpha)) { + alpha <- 0.1 + warning("'alpha' cannot be NULL for 'rpc-based' method so it has been set to 0.1, as in Eade et al. (2014).") + } else if (!is.numeric(alpha) | alpha <= 0 | alpha >= 1) { + stop("'alpha' must be a number between 0 and 1.") + } + } + } + + if(is.null(exp_cor)){ + calibrated <- Apply(data = list(exp = exp, obs = obs), + cal.method = cal.method, + eval.method = eval.method, + multi.model = multi.model, + na.fill = na.fill, na.rm = na.rm, + apply_to = apply_to, alpha = alpha, + target_dims = list(exp = target.dim.names.exp, obs = target.dim.names.obs), + ncores = ncores, output_dims = target.dim.names.exp, + fun = .cal)$output1 + dexes <- match(names(dim(exp)), names(dim(calibrated))) + calibrated <- aperm(calibrated, dexes) + dimnames(calibrated) <- dimnames(exp)[dexes] + }else{ + calibrated <- Apply(data = list(exp = exp, obs = obs, exp_cor = exp_cor), + cal.method = cal.method, + eval.method = eval.method, + multi.model = multi.model, + na.fill = na.fill, na.rm = na.rm, + apply_to = apply_to, alpha = alpha, + target_dims = list(exp = target.dim.names.exp, obs = target.dim.names.obs, exp_cor = target.dim.names.exp), + ncores = ncores, output_dims = target.dim.names.exp, + fun = .cal)$output1 + dexes <- match(names(dim(exp_cor)), names(dim(calibrated))) + calibrated <- aperm(calibrated, dexes) + dimnames(calibrated) <- dimnames(exp_cor)[dexes] + } + + return(calibrated) +} + + +.data.set.sufficiently.large <- function(exp, obs){ + amt.min.samples <- 3 + amt.good.pts <- sum(!is.na(obs) & !apply(exp, c(2), function(x) all(is.na(x)))) + return(amt.good.pts > amt.min.samples) +} + +.make.eval.train.dexes <- function(eval.method, amt.points, amt.points_cor){ + if(eval.method == "leave-one-out"){ + dexes.lst <- lapply(seq(1, amt.points), function(x) return(list(eval.dexes = x, + train.dexes = seq(1, amt.points)[-x]))) + } else if (eval.method == "in-sample"){ + dexes.lst <- list(list(eval.dexes = seq(1, amt.points), + train.dexes = seq(1, amt.points))) + } else if (eval.method == "hindcast-vs-forecast"){ + dexes.lst <- list(list(eval.dexes = seq(1,amt.points_cor), + train.dexes = seq(1, amt.points))) + } else { + stop(paste0("unknown sampling method: ",eval.method)) + } + return(dexes.lst) +} + +.cal <- function(exp, obs, exp_cor = NULL, cal.method, eval.method, multi.model, na.fill, na.rm, apply_to, alpha) { + if(is.null(exp_cor)){ + exp_cor <- exp ## generate a copy of exp so that the same function can run + ## when exp_cor is provided and when it's not + } + obs <- as.vector(obs) + dims.fc <- dim(exp) + dims.fc_cor <- dim(exp_cor) ## new line + amt.mbr <- dims.fc[1] + amt.sdate <- dims.fc[2] + amt.sdate_cor <- dims.fc_cor[2] ## new line + var.cor.fc <- NA * exp_cor ## modified line (exp_cor instead of exp); + ## in case of exp_cor not provided, at this point exp_cor + ## is already the same as exp so no change + names(dim(var.cor.fc)) <- dims.fc + + if(!.data.set.sufficiently.large(exp = exp, obs = obs)){ + if(na.fill){ + return(var.cor.fc) + } else { + var.cor.fc[] <- exp[] + return(var.cor.fc) + } + } + eval.train.dexeses <- .make.eval.train.dexes(eval.method, amt.points = amt.sdate, + amt.points_cor = amt.sdate_cor) + amt.resamples <- length(eval.train.dexeses) + for (i.sample in seq(1, amt.resamples)) { + # defining training (tr) and evaluation (ev) subsets + eval.dexes <- eval.train.dexeses[[i.sample]]$eval.dexes + train.dexes <- eval.train.dexeses[[i.sample]]$train.dexes + + fc.ev <- exp_cor[ , eval.dexes, drop = FALSE] ## modified line (exp_cor instead of exp) + ## fc.ev is used to evaluate (not train; train should be done with exp (hindcast)) + fc.tr <- exp[ , train.dexes] + obs.tr <- obs[train.dexes , drop = FALSE] + + if(cal.method == "bias"){ + var.cor.fc[ , eval.dexes] <- fc.ev + mean(obs.tr, na.rm = na.rm) - mean(fc.tr, na.rm = na.rm) + } else if(cal.method == "evmos"){ # forecast correction implemented + #calculate ensemble and observational characteristics + quant.obs.fc.tr <- .calc.obs.fc.quant(obs = obs.tr, fc = fc.tr, na.rm = na.rm) + #calculate value for regression parameters + init.par <- c(.calc.evmos.par(quant.obs.fc.tr, na.rm = na.rm)) + #correct evaluation subset + var.cor.fc[ , eval.dexes] <- .correct.evmos.fc(fc.ev , init.par, na.rm = na.rm) + } else if (cal.method == "mse_min"){ + #calculate ensemble and observational characteristics + quant.obs.fc.tr <- .calc.obs.fc.quant(obs = obs.tr, fc = fc.tr, na.rm = na.rm) + #calculate value for regression parameters + init.par <- .calc.mse.min.par(quant.obs.fc.tr, multi.model, na.rm = na.rm) + #correct evaluation subset + var.cor.fc[ , eval.dexes] <- .correct.mse.min.fc(fc.ev , init.par, na.rm = na.rm) + } else if (cal.method == "crps_min"){ + #calculate ensemble and observational characteristics + quant.obs.fc.tr <- .calc.obs.fc.quant.ext(obs = obs.tr, fc = fc.tr, na.rm = na.rm) + #calculate initial value for regression parameters + init.par <- c(.calc.mse.min.par(quant.obs.fc.tr, na.rm = na.rm), 0.001) + init.par[3] <- sqrt(init.par[3]) + #calculate regression parameters on training dataset + optim.tmp <- optim(par = init.par, + fn = .calc.crps.opt, + gr = .calc.crps.grad.opt, + quant.obs.fc = quant.obs.fc.tr, + na.rm = na.rm, + method = "BFGS") + + mbm.par <- optim.tmp$par + #correct evaluation subset + var.cor.fc[ , eval.dexes] <- .correct.crps.min.fc(fc.ev , mbm.par, na.rm = na.rm) + } else if (cal.method == 'rpc-based') { + ens_mean.ev <- multiApply::Apply(data = fc.ev, target_dims = names(amt.mbr), fun = mean, na.rm = na.rm)$output1 ## Ensemble mean + ens_mean.tr <- multiApply::Apply(data = fc.tr, target_dims = names(amt.mbr), fun = mean, na.rm = na.rm)$output1 ## Ensemble mean + ens_spread.tr <- multiApply::Apply(data = list(fc.tr, ens_mean.tr), target_dims = names(amt.sdate), fun = "-")$output1 ## Ensemble spread + exp_mean.tr <- mean(fc.tr, na.rm = na.rm) ## Mean (climatology) + var_signal.tr <- var(ens_mean.tr, na.rm = na.rm) ## Ensemble mean variance + var_noise.tr <- var(as.vector(ens_spread.tr), na.rm = na.rm) ## Variance of ensemble members about ensemble mean (= spread) + var_obs.tr <- var(obs.tr, na.rm = na.rm) ## Variance in the observations + r.tr <- cor(x = ens_mean.tr, y = obs.tr, method = 'pearson', use = ifelse(test = isTRUE(na.rm), yes = "pairwise.complete.obs", no = "everything")) ## Correlation between observations and the ensemble mean + if ((apply_to == 'all') || (apply_to == 'sign' && cor.test(ens_mean.tr, obs.tr, method = 'pearson', alternative = 'greater')$p.value < alpha)) { + ens_mean_cal <- (ens_mean.ev - exp_mean.tr) * r.tr * sqrt(var_obs.tr) / sqrt(var_signal.tr) + exp_mean.tr + var.cor.fc[ , eval.dexes] <- s2dv::Reorder(data = multiApply::Apply(data = list(exp = fc.ev, ens_mean = ens_mean.ev, ens_mean_cal = ens_mean_cal), target_dims = names(amt.sdate), fun = .CalibrationMembersRPC, var_obs = var_obs.tr, var_noise = var_noise.tr, r = r.tr)$output1, + order = names(dims.fc)) + dim(var.cor.fc) <- dims.fc + } else { ## no significant -> replacing with observed climatology + var.cor.fc[ , eval.dexes] <- array(data = mean(obs.tr, na.rm = na.rm), dim = dim(fc.ev)) + } + } else { + stop("unknown calibration method: ",cal.method) + } + } + return(var.cor.fc) +} + +.calc.obs.fc.quant <- function(obs, fc, na.rm){ #function to calculate different quantities of a series of ensemble forecasts and corresponding observations + amt.mbr <- dim(fc)[1] + obs.per.ens <- InsertDim(obs, posdim = 1, lendim = amt.mbr) + fc.ens.av <- apply(fc, c(2), mean, na.rm = na.rm) + cor.obs.fc <- cor(fc.ens.av, obs, use = "complete.obs") + obs.av <- mean(obs, na.rm = na.rm) + obs.sd <- sd(obs, na.rm = na.rm) + return( + append( + .calc.fc.quant(fc = fc, na.rm = na.rm), + list( + obs.per.ens = obs.per.ens, + cor.obs.fc = cor.obs.fc, + obs.av = obs.av, + obs.sd = obs.sd + ) + ) + ) +} + +.calc.obs.fc.quant.ext <- function(obs, fc, na.rm){ #extended function to calculate different quantities of a series of ensemble forecasts and corresponding observations + amt.mbr <- dim(fc)[1] + obs.per.ens <- InsertDim(obs, posdim = 1, lendim = amt.mbr) + fc.ens.av <- apply(fc, c(2), mean, na.rm = na.rm) + cor.obs.fc <- cor(fc.ens.av, obs, use = "complete.obs") + obs.av <- mean(obs, na.rm = na.rm) + obs.sd <- sd(obs, na.rm = na.rm) + + return( + append( + .calc.fc.quant.ext(fc = fc, na.rm = na.rm), + list( + obs.per.ens = obs.per.ens, + cor.obs.fc = cor.obs.fc, + obs.av = obs.av, + obs.sd = obs.sd + ) + ) + ) +} + + +.calc.fc.quant <- function(fc, na.rm){ #function to calculate different quantities of a series of ensemble forecasts + amt.mbr <- dim(fc)[1] + fc.ens.av <- apply(fc, c(2), mean, na.rm = na.rm) + fc.ens.av.av <- mean(fc.ens.av, na.rm = na.rm) + fc.ens.av.sd <- sd(fc.ens.av, na.rm = na.rm) + fc.ens.av.per.ens <- InsertDim(fc.ens.av, posdim = 1, lendim = amt.mbr) + fc.ens.sd <- apply(fc, c(2), sd, na.rm = na.rm) + fc.ens.var.av.sqrt <- sqrt(mean(fc.ens.sd^2, na.rm = na.rm)) + fc.dev <- fc - fc.ens.av.per.ens + fc.dev.sd <- sd(fc.dev, na.rm = na.rm) + fc.av <- mean(fc, na.rm = na.rm) + fc.sd <- sd(fc, na.rm = na.rm) + return( + list( + fc.ens.av = fc.ens.av, + fc.ens.av.av = fc.ens.av.av, + fc.ens.av.sd = fc.ens.av.sd, + fc.ens.av.per.ens = fc.ens.av.per.ens, + fc.ens.sd = fc.ens.sd, + fc.ens.var.av.sqrt = fc.ens.var.av.sqrt, + fc.dev = fc.dev, + fc.dev.sd = fc.dev.sd, + fc.av = fc.av, + fc.sd = fc.sd + ) + ) +} + +.calc.fc.quant.ext <- function(fc, na.rm){ #extended function to calculate different quantities of a series of ensemble forecasts + + amt.mbr <- dim(fc)[1] + repmat1.tmp <- InsertDim(fc, posdim = 1, lendim = amt.mbr) + repmat2.tmp <- aperm(repmat1.tmp, c(2, 1, 3)) + spr.abs <- apply(abs(repmat1.tmp - repmat2.tmp), c(3), mean, na.rm = na.rm) + spr.abs.per.ens <- InsertDim(spr.abs, posdim = 1, lendim = amt.mbr) + + return( + append(.calc.fc.quant(fc, na.rm = na.rm), + list(spr.abs = spr.abs, spr.abs.per.ens = spr.abs.per.ens)) + ) +} + +#Below are the core or elementary functions to calculate the regression parameters for the different methods +.calc.mse.min.par <- function(quant.obs.fc, multi.model = F, na.rm){ + par.out <- rep(NA, 3) + + if(multi.model){ + par.out[3] <- with(quant.obs.fc, obs.sd * sqrt(1. - cor.obs.fc^2) / fc.ens.var.av.sqrt) + } else { + par.out[3] <- with(quant.obs.fc, obs.sd * sqrt(1. - cor.obs.fc^2) / fc.dev.sd) + } + par.out[2] <- with(quant.obs.fc, abs(cor.obs.fc) * obs.sd / fc.ens.av.sd) + par.out[1] <- with(quant.obs.fc, obs.av - par.out[2] * fc.ens.av.av, na.rm = na.rm) + + return(par.out) +} +.calc.evmos.par <- function(quant.obs.fc, na.rm){ + par.out <- rep(NA, 2) + par.out[2] <- with(quant.obs.fc, obs.sd / fc.sd) + par.out[1] <- with(quant.obs.fc, obs.av - par.out[2] * fc.ens.av.av, na.rm = na.rm) + return(par.out) +} +#Below are the core or elementary functions to calculate the functions necessary for the minimization of crps +.calc.crps.opt <- function(par, quant.obs.fc, na.rm){ + return( + with(quant.obs.fc, + mean(abs(obs.per.ens - (par[1] + par[2] * fc.ens.av.per.ens + + ((par[3])^2 + par[4] / spr.abs.per.ens) * fc.dev)), na.rm = na.rm) - + mean(abs((par[3])^2 * spr.abs + par[4]) / 2., na.rm = na.rm) + ) + ) +} + +.calc.crps.grad.opt <- function(par, quant.obs.fc, na.rm){ + sgn1 <- with(quant.obs.fc,sign(obs.per.ens - + (par[1] + par[2] * fc.ens.av.per.ens + + ((par[3])^2 + par[4] / spr.abs.per.ens) * fc.dev))) + sgn2 <- with(quant.obs.fc, sign((par[3])^2 + par[4] / spr.abs.per.ens)) + sgn3 <- with(quant.obs.fc,sign((par[3])^2 * spr.abs + par[4])) + deriv.par1 <- mean(sgn1, na.rm = na.rm) + deriv.par2 <- with(quant.obs.fc, mean(sgn1 * fc.dev, na.rm = na.rm)) + deriv.par3 <- with(quant.obs.fc, + mean(2* par[3] * sgn1 * sgn2 * fc.ens.av.per.ens, na.rm = na.rm) - + mean(spr.abs * sgn3, na.rm = na.rm) / 2.) + deriv.par4 <- with(quant.obs.fc, + mean(sgn1 * sgn2 * fc.ens.av.per.ens / spr.abs.per.ens, na.rm = na.rm) - + mean(sgn3, na.rm = na.rm) / 2.) + return(c(deriv.par1, deriv.par2, deriv.par3, deriv.par4)) +} + +#Below are the core or elementary functions to correct the evaluation set based on the regression parameters +.correct.evmos.fc <- function(fc, par, na.rm){ + quant.fc.mp <- .calc.fc.quant(fc = fc, na.rm = na.rm) + return(with(quant.fc.mp, par[1] + par[2] * fc)) +} +.correct.mse.min.fc <- function(fc, par, na.rm){ + quant.fc.mp <- .calc.fc.quant(fc = fc, na.rm = na.rm) + return(with(quant.fc.mp, par[1] + par[2] * fc.ens.av.per.ens + fc.dev * par[3])) +} +.correct.crps.min.fc <- function(fc, par, na.rm){ + quant.fc.mp <- .calc.fc.quant.ext(fc = fc, na.rm = na.rm) + return(with(quant.fc.mp, par[1] + par[2] * fc.ens.av.per.ens + fc.dev * abs((par[3])^2 + par[4] / spr.abs))) +} + +# Function to calibrate the individual members with the RPC-based method +.CalibrationMembersRPC <- function(exp, ens_mean, ens_mean_cal, var_obs, var_noise, r){ + member_cal <- (exp - ens_mean) * sqrt(var_obs) * sqrt(1 - r^2) / sqrt(var_noise) + ens_mean_cal + return(member_cal) +} diff --git a/tools/tmp/RPS.R b/tools/tmp/RPS.R new file mode 100644 index 0000000000000000000000000000000000000000..84956ac85a2ee39250e793ec80dab9f26df75eaa --- /dev/null +++ b/tools/tmp/RPS.R @@ -0,0 +1,193 @@ +#'Compute the Ranked Probability Score +#' +#'The Ranked Probability Score (RPS; Wilks, 2011) is defined as the sum of the +#'squared differences between the cumulative forecast probabilities (computed +#'from the ensemble members) and the observations (defined as 0% if the category +#'did not happen and 100% if it happened). It can be used to evaluate the skill +#'of multi-categorical probabilistic forecasts. The RPS ranges between 0 +#'(perfect forecast) and n-1 (worst possible forecast), where n is the number of +#'categories. In the case of a forecast divided into two categories (the lowest +#'number of categories that a probabilistic forecast can have), the RPS +#'corresponds to the Brier Score (BS; Wilks, 2011), therefore, ranges between 0 +#'and 1. +#' +#'@param exp A named numerical array of the forecast with at least time +#' dimension. +#'@param obs A named numerical array of the observation with at least time +#' dimension. The dimensions must be the same as 'exp' except 'memb_dim'. +#'@param time_dim A character string indicating the name of the time dimension. +#' The default value is 'sdate'. +#'@param memb_dim A character string indicating the name of the member dimension +#' to compute the probabilities of the forecast. The default value is 'member'. +#'@param prob_thresholds A numeric vector of the relative thresholds (from 0 to +#' 1) between the categories. The default value is c(1/3, 2/3), which +#' corresponds to tercile equiprobable categories. +#'@param indices_for_clim A vector of the indices to be taken along 'time_dim' +#' for computing the thresholds between the probabilistic categories. If NULL, +#' the whole period is used. The default value is NULL. +#'@param Fair A logical indicating whether to compute the FairRPSS (the +#' potential RPSS that the forecast would have with an infinite ensemble size). +#' The default value is FALSE. +#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. +#' +#'@return +#'A numerical array of RPS with the same dimensions as "exp" except the +#''time_dim' and 'memb_dim' dimensions. +#' +#'@references +#'Wilks, 2011; https://doi.org/10.1016/B978-0-12-385022-5.00008-7 +#' +#'@examples +#'exp <- array(rnorm(1000), dim = c(lat = 3, lon = 2, member = 10, sdate = 50)) +#'obs <- array(rnorm(1000), dim = c(lat = 3, lon = 2, sdate = 50)) +#'res <- RPS(exp = exp, obs = obs) +#' +#'@import multiApply +#'@importFrom easyVerification convert2prob +#'@export +RPS <- function(exp, obs, time_dim = 'sdate', memb_dim = 'member', + prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, Fair = FALSE, + ncores = NULL) { + + # Check inputs + ## exp and obs (1) + if (!is.array(exp) | !is.numeric(exp)) + stop('Parameter "exp" must be a numeric array.') + if (!is.array(obs) | !is.numeric(obs)) + stop('Parameter "obs" must be a numeric array.') + if(any(is.null(names(dim(exp))))| any(nchar(names(dim(exp))) == 0) | + any(is.null(names(dim(obs))))| any(nchar(names(dim(obs))) == 0)) { + stop("Parameter 'exp' and 'obs' must have dimension names.") + } + ## time_dim + if (!is.character(time_dim) | length(time_dim) != 1) + stop('Parameter "time_dim" must be a character string.') + if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) { + stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.") + } + ## memb_dim + if (!is.character(memb_dim) | length(memb_dim) > 1) { + stop("Parameter 'memb_dim' must be a character string.") + } + if (!memb_dim %in% names(dim(exp))) { + stop("Parameter 'memb_dim' is not found in 'exp' dimension.") + } + ## exp and obs (2) + name_exp <- sort(names(dim(exp))) + name_obs <- sort(names(dim(obs))) + name_exp <- name_exp[-which(name_exp == memb_dim)] + if (memb_dim %in% name_obs) { + name_obs <- name_obs[-which(name_obs == memb_dim)] + } + if (!identical(length(name_exp), length(name_obs)) | + !identical(dim(exp)[name_exp], dim(obs)[name_obs])) { + stop(paste0("Parameter 'exp' and 'obs' must have same length of ", + "all dimensions expect 'memb_dim'.")) + } + ## prob_thresholds + if (!is.numeric(prob_thresholds) | !is.vector(prob_thresholds) | + any(prob_thresholds <= 0) | any(prob_thresholds >= 1)) { + stop("Parameter 'prob_thresholds' must be a numeric vector between 0 and 1.") + } + ## indices_for_clim + if (is.null(indices_for_clim)) { + indices_for_clim <- 1:dim(obs)[time_dim] + } else { + if (!is.numeric(indices_for_clim) | !is.vector(indices_for_clim)) { + stop("Parameter 'indices_for_clim' must be NULL or a numeric vector.") + } else if (length(indices_for_clim) > dim(obs)[time_dim] | + max(indices_for_clim) > dim(obs)[time_dim] | + any(indices_for_clim) < 1) { + stop("Parameter 'indices_for_clim' should be the indices of 'time_dim'.") + } + } + ## Fair + if (!is.logical(Fair) | length(Fair) > 1) { + stop("Parameter 'Fair' must be either TRUE or FALSE.") + } + ## ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | + length(ncores) > 1) { + stop("Parameter 'ncores' must be either NULL or a positive integer.") + } + } + + ############################### + + # Compute RPS + if (!memb_dim %in% names(dim(obs))) { + target_dims_obs <- time_dim + } else { + target_dims_obs <- c(time_dim, memb_dim) + } + + rps <- Apply(data = list(exp = exp, obs = obs), + target_dims = list(exp = c(time_dim, memb_dim), + obs = target_dims_obs), + output_dims = time_dim, + fun = .RPS, + prob_thresholds = prob_thresholds, + indices_for_clim = indices_for_clim, Fair = Fair, + ncores = ncores)$output1 + + # Return only the mean RPS + rps <- MeanDims(rps, time_dim, na.rm = FALSE) + + return(rps) +} + + +.RPS <- function(exp, obs, prob_thresholds = c(1/3, 2/3), + indices_for_clim = NULL, Fair = FALSE) { + # exp: [sdate, memb] + # obs: [sdate, (memb)] + + exp_probs <- .get_probs(data = exp, indices_for_quantiles = indices_for_clim, + prob_thresholds = prob_thresholds) + # exp_probs: [bin, sdate] + obs_probs <- .get_probs(data = obs, indices_for_quantiles = indices_for_clim, + prob_thresholds = prob_thresholds) + # obs_probs: [bin, sdate] + + probs_exp_cumsum <- apply(exp_probs, 2, cumsum) + probs_obs_cumsum <- apply(obs_probs, 2, cumsum) + rps <- apply((probs_exp_cumsum - probs_obs_cumsum)^2, 2, sum) + # rps: [sdate] + + if (Fair) { # FairRPS + ## adjustment <- rowSums(-1 * (1/R - 1/R.new) * ens.cum * (R - ens.cum)/R/(R - 1)) [formula taken from SpecsVerification::EnsRps] + R <- dim(exp)[2] #memb + R_new <- Inf + adjustment <- (-1) / (R - 1) * probs_exp_cumsum * (1 - probs_exp_cumsum) + adjustment <- apply(adjustment, 2, sum) + rps <- rps + adjustment + } + + return(rps) +} + +.get_probs <- function(data, indices_for_quantiles, prob_thresholds) { + # if exp: [sdate, memb] + # if obs: [sdate, (memb)] + + # Add dim [memb = 1] to obs if it doesn't have memb_dim + if (length(dim(data)) == 1) dim(data) <- c(dim(data), 1) + + # Absolute thresholds + quantiles <- quantile(x = as.vector(data[indices_for_quantiles, ]), probs = prob_thresholds, type = 8, na.rm = TRUE) + + # Probabilities + probs <- array(dim = c(bin = length(quantiles) + 1, dim(data)[1])) # [bin, sdate] + for (i_time in 1:dim(data)[1]) { + if (anyNA(data[i_time, ])) { + probs[, i_time] <- rep(NA, dim = length(quantiles) + 1) + } else { + probs[, i_time] <- colMeans(easyVerification::convert2prob(data[i_time, ], threshold = quantiles)) + } + } + + return(probs) +} + diff --git a/tools/tmp/RPSS.R b/tools/tmp/RPSS.R new file mode 100644 index 0000000000000000000000000000000000000000..04c31378f6e1d733d83a5fb1ea1e5d297612973b --- /dev/null +++ b/tools/tmp/RPSS.R @@ -0,0 +1,223 @@ +#'Compute the Ranked Probability Skill Score +#' +#'The Ranked Probability Skill Score (RPSS; Wilks, 2011) is the skill score +#'based on the Ranked Probability Score (RPS; Wilks, 2011). It can be used to +#'assess whether a forecast presents an improvement or worsening with respect to +#'a reference forecast. The RPSS ranges between minus infinite and 1. If the +#'RPSS is positive, it indicates that the forecast has higher skill than the +#'reference forecast, while a negative value means that it has a lower skill. +#'Examples of reference forecasts are the climatological forecast (same +#'probabilities for all categories for all time steps), persistence, a previous +#'model version, and another model. It is computed as RPSS = 1 - RPS_exp / RPS_ref. +#'The statistical significance is obtained based on a Random Walk test at the +#'95% confidence level (DelSole and Tippett, 2016). +#' +#'@param exp A named numerical array of the forecast with at least time +#' dimension. +#'@param obs A named numerical array of the observation with at least time +#' dimension. The dimensions must be the same as 'exp' except 'memb_dim'. +#'@param ref A named numerical array of the reference forecast data with at +#' least time dimension. The dimensions must be the same as 'exp' except +#' 'memb_dim'. If it is NULL, the climatological forecast is used as reference +#' forecast. The default value is NULL. +#'@param time_dim A character string indicating the name of the time dimension. +#' The default value is 'sdate'. +#'@param memb_dim A character string indicating the name of the member dimension +#' to compute the probabilities of the forecast and the reference forecast. The +#' default value is 'member'. +#'@param prob_thresholds A numeric vector of the relative thresholds (from 0 to +#' 1) between the categories. The default value is c(1/3, 2/3), which +#' corresponds to tercile equiprobable categories. +#'@param indices_for_clim A vector of the indices to be taken along 'time_dim' +#' for computing the thresholds between the probabilistic categories. If NULL, +#' the whole period is used. The default value is NULL. +#'@param Fair A logical indicating whether to compute the FairRPSS (the +#' potential RPSS that the forecast would have with an infinite ensemble size). +#' The default value is FALSE. +#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. +#' +#'@return +#'\item{$rpss}{ +#' A numerical array of the RPSS with the same dimensions as "exp" except the +#' 'time_dim' and 'memb_dim' dimensions. +#'} +#'\item{$sign}{ +#' A logical array of the statistical significance of the RPSS with the same +#' dimensions as 'exp' except the 'time_dim' and 'memb_dim' dimensions. +#'} +#' +#'@references +#'Wilks, 2011; https://doi.org/10.1016/B978-0-12-385022-5.00008-7 +#'DelSole and Tippett, 2016; https://doi.org/10.1175/MWR-D-15-0218.1 +#' +#'@examples +#'exp <- array(rnorm(1000), dim = c(lat = 3, lon = 2, member = 10, sdate = 50)) +#'obs <- array(rnorm(1000), dim = c(lat = 3, lon = 2, sdate = 50)) +#'ref <- array(rnorm(1000), dim = c(lat = 3, lon = 2, member = 10, sdate = 50)) +#'res <- RPSS(exp = exp, obs = obs) ## climatology as reference forecast +#'res <- RPSS(exp = exp, obs = obs, ref = ref) ## ref as reference forecast +#' +#'@import multiApply +#'@export +RPSS <- function(exp, obs, ref = NULL, time_dim = 'sdate', memb_dim = 'member', + prob_thresholds = c(1/3, 2/3), indices_for_clim = NULL, Fair = FALSE, + ncores = NULL) { + + # Check inputs + ## exp, obs, and ref (1) + if (!is.array(exp) | !is.numeric(exp)) + stop('Parameter "exp" must be a numeric array.') + if (!is.array(obs) | !is.numeric(obs)) + stop('Parameter "obs" must be a numeric array.') + if (!is.null(ref)) { + if (!is.array(ref) | !is.numeric(ref)) + stop('Parameter "ref" must be a numeric array.') + } + ## time_dim + if (!is.character(time_dim) | length(time_dim) != 1) + stop('Parameter "time_dim" must be a character string.') + if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) { + stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.") + } + if (!is.null(ref) & !time_dim %in% names(dim(ref))) { + stop("Parameter 'time_dim' is not found in 'ref' dimension.") + } + ## memb_dim + if (!is.character(memb_dim) | length(memb_dim) > 1) { + stop("Parameter 'memb_dim' must be a character string.") + } + if (!memb_dim %in% names(dim(exp))) { + stop("Parameter 'memb_dim' is not found in 'exp' dimension.") + } + if (!is.null(ref) & !memb_dim %in% names(dim(ref))) { + stop("Parameter 'memb_dim' is not found in 'ref' dimension.") + } + ## exp and obs (2) + name_exp <- sort(names(dim(exp))) + name_obs <- sort(names(dim(obs))) + name_exp <- name_exp[-which(name_exp == memb_dim)] + if (memb_dim %in% name_obs) { + name_obs <- name_obs[-which(name_obs == memb_dim)] + } + if (!identical(length(name_exp), length(name_obs)) | + !identical(dim(exp)[name_exp], dim(obs)[name_obs])) { + stop(paste0("Parameter 'exp' and 'obs' must have same length of ", + "all dimensions expect 'memb_dim'.")) + } + if (!is.null(ref)) { + name_ref <- sort(names(dim(ref))) + name_ref <- name_ref[-which(name_ref == memb_dim)] + if (!identical(length(name_exp), length(name_ref)) | + !identical(dim(exp)[name_exp], dim(ref)[name_ref])) { + stop(paste0("Parameter 'exp' and 'obs' must have same length of ", + "all dimensions expect 'memb_dim'.")) + } + } + ## prob_thresholds + if (!is.numeric(prob_thresholds) | !is.vector(prob_thresholds) | + any(prob_thresholds <= 0) | any(prob_thresholds >= 1)) { + stop("Parameter 'prob_thresholds' must be a numeric vector between 0 and 1.") + } + ## indices_for_clim + if (is.null(indices_for_clim)) { + indices_for_clim <- 1:dim(obs)[time_dim] + } else { + if (!is.numeric(indices_for_clim) | !is.vector(indices_for_clim)) { + stop("Parameter 'indices_for_clim' must be NULL or a numeric vector.") + } else if (length(indices_for_clim) > dim(obs)[time_dim] | + max(indices_for_clim) > dim(obs)[time_dim] | + any(indices_for_clim) < 1) { + stop("Parameter 'indices_for_clim' should be the indices of 'time_dim'.") + } + } + ## Fair + if (!is.logical(Fair) | length(Fair) > 1) { + stop("Parameter 'Fair' must be either TRUE or FALSE.") + } + ## ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | + length(ncores) > 1) { + stop("Parameter 'ncores' must be either NULL or a positive integer.") + } + } + + ############################### + + # Compute RPSS + if (!memb_dim %in% names(dim(obs))) { + target_dims_obs <- time_dim + } else { + target_dims_obs <- c(time_dim, memb_dim) + } + + if (!is.null(ref)) { # use "ref" as reference forecast + data <- list(exp = exp, obs = obs, ref = ref) + target_dims = list(exp = c(time_dim, memb_dim), + obs = target_dims_obs, + ref = c(time_dim, memb_dim)) + } else { + data <- list(exp = exp, obs = obs) + target_dims = list(exp = c(time_dim, memb_dim), + obs = target_dims_obs) + } + output <- Apply(data, + target_dims = target_dims, + fun = .RPSS, + prob_thresholds = prob_thresholds, + indices_for_clim = indices_for_clim, Fair = Fair, + ncores = ncores) + + return(output) +} + +.RPSS <- function(exp, obs, ref = NULL, prob_thresholds = c(1/3, 2/3), + indices_for_clim = NULL, Fair = FALSE) { + # exp: [sdate, memb] + # obs: [sdate, (memb)] + # ref: [sdate, memb] or NULL + + # RPS of the forecast + rps_exp <- .RPS(exp = exp, obs = obs, prob_thresholds = prob_thresholds, + indices_for_clim = indices_for_clim, Fair = Fair) + + # RPS of the reference forecast + if (is.null(ref)) { ## using climatology as reference forecast + obs_probs <- .get_probs(data = obs, indices_for_quantiles = indices_for_clim, + prob_thresholds = prob_thresholds) + # obs_probs: [bin, sdate] + + clim_probs <- c(prob_thresholds[1], diff(prob_thresholds), 1 - prob_thresholds[length(prob_thresholds)]) + clim_probs <- array(clim_probs, dim = dim(obs_probs)) + # clim_probs: [bin, sdate] + + # Calculate RPS for each time step + probs_clim_cumsum <- apply(clim_probs, 2, cumsum) + probs_obs_cumsum <- apply(obs_probs, 2, cumsum) + rps_ref <- apply((probs_clim_cumsum - probs_obs_cumsum)^2, 2, sum) + # rps_ref: [sdate] + +# if (Fair) { # FairRPS +# ## adjustment <- rowSums(-1 * (1/R - 1/R.new) * ens.cum * (R - ens.cum)/R/(R - 1)) [formula taken from SpecsVerification::EnsRps] +# R <- dim(exp)[2] #memb +# R_new <- Inf +# adjustment <- (-1) / (R - 1) * probs_clim_cumsum * (1 - probs_clim_cumsum) +# adjustment <- apply(adjustment, 2, sum) +# rps_ref <- rps_ref + adjustment +# } + + } else { # use "ref" as reference forecast + rps_ref <- .RPS(exp = ref, obs = obs, prob_thresholds = prob_thresholds, + indices_for_clim = indices_for_clim, Fair = Fair) + } + + # RPSS + rpss <- 1 - mean(rps_exp) / mean(rps_ref) + + # Significance + sign <- .RandomWalkTest(skill_A = rps_exp, skill_B = rps_ref)$signif + + return(list(rpss = rpss, sign = sign)) +} + diff --git a/tools/tmp/RandomWalkTest.R b/tools/tmp/RandomWalkTest.R new file mode 100644 index 0000000000000000000000000000000000000000..adeadc1ec94b62920c885640938f966c91e75ddc --- /dev/null +++ b/tools/tmp/RandomWalkTest.R @@ -0,0 +1,82 @@ +#'Random walk test for skill differences +#' +#'Forecast comparison of the skill obtained with 2 forecasts (with respect to a +#'common reference) based on Random Walks, with significance estimate at the 95% +#'confidence level, as in DelSole and Tippett (2016). +#' +#'@param skill_A A numerical array of the time series of the skill with the +#' forecaster A's. +#'@param skill_B A numerical array of the time series of the skill with the +#' forecaster B's. The dimensions should be identical as parameter 'skill_A'. +#'@param time_dim A character string indicating the name of the dimension along +#' which the tests are computed. The default value is 'sdate'. +#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. +#' +#'@return A list of 2: +#'\item{$score}{ +#' A numerical array with the same dimensions as the input arrays except +#' 'time_dim'. The number of times that forecaster A has been better than +#' forecaster B minus the number of times that forecaster B has been better +#' than forecaster A (for skill positively oriented). If $score is positive +#' forecaster A is better than forecaster B, and if $score is negative +#' forecaster B is better than forecaster B. +#'} +#'\item{$signif}{ +#' A logical array with the same dimensions as the input arrays except +#' 'time_dim'. Whether the difference is significant or not at the 5% +#' significance level. +#'} +#' +#'@examples +#' fcst_A <- array(c(11:50), dim = c(sdate = 10, lat = 2, lon = 2)) +#' fcst_B <- array(c(21:60), dim = c(sdate = 10, lat = 2, lon = 2)) +#' reference <- array(1:40, dim = c(sdate = 10, lat = 2, lon = 2)) +#' skill_A <- abs(fcst_A - reference) +#' skill_B <- abs(fcst_B - reference) +#' RandomWalkTest(skill_A = skill_A, skill_B = skill_B, time_dim = 'sdate', ncores = 1) +#' +#'@import multiApply +#'@export +RandomWalkTest <- function(skill_A, skill_B, time_dim = 'sdate', ncores = NULL){ + + ## Check inputs + if (is.null(skill_A) | is.null(skill_B)){ + stop("Parameters 'skill_A' and 'skill_B' cannot be NULL.") + } + if(!is.numeric(skill_A) | !is.numeric(skill_B)){ + stop("Parameters 'skill_A' and 'skill_B' must be a numerical array.") + } + if (!identical(dim(skill_A),dim(skill_B))){ + stop("Parameters 'skill_A' and 'skill_B' must have the same dimensions.") + } + if(!is.character(time_dim)){ + stop("Parameter 'time_dim' must be a character string.") + } + if(!time_dim %in% names(dim(skill_A)) | !time_dim %in% names(dim(skill_B))){ + stop("Parameter 'time_dim' is not found in 'skill_A' or 'skill_B' dimensions.") + } + if (!is.null(ncores)){ + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | length(ncores) > 1){ + stop("Parameter 'ncores' must be a positive integer.") + } + } + + ## Compute the Random Walk Test + res <- multiApply::Apply(data = list(skill_A, skill_B), + target_dims = time_dim, + fun = .RandomWalkTest, + ncores = ncores) + return(res) +} + +.RandomWalkTest <- function(skill_A, skill_B){ + + score <- cumsum(skill_A > skill_B) - cumsum(skill_A < skill_B) + + ## TRUE if significant (if last value is above or below 2*sqrt(N)) + signif<- ifelse(test = (score[length(skill_A)] < (-2)*sqrt(length(skill_A))) | (score[length(skill_A)] > 2*sqrt(length(skill_A))), + yes = TRUE, no = FALSE) + + return(list("score"=score[length(skill_A)],"signif"=signif)) +}