#'Compute the correlation coefficient between an array of forecast and their corresponding observation #' #'Calculate the correlation coefficient (Pearson, Kendall or Spearman) for #'an array of forecast and an array of observation. The correlations are #'computed along time_dim, the startdate dimension. If comp_dim is given, #'the correlations are computed only if obs along the comp_dim dimension are #'complete between limits[1] and limits[2], i.e., there is no NA between #'limits[1] and limits[2]. This option can be activated if the user wants to #'account only for the forecasts which the corresponding observations are #'available at all leadtimes.\cr #'The confidence interval is computed by the Fisher transformation and the #'significance level relies on an one-sided student-T distribution.\cr #'If the dataset has more than one member, ensemble mean is necessary necessary #'before using this function since it only allows one dimension 'dat_dim' to #'have inconsistent length between 'exp' and 'obs'. If all the dimensions of #''exp' and 'obs' are identical, you can simply use apply() and cor() to #'compute the correlation. #' #'@param exp A named numeric array of experimental data, with at least two #' dimensions 'time_dim' and 'dat_dim'. #'@param obs A named numeric array of observational data, same dimensions as #' parameter 'exp' except along 'dat_dim' and 'memb_dim'. #'@param time_dim A character string indicating the name of dimension along #' which the correlations are computed. The default value is 'sdate'. #'@param dat_dim A character string indicating the name of dataset (nobs/nexp) #' dimension. The default value is 'dataset'. #'@param comp_dim A character string indicating the name of dimension along which #' obs is taken into account only if it is complete. The default value #' is NULL. #'@param limits A vector of two integers indicating the range along comp_dim to #' be completed. The default is c(1, length(comp_dim dimension)). #'@param method A character string indicating the type of correlation: #' 'pearson', 'spearman', or 'kendall'. The default value is 'pearson'. #'@param memb_dim A character string indicating the name of the member #' dimension. It must be one dimension in 'exp' and 'obs'. If there is no #' member dimension, set NULL. The default value is NULL. #'@param memb A logical value indicating whether to remain 'memb_dim' dimension #' (TRUE) or do ensemble mean over 'memb_dim' (FALSE). Only functional when #' 'memb_dim' is not NULL. The default value is TRUE. #'@param pval A logical value indicating whether to compute or not the p-value #' of the test Ho: Corr = 0. The default value is TRUE. #'@param conf A logical value indicating whether to retrieve the confidence #' intervals or not. The default value is TRUE. #'@param conf.lev A numeric indicating the confidence level for the #' regression computation. The default value is 0.95. #'@param ncores An integer indicating the number of cores to use for parallel #' computation. The default value is NULL. #' #'@return #'A list containing the numeric arrays with dimension:\cr #' c(nexp, nobs, exp_memb, obs_memb, all other dimensions of exp except #' time_dim and memb_dim).\cr #'nexp is the number of experiment (i.e., 'dat_dim' in exp), and nobs is the #'number of observation (i.e., 'dat_dim' in obs). exp_memb is the number of #'member in experiment (i.e., 'memb_dim' in exp) and obs_memb is the number of #'member in observation (i.e., 'memb_dim' in obs).\cr\cr #'\item{$corr}{ #' The correlation coefficient. #'} #'\item{$p.val}{ #' The p-value. Only present if \code{pval = TRUE}. #'} #'\item{$conf.lower}{ #' The lower confidence interval. Only present if \code{conf = TRUE}. #'} #'\item{$conf.upper}{ #' The upper confidence interval. Only present if \code{conf = TRUE}. #'} #' #'@examples #'# Case 1: Load sample data as in Load() example: #'example(Load) #'clim <- Clim(sampleData$mod, sampleData$obs) #'ano_exp <- Ano(sampleData$mod, clim$clim_exp) #'ano_obs <- Ano(sampleData$obs, clim$clim_obs) #'runmean_months <- 12 #' #'# Smooth along lead-times #'smooth_ano_exp <- Smoothing(ano_exp, runmeanlen = runmean_months) #'smooth_ano_obs <- Smoothing(ano_obs, runmeanlen = runmean_months) #'required_complete_row <- 3 # Discard start dates which contain any NA lead-times #'leadtimes_per_startdate <- 60 #'corr <- Corr(MeanDims(smooth_ano_exp, 'member'), #' MeanDims(smooth_ano_obs, 'member'), #' comp_dim = 'ftime', #' limits = c(ceiling((runmean_months + 1) / 2), #' leadtimes_per_startdate - floor(runmean_months / 2))) #' #'# Case 2: Keep member dimension #'corr <- Corr(smooth_ano_exp, smooth_ano_obs, memb_dim = 'member') #'# ensemble mean #'corr <- Corr(smooth_ano_exp, smooth_ano_obs, memb_dim = 'member', memb = FALSE) #' #'@import multiApply #'@importFrom ClimProjDiags Subset #'@importFrom stats cor pt qnorm #'@export Corr <- function(exp, obs, time_dim = 'sdate', dat_dim = 'dataset', comp_dim = NULL, limits = NULL, method = 'pearson', memb_dim = NULL, memb = TRUE, pval = TRUE, conf = TRUE, conf.lev = 0.95, ncores = NULL) { # Check inputs ## exp and obs (1) if (is.null(exp) | is.null(obs)) { stop("Parameter 'exp' and 'obs' cannot be NULL.") } if (!is.numeric(exp) | !is.numeric(obs)) { stop("Parameter 'exp' and 'obs' must be a numeric array.") } if (is.null(dim(exp)) | is.null(dim(obs))) { stop(paste0("Parameter 'exp' and 'obs' must be at least two dimensions ", "containing time_dim and dat_dim.")) } if(any(is.null(names(dim(exp))))| any(nchar(names(dim(exp))) == 0) | any(is.null(names(dim(obs))))| any(nchar(names(dim(obs))) == 0)) { stop("Parameter 'exp' and 'obs' must have dimension names.") } if(!all(names(dim(exp)) %in% names(dim(obs))) | !all(names(dim(obs)) %in% names(dim(exp)))) { stop("Parameter 'exp' and 'obs' must have same dimension name") } ## time_dim if (!is.character(time_dim) | length(time_dim) > 1) { stop("Parameter 'time_dim' must be a character string.") } if (!time_dim %in% names(dim(exp)) | !time_dim %in% names(dim(obs))) { stop("Parameter 'time_dim' is not found in 'exp' or 'obs' dimension.") } ## dat_dim if (!is.character(dat_dim) | length(dat_dim) > 1) { stop("Parameter 'dat_dim' must be a character string.") } if (!dat_dim %in% names(dim(exp)) | !dat_dim %in% names(dim(obs))) { stop("Parameter 'dat_dim' is not found in 'exp' or 'obs' dimension.") } ## comp_dim if (!is.null(comp_dim)) { if (!is.character(comp_dim) | length(comp_dim) > 1) { stop("Parameter 'comp_dim' must be a character string.") } if (!comp_dim %in% names(dim(exp)) | !comp_dim %in% names(dim(obs))) { stop("Parameter 'comp_dim' is not found in 'exp' or 'obs' dimension.") } } ## limits if (!is.null(limits)) { if (is.null(comp_dim)) { stop("Paramter 'comp_dim' cannot be NULL if 'limits' is assigned.") } if (!is.numeric(limits) | any(limits %% 1 != 0) | any(limits < 0) | length(limits) != 2 | any(limits > dim(exp)[comp_dim])) { stop(paste0("Parameter 'limits' must be a vector of two positive ", "integers smaller than the length of paramter 'comp_dim'.")) } } ## method if (!(method %in% c("kendall", "spearman", "pearson"))) { stop("Parameter 'method' must be one of 'kendall', 'spearman' or 'pearson'.") } ## memb_dim if (!is.null(memb_dim)) { if (!is.character(memb_dim) | length(memb_dim) > 1) { stop("Parameter 'memb_dim' must be a character string.") } if (!memb_dim %in% names(dim(exp)) | !memb_dim %in% names(dim(obs))) { stop("Parameter 'memb_dim' is not found in 'exp' or 'obs' dimension.") } } ## memb if (!is.logical(memb) | length(memb) > 1) { stop("Parameter 'memb' must be one logical value.") } ## pval if (!is.logical(pval) | length(pval) > 1) { stop("Parameter 'pval' must be one logical value.") } ## conf if (!is.logical(conf) | length(conf) > 1) { stop("Parameter 'conf' must be one logical value.") } ## conf.lev if (!is.numeric(conf.lev) | conf.lev < 0 | conf.lev > 1 | length(conf.lev) > 1) { stop("Parameter 'conf.lev' must be a numeric number between 0 and 1.") } ## ncores if (!is.null(ncores)) { if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | length(ncores) > 1) { stop("Parameter 'ncores' must be a positive integer.") } } ## exp and obs (2) name_exp <- sort(names(dim(exp))) name_obs <- sort(names(dim(obs))) name_exp <- name_exp[-which(name_exp == dat_dim)] name_obs <- name_obs[-which(name_obs == dat_dim)] if (!is.null(memb_dim)) { name_exp <- name_exp[-which(name_exp == memb_dim)] name_obs <- name_obs[-which(name_obs == memb_dim)] } if(!all(dim(exp)[name_exp] == dim(obs)[name_obs])) { stop(paste0("Parameter 'exp' and 'obs' must have same length of ", "all dimension expect 'dat_dim' and 'memb_dim'.")) } if (dim(exp)[time_dim] < 3) { stop("The length of time_dim must be at least 3 to compute correlation.") } ############################### # Sort dimension name_exp <- names(dim(exp)) name_obs <- names(dim(obs)) order_obs <- match(name_exp, name_obs) obs <- Reorder(obs, order_obs) ############################### # Calculate Corr # Remove data along comp_dim dim if there is at least one NA between limits if (!is.null(comp_dim)) { if (is.null(limits)) { limits <- c(1, dim(obs)[comp_dim]) } pos <- which(names(dim(obs)) == comp_dim) obs_sub <- Subset(obs, pos, list(limits[1]:limits[2])) outrows <- is.na(MeanDims(obs_sub, pos, na.rm = FALSE, ncores = ncores)) outrows <- InsertDim(outrows, pos, dim(obs)[comp_dim], ncores = ncores) obs[which(outrows)] <- NA } if (is.null(memb_dim)) { # Define output_dims if (conf & pval) { output_dims <- list(corr = c('nexp', 'nobs'), p.val = c('nexp', 'nobs'), conf.lower = c('nexp', 'nobs'), conf.upper = c('nexp', 'nobs')) } else if (conf & !pval) { output_dims <- list(corr = c('nexp', 'nobs'), conf.lower = c('nexp', 'nobs'), conf.upper = c('nexp', 'nobs')) } else if (!conf & pval) { output_dims <- list(corr = c('nexp', 'nobs'), p.val = c('nexp', 'nobs')) } else { output_dims <- list(corr = c('nexp', 'nobs')) } res <- Apply(list(exp, obs), target_dims = list(c(time_dim, dat_dim), c(time_dim, dat_dim)), output_dims = output_dims, fun = .Corr, time_dim = time_dim, method = method, pval = pval, conf = conf, conf.lev = conf.lev, ncores = ncores) } else { if (!memb) { #ensemble mean name_exp <- names(dim(exp)) margin_dims_ind <- c(1:length(name_exp))[-which(name_exp == memb_dim)] exp <- apply(exp, margin_dims_ind, mean, na.rm = TRUE) #NOTE: remove NAs here obs <- apply(obs, margin_dims_ind, mean, na.rm = TRUE) # Define output_dims if (conf & pval) { output_dims <- list(corr = c('nexp', 'nobs'), p.val = c('nexp', 'nobs'), conf.lower = c('nexp', 'nobs'), conf.upper = c('nexp', 'nobs')) } else if (conf & !pval) { output_dims <- list(corr = c('nexp', 'nobs'), conf.lower = c('nexp', 'nobs'), conf.upper = c('nexp', 'nobs')) } else if (!conf & pval) { output_dims <- list(corr = c('nexp', 'nobs'), p.val = c('nexp', 'nobs')) } else { output_dims <- list(corr = c('nexp', 'nobs')) } res <- Apply(list(exp, obs), target_dims = list(c(time_dim, dat_dim), c(time_dim, dat_dim)), output_dims = output_dims, fun = .Corr, time_dim = time_dim, method = method, pval = pval, conf = conf, conf.lev = conf.lev, ncores_input = ncores, ncores = ncores) } else { # correlation for each member # Define output_dims if (conf & pval) { output_dims <- list(corr = c('nexp', 'nobs', 'exp_memb', 'obs_memb'), p.val = c('nexp', 'nobs', 'exp_memb', 'obs_memb'), conf.lower = c('nexp', 'nobs', 'exp_memb', 'obs_memb'), conf.upper = c('nexp', 'nobs', 'exp_memb', 'obs_memb')) } else if (conf & !pval) { output_dims <- list(corr = c('nexp', 'nobs', 'exp_memb', 'obs_memb'), conf.lower = c('nexp', 'nobs', 'exp_memb', 'obs_memb'), conf.upper = c('nexp', 'nobs', 'exp_memb', 'obs_memb')) } else if (!conf & pval) { output_dims <- list(corr = c('nexp', 'nobs', 'exp_memb', 'obs_memb'), p.val = c('nexp', 'nobs', 'exp_memb', 'obs_memb')) } else { output_dims <- list(corr = c('nexp', 'nobs', 'exp_memb', 'obs_memb')) } res <- Apply(list(exp, obs), target_dims = list(c(time_dim, dat_dim, memb_dim), c(time_dim, dat_dim, memb_dim)), output_dims = output_dims, fun = .Corr, time_dim = time_dim, method = method, pval = pval, conf = conf, conf.lev = conf.lev, ncores_input = ncores, ncores = ncores) } } return(res) } .Corr <- function(exp, obs, time_dim = 'sdate', method = 'pearson', conf = TRUE, pval = TRUE, conf.lev = 0.95, ncores_input = NULL) { if (length(dim(exp)) == 2) { # exp: [sdate, dat_exp] # obs: [sdate, dat_obs] nexp <- as.numeric(dim(exp)[2]) nobs <- as.numeric(dim(obs)[2]) # NOTE: Use sapply to replace the for loop CORR <- sapply(1:nobs, function(i) { sapply(1:nexp, function (x) { if (any(!is.na(exp[, x])) && sum(!is.na(obs[, i])) > 2) { #NOTE: Is this necessary? cor(exp[, x], obs[, i], use = "pairwise.complete.obs", method = method) } else { NA #CORR[, i] <- NA } }) }) if (is.null(dim(CORR))) { CORR <- array(CORR, dim = c(1, 1)) } } else { # member # exp: [sdate, dat_exp, memb_exp] # obs: [sdate, dat_obs, memb_obs] nexp <- as.numeric(dim(exp)[2]) nobs <- as.numeric(dim(obs)[2]) exp_memb <- as.numeric(dim(exp)[3]) obs_memb <- as.numeric(dim(obs)[3]) CORR <- array(dim = c(nexp = nexp, nobs = nobs, exp_memb = exp_memb, obs_memb = obs_memb)) for (j in 1:obs_memb) { for (y in 1:exp_memb) { CORR[, , y, j] <- sapply(1:nobs, function(i) { sapply(1:nexp, function (x) { if (any(!is.na(exp[, x, y])) && sum(!is.na(obs[, i, j])) > 2) { cor(exp[, x, y], obs[, i, j], use = "pairwise.complete.obs", method = method) } else { NA #CORR[, i] <- NA } }) }) } } } # if (pval) { # for (i in 1:nobs) { # p.val[, i] <- try(sapply(1:nexp, # function(x) {(cor.test(exp[, x], obs[, i], # use = "pairwise.complete.obs", # method = method)$p.value)/2}), silent = TRUE) # if (class(p.val[, i]) == 'character') { # p.val[, i] <- NA # } # } # } if (pval | conf) { if (method == "kendall" | method == "spearman") { tmp <- apply(obs, c(1:length(dim(obs)))[-1], rank) # for memb_dim = NULL, 2; for memb_dim, c(2, 3) names(dim(tmp))[1] <- time_dim eno <- Eno(tmp, time_dim, ncores = ncores_input) } else if (method == "pearson") { eno <- Eno(obs, time_dim, ncores = ncores_input) } if (length(dim(exp)) == 2) { eno_expand <- array(dim = c(nexp = nexp, nobs = nobs)) for (i in 1:nexp) { eno_expand[i, ] <- eno } } else { #member eno_expand <- array(dim = c(nexp = nexp, nobs = nobs, exp_memb = exp_memb, obs_memb = obs_memb)) for (i in 1:nexp) { for (j in 1:exp_memb) { eno_expand[i, , j, ] <- eno } } } } #############old################# #This doesn't return error but it's diff from cor.test() when method is spearman and kendall if (pval) { t <-sqrt(CORR * CORR * (eno_expand - 2) / (1 - (CORR ^ 2))) p.val <- pt(t, eno_expand - 2, lower.tail = FALSE) } ################################### if (conf) { conf.lower <- (1 - conf.lev) / 2 conf.upper <- 1 - conf.lower conflow <- tanh(atanh(CORR) + qnorm(conf.lower) / sqrt(eno_expand - 3)) confhigh <- tanh(atanh(CORR) + qnorm(conf.upper) / sqrt(eno_expand - 3)) } if (pval & conf) { res <- list(corr = CORR, p.val = p.val, conf.lower = conflow, conf.upper = confhigh) } else if (pval & !conf) { res <- list(corr = CORR, p.val = p.val) } else if (!pval & conf) { res <- list(corr = CORR, conf.lower = conflow, conf.upper = confhigh) } else { res <- list(corr = CORR) } return(res) }