QThreshold.R 12.6 KB
Newer Older
nperez's avatar
nperez committed
#'Transform an absolute threshold into probabilities
#'
#'From the user's perspective, an absolute threshold can be very useful for a 
#'specific needs (e.g.: grape variety). However, this absolute threshold could 
#'be transformed to a relative threshold in order to get its frequency in a given 
#'dataset. Therefore, the function \code{QThreshold} returns the probability of 
#'an absolute threshold. This is done by computing the Cumulative Distribution 
#'Function of a sample and leaving one out. The sample used will depend on the 
#'dimensions of the data provided and the dimension names provided in sdate_dim 
#'and memb_dim parameters:
#' 
nperez's avatar
nperez committed
#'\itemize{
#'  \item{If a forecast (hindcast) has dimensions member and start date, and
#'        both must be used in the sample, their names should be passed in 
#'        sdate_dim and memb_dim.}
#'  \item{If a forecast (hindcast) has dimensions member and start date, and
#'        only start date must be used in the sample (the calculation is done in
#'        each separate member), memb_dim can be set to NULL.}
#'  \item{If a reference (observations) has start date dimension, the sample
#'        used is the start date dimension.}
#'  \item{If a reference (observations) doesn't have start date dimension, 
#'        the sample used must be especified in sdate_dim parameter.}
#'}
nperez's avatar
nperez committed
#'
#'@param data An 's2dv_cube' object as provided function \code{CST_Load} in 
#'  package CSTools.
#'@param threshold An 's2dv_cube' object as output of a 'CST_' function in the 
#'  same units as parameter 'data' and with the common dimensions of the element
#'  'data' of the same length. A single scalar is also possible.
#'@param start An optional parameter to defined the initial date of the period 
#'  to select from the data by providing a list of two elements: the initial 
#'  date of the period and the initial month of the period. By default it is set
#'  to NULL and the indicator is computed using all the data provided in 
#'  \code{data}.
#'@param end An optional parameter to defined the final date of the period to 
#'  select from the data by providing a list of two elements: the final day of 
#'  the period and the final month of the period. By default it is set to NULL 
#'  and the indicator is computed using all the data provided in \code{data}.
#'@param time_dim A character string indicating the name of the temporal 
#'  dimension. By default, it is set to 'ftime'. More than one dimension name 
#'  matching the dimensions provided in the object \code{data$data} can be 
#'  specified. This dimension is required to subset the data in a requested 
#'  period.
#'@param memb_dim A character string indicating the name of the dimension in 
#'  which the ensemble members are stored.
#'@param sdate_dim A character string indicating the name of the dimension in 
#'  which the initialization dates are stored. 
#'@param ncores An integer indicating the number of cores to use in parallel 
#'  computation.
nperez's avatar
nperez committed
#'
#'@return An 's2dv_cube' object containing the probability of an absolute 
#'threshold in the element \code{data}.
nperez's avatar
nperez committed
#'
#'@examples
#'threshold <- 26
#'exp <- NULL
#'exp$data <- array(abs(rnorm(112)*26), dim = c(member = 7, sdate = 8, ftime = 2))
#'class(exp) <- 's2dv_cube'
nperez's avatar
nperez committed
#'exp_probs <- CST_QThreshold(exp, threshold)
#'exp$data <- array(rnorm(5 * 3 * 214 * 2),
#'                    c(member = 5, sdate = 3, ftime = 214, lon = 2)) 
#'exp$attrs$Dates <- c(seq(as.Date("01-05-2000", format = "%d-%m-%Y"), 
#'                         as.Date("30-11-2000", format = "%d-%m-%Y"), by = 'day'),
#'                     seq(as.Date("01-05-2001", format = "%d-%m-%Y"), 
#'                         as.Date("30-11-2001", format = "%d-%m-%Y"), by = 'day'),
#'                     seq(as.Date("01-05-2002", format = "%d-%m-%Y"), 
#'                         as.Date("30-11-2002", format = "%d-%m-%Y"), by = 'day'))
#'exp_probs <- CST_QThreshold(exp, threshold)
#' 
#'@import multiApply
nperez's avatar
nperez committed
#'@export
CST_QThreshold <- function(data, threshold, start = NULL, end = NULL,
                           time_dim = 'ftime', memb_dim = 'member', 
                           sdate_dim = 'sdate', ncores = NULL) {
  # Check 's2dv_cube'
  if (!inherits(data, 's2dv_cube')) {
    stop("Parameter 'data' must be of the class 's2dv_cube'.")
nperez's avatar
nperez committed
  }
  # Dates subset
nperez's avatar
nperez committed
  if (!is.null(start) && !is.null(end)) {
    if (is.null(dim(data$attrs$Dates))) {
      warning("Dimensions in 'data' element 'attrs$Dates' are missed and ",
              "all data would be used.")
      start <- NULL
      end <- NULL
nperez's avatar
nperez committed
    }
  }
  if (inherits(threshold, 's2dv_cube')) {
    threshold <- threshold$data
  }
  probs <- QThreshold(data$data, threshold, dates = data$attrs$Dates, 
                      start, end, time_dim = time_dim, memb_dim = memb_dim,
nperez's avatar
nperez committed
                      sdate_dim = sdate_dim, ncores = ncores)
  data$data <- probs
  if (!is.null(start) && !is.null(end)) {
    data$attrs$Dates <- SelectPeriodOnDates(dates = data$attrs$Dates,
                                            start = start, end = end, 
                                            time_dim = time_dim, 
                                            ncores = ncores)
nperez's avatar
nperez committed
  }
  return(data)
}
#'Transform an absolute threshold into probabilities
#'
#'From the user's perspective, an absolute threshold can be very useful for a 
#'specific needs (e.g.: grape variety). However, this absolute threshold could 
#'be transformed to a relative threshold in order to get its frequency in a given 
#'dataset. Therefore, the function \code{QThreshold} returns the probability of 
#'an absolute threshold. This is done by computing the Cumulative Distribution 
#'Function of a sample and leaving-one-ot. The sample used will depend on the 
#'dimensions of the data provided and the dimension names provided in sdate_dim 
#'and memb_dim parameters:
nperez's avatar
nperez committed
#'\itemize{
#'  \item{If a forecast (hindcast) has dimensions member and start date, and
#'        both must be used in the sample, their names should be passed in 
#'        sdate_dim and memb_dim.}
#'  \item{If a forecast (hindcast) has dimensions member and start date, and
#'        only start date must be used in the sample (the calculation is done in
#'        each separate member), memb_dim can be set to NULL.}
#'  \item{If a reference (observations) has start date dimension, the sample
#'        used is the start date dimension.}
#'  \item{If a reference (observations) doesn't have start date dimension, 
#'        the sample used must be especified in sdate_dim parameter.}
#'}
nperez's avatar
nperez committed
#'
#'@param data A multidimensional array with named dimensions.
#'@param threshold A multidimensional array with named dimensions in the same 
#'  units as parameter 'data' and with the common dimensions of the element 
#'  'data' of the same length.
#'@param dates A vector of dates or a multidimensional array of dates with named
#'  dimensions matching the dimensions on parameter 'data'. By default it is 
#'  NULL, to select a period this parameter must be provided.
#'@param start An optional parameter to defined the initial date of the period 
#'  to select from the data by providing a list of two elements: the initial 
#'  date of the period and the initial month of the period. By default it is set
#'  to NULL and the indicator is computed using all the data provided in 
#'  \code{data}.
#'@param end An optional parameter to defined the final date of the period to 
#'  select from the data by providing a list of two elements: the final day of 
#'  the period and the final month of the period. By default it is set to NULL 
#'  and the indicator is computed using all the data provided in \code{data}.
#'@param time_dim A character string indicating the name of the temporal 
#'  dimension. By default, it is set to 'ftime'. More than one dimension name 
#'  matching the dimensions provided in the object \code{data$data} can be 
#'  specified. This dimension is required to subset the data in a requested 
#'  period.
#'@param memb_dim A character string indicating the name of the dimension in 
#'  which the ensemble members are stored.
#'@param sdate_dim A character string indicating the name of the dimension in 
#'  which the initialization dates are stored. 
#'@param ncores An integer indicating the number of cores to use in parallel 
#'  computation.
nperez's avatar
nperez committed
#'
#'@return A multidimensional array with named dimensions containing the 
#'probability of an absolute threshold in the element \code{data}.
nperez's avatar
nperez committed
#'
#'@examples
#'threshold = 25
nperez's avatar
nperez committed
#'data <- array(rnorm(5 * 3 * 20 * 2, mean = 26), 
#'              c(member = 5, sdate = 3, time = 20, lon = 2)) 
nperez's avatar
nperez committed
#'thres_q <- QThreshold(data, threshold)
#' 
#'@import multiApply
nperez's avatar
nperez committed
#'@export
QThreshold <- function(data, threshold, dates = NULL, start = NULL, end = NULL,
                       time_dim = 'time', memb_dim = 'member', sdate_dim = 'sdate',
                       ncores = NULL) {
  # Initial checks
  ## data
nperez's avatar
nperez committed
  if (is.null(data)) {
    stop("Parameter 'data' cannot be NULL.")
  }
  if (!is.numeric(data)) {
    stop("Parameter 'data' must be numeric.")
  }
  if (!is.array(data)) {
    dim(data) <- c(length(data), 1)
    names(dim(data)) <- c(memb_dim, sdate_dim)
  }
  if (is.null(names(dim(data)))) {
    stop("Parameter 'data' must have named dimensions.")
  }
  ## threshold
nperez's avatar
nperez committed
  if (is.null(threshold)) {
      stop("Parameter 'threshold' cannot be NULL.")
  }
   if (!is.numeric(threshold)) {
    stop("Parameter 'threshold' must be numeric.")
  }
  if (!is.array(threshold) && length(threshold) > 1) {
    dim(threshold) <- length(threshold)
    names(dim(threshold)) <- time_dim
  } else if (length(threshold) == 1) {
    dim(threshold) <- NULL
  }
  if (sdate_dim %in% names(dim(threshold))) {
    stop("Parameter threshold cannot have dimension 'sdate_dim'.") 
nperez's avatar
nperez committed
  }
  if (is.null(names(dim(threshold))) && length(threshold) > 1) {
    stop("Parameter 'threshold' must have named dimensions.")
  }
  if (is.null(memb_dim)) {
    memb_dim <- 99999
  }
nperez's avatar
nperez committed
  if (!is.null(dates)) {
    if (!is.null(start) && !is.null(end)) {
      if (!any(c(is.list(start), is.list(end)))) {
        stop("Parameter 'start' and 'end' must be lists indicating the ",
             "day and the month of the period start and end.")
      }
      if (time_dim %in% names(dim(threshold))) {
        if (dim(threshold)[time_dim] == dim(data)[time_dim]) {
          if (!is.null(dim(dates)) && sdate_dim %in% names(dim(dates))) {
            dates_thres <- .arraysubset(dates, dim = sdate_dim, value = 1)
            threshold <- SelectPeriodOnData(data = threshold, dates = dates_thres, start, end,
                                            time_dim = time_dim, ncores = ncores)
nperez's avatar
nperez committed
          } else {
            threshold <- SelectPeriodOnData(threshold, dates, start, end,
                                            time_dim = time_dim, ncores = ncores)
nperez's avatar
nperez committed
          }
        }
      }
      data <- SelectPeriodOnData(data, dates, start, end, 
                                 time_dim = time_dim, ncores = ncores)
    }
  }
  if (length(threshold) == 1) {
    if (memb_dim %in% names(dim(data))) {
      probs <- Apply(list(data), target_dims = c(memb_dim, sdate_dim),
                     fun = .qthreshold_exp,
                     threshold, ncores = ncores)$output1
    } else {
nperez's avatar
nperez committed
      probs <- Apply(list(data), target_dims = sdate_dim, fun = .qthreshold_obs,
nperez's avatar
nperez committed
                     threshold, ncores = ncores)$output1
    }
  } else {
    target_thres <- NULL
nperez's avatar
nperez committed
    if (memb_dim %in% names(dim(data))) {
      if (memb_dim %in% names(dim(threshold))) {
      # comparison member by member
        probs <- Apply(list(data, threshold), 
                       target_dims = list(sdate_dim, NULL), 
                       fun = .qthreshold_obs, ncores = ncores)$output1
      } else {
        probs <- Apply(list(data, threshold),
                       target_dims = list(c(memb_dim, sdate_dim), NULL),
                       fun = .qthreshold_exp, ncores = ncores)$output1
      }
    } else {
      probs <- Apply(list(data, threshold), target_dims = list(sdate_dim, NULL),
                     fun = .qthreshold_obs, ncores = ncores)$output1
    }
  } 
  return(probs)
}
# By splitting the atomic function, a conditional repetition is avoided 
# inside the Apply loops
.qthreshold_obs <- function(data, threshold) {
  # dims data: sdate 
  dims <- dim(data)
  # no 'member' involving 
  qres <- unlist(lapply(1:dims, function(x) { 
                        ecdf(data[-x])(threshold)}))
nperez's avatar
nperez committed
  dim(qres) <- c(dims)
  return(qres)
}
.qthreshold_exp <- function(data, threshold) {
  qres <- unlist(
  lapply(1:(dim(data)[1]), function(x) { # dim 1: member
            lapply(1:(dim(data)[2]), function(y) { # dim 2: sdate
                   ecdf(as.vector(data[,-y]))(threshold)
                   })
nperez's avatar
nperez committed
        }))
  dim(qres) <- c(dim(data)[2], dim(data)[1])
  return(qres)
}