PeriodStandardization.R


#'Compute the Standardization of Precipitation-Evapotranspiration Index
#'
#'@param data An 's2dv_cube' that element 'data' stores a multidimensional 
#'  array containing the data to be standardized.
#'@param data_cor An 's2dv_cube' that element 'data' stores a multidimensional 
#'  array containing the data in which the standardization should be applied 
#'  using the fitting parameters from 'data'.
#'@param time_dim A character string indicating the name of the temporal 
#'  dimension. By default, it is set to 'syear'. 
#'@param leadtime_dim A character string indicating the name of the temporal 
#'  dimension. By default, it is set to 'time'. 
#'@param memb_dim A character string indicating the name of the dimension in 
#'  which the ensemble members are stored. When set it to NULL, threshold is 
#'  computed for individual members.
#'@param accum An integer value indicating the number of months for the 
#'  accumulation for each variable. When it is greater than 1, the result will 
#'  be filled with NA until the accum time_dim dimension number due to the 
#'  accumulation to previous months.
#'@param ref_period A list with two numeric values with the starting and end 
#'  points of the reference period used for computing the index. The default 
#'  value is NULL indicating that the first and end values in data will be 
#'  used as starting and end points.
#'@param param_error A numeric value with the error accepted.
#'@param handle_infinity A logical value wether to return Infinite values (TRUE)
#'  or not (FALSE).
#'@param method A character string indicating the standardization method used. 
#'  If can be: 'parametric' or 'non-parametric'. It is set to 'parametric' by 
#'  default.
#'@param distribution A character string indicating the name of the distribution 
#'  function to be used for computing the SPEI. The accepted names are: 
#'  'log-Logistic' and 'Gamma'. It is set to 'log-Logistic' by default. The 
#'  'Gamma' method only works when only precipitation is provided and other 
#'  variables are 0 because it is positive defined (SPI indicator).
#'@param na.rm A logical value indicating whether NA values should be removed 
#'  from data. It is FALSE by default. If it is FALSE and there are NA values, 
#'  (if standardization is TRUE) all values of other dimensions except time_dim 
#'  and leadtime_dim will be set to NA directly. On the other hand, if it is 
#'  TRUE, if the data from other dimensions except time_dim and leadtime_dim is 
#'  not reaching 4 values, it is not enough values to estimate the parameters 
#'  and the result will include NA.
#'@param ncores An integer value indicating the number of cores to use in 
#'  parallel computation.
#' 
#'@return An object of class \code{s2dv_cube} containing the standardized data. 
#'If 'data_cor' is provided the standardizaton is applied to it using 'data' 
#'to adjust it.
#' 
#'@examples 
#'dims <-  c(syear = 6, time = 3, latitude = 2, ensemble = 25)
#'data <- NULL
#'data$data <- array(rnorm(600, -204.1, 78.1), dim = dims)
#'class(data) <- 's2dv_cube'
#'SPEI <- CST_PeriodStandardization(data = data, accum = 2)
#'@export
CST_PeriodStandardization <- function(data, data_cor = NULL, time_dim = 'syear', 
                                      leadtime_dim = 'time', memb_dim = 'ensemble',
                                      accum = NULL, ref_period = NULL, param_error = -9999,
                                      handle_infinity = FALSE, method = 'parametric', 
                                      distribution = 'log-Logistic', 
                                      na.rm = FALSE, ncores = NULL) {
  # Check 's2dv_cube'
  if (is.null(data)) {
    stop("Parameter 'exp' cannot be NULL.")
  }
  if (!inherits(data, 's2dv_cube')) {
    stop("Parameter 'data' must be of 's2dv_cube' class.")
  }
  if (!is.null(data_cor)) {
    if (!inherits(data_cor, 's2dv_cube')) {
      stop("Parameter 'data_cor' must be of 's2dv_cube' class.")
    }
  }
  std <- PeriodStandardization(data = data$data, data_cor = data_cor$data, 
                               time_dim = time_dim, leadtime_dim = leadtime_dim, 
                               memb_dim = memb_dim, accum = accum, 
                               ref_period = ref_period, param_error = param_error,
                               handle_infinity = handle_infinity, method = method, 
                               distribution = distribution, 
                               na.rm = na.rm, ncores = ncores)
  if (is.null(data_cor)) {
    data$data <- std
    data$attrs$Variable$varName <- paste0(data$attrs$Variable$varName, ' standardized')
    return(data)
  } else {
    data_cor$data <- std
    data_cor$attrs$Variable$varName <- paste0(data_cor$attrs$Variable$varName, ' standardized')
    data_cor$attrs$Datasets <- c(data_cor$attrs$Datasets, data$attrs$Datasets)
    data_cor$attrs$source_files <- c(data_cor$attrs$source_files, data$attrs$source_files)
    return(data_cor)
  }
}

#'Compute the Standardization of Precipitation-Evapotranspiration Index
#'
#'@param data A multidimensional array containing the data to be standardized.
#'@param data_cor A multidimensional array containing the data in which the 
#'  standardization should be applied using the fitting parameters from 'data'.
#'@param time_dim A character string indicating the name of the temporal 
#'  dimension. By default, it is set to 'syear'. 
#'@param leadtime_dim A character string indicating the name of the temporal 
#'  dimension. By default, it is set to 'time'. 
#'@param memb_dim A character string indicating the name of the dimension in 
#'  which the ensemble members are stored. When set it to NULL, threshold is 
#'  computed for individual members.
#'@param accum An integer value indicating the number of months for the 
#'  accumulation for each variable. When it is greater than 1, the result will 
#'  be filled with NA until the accum time_dim dimension number due to the 
#'  accumulation to previous months.
#'@param ref_period A list with two numeric values with the starting and end 
#'  points of the reference period used for computing the index. The default 
#'  value is NULL indicating that the first and end values in data will be 
#'  used as starting and end points.
#'@param param_error A numeric value with the error accepted.
#'@param handle_infinity A logical value wether to return Infinite values (TRUE)
#'  or not (FALSE).
#'@param method A character string indicating the standardization method used. 
#'  If can be: 'parametric' or 'non-parametric'. It is set to 'parametric' by 
#'  default.
#'@param distribution A character string indicating the name of the distribution 
#'  function to be used for computing the SPEI. The accepted names are: 
#'  'log-Logistic' and 'Gamma'. It is set to 'log-Logistic' by default. The 
#'  'Gamma' method only works when only precipitation is provided and other 
#'  variables are 0 because it is positive defined (SPI indicator).
#'@param na.rm A logical value indicating whether NA values should be removed 
#'  from data. It is FALSE by default. If it is FALSE and there are NA values, 
#'  (if standardization is TRUE) all values of other dimensions except time_dim 
#'  and leadtime_dim will be set to NA directly. On the other hand, if it is 
#'  TRUE, if the data from other dimensions except time_dim and leadtime_dim is 
#'  not reaching 4 values, it is not enough values to estimate the parameters 
#'  and the result will include NA.
#'@param ncores An integer value indicating the number of cores to use in 
#'  parallel computation.
#' 
#'@return A multidimensional array containing the standardized data. 
#'If 'data_cor' is provided the standardizaton is applied to it using 'data' 
#'to adjust it.
#' 
#'@examples
#'dims <-  c(syear = 6, time = 2, latitude = 2, ensemble = 25)
#'dimscor <-  c(syear = 1, time = 2, latitude = 2, ensemble = 25)
#'data <- array(rnorm(600, -194.5, 64.8), dim = dims)
#'datacor <- array(rnorm(100, -217.8, 68.29), dim = dimscor)
#'
#'SPEI <- PeriodStandardization(data = data, accum = 2)
#'SPEIcor <- PeriodStandardization(data = data, data_cor = datacor, accum = 2)
#'@import multiApply
#'@import ClimProjDiags
#'@import TLMoments
#'@import lmomco
#'@import lmom
#'@export
PeriodStandardization <- function(data, data_cor = NULL, time_dim = 'syear', 
                                  leadtime_dim = 'time', memb_dim = 'ensemble',
                                  accum = NULL, ref_period = NULL, param_error = -9999,
                                  handle_infinity = FALSE, method = 'parametric', 
                                  distribution = 'log-Logistic', 
                                  na.rm = FALSE, ncores = NULL) {
  # Check inputs
  ## data
  if (!is.array(data)) {
    stop("Parameter 'data' must be a numeric array.")
  }
  if (is.null(names(dim(data)))) {
    stop("Parameter 'data' must have dimension names.")
  }
  ## data_cor
  if (!is.null(data_cor)) {
    if (!is.array(data_cor)) {
      stop("Parameter 'data_cor' must be a numeric array.")
    }
    if (is.null(names(dim(data_cor)))) {
      stop("Parameter 'data_cor' must have dimension names.")
    }
  }
  ## time_dim
  if (!is.character(time_dim) | length(time_dim) != 1) {
    stop("Parameter 'time_dim' must be a character string.")
  }
  if (!time_dim %in% names(dim(data))) {
    stop("Parameter 'time_dim' is not found in 'data' dimension.")
  }
  if (!is.null(data_cor)) {
    if (!time_dim %in% names(dim(data_cor))) {
      stop("Parameter 'time_dim' is not found in 'data_cor' dimension.")
    } 
  }
  ## leadtime_dim
  if (!is.character(leadtime_dim) | length(leadtime_dim) != 1) {
    stop("Parameter 'leadtime_dim' must be a character string.")
  }
  if (!leadtime_dim %in% names(dim(data))) {
    stop("Parameter 'leadtime_dim' is not found in 'data' dimension.")
  }
  if (!is.null(data_cor)) {
    if (!leadtime_dim %in% names(dim(data_cor))) {
      stop("Parameter 'leadtime_dim' is not found in 'data_cor' dimension.")
    } 
  }
  ## memb_dim
  if (!is.character(memb_dim) | length(memb_dim) != 1) {
    stop("Parameter 'memb_dim' must be a character string.")
  }
  if (!memb_dim %in% names(dim(data))) {
    stop("Parameter 'memb_dim' is not found in 'data' dimension.")
  }
  if (!is.null(data_cor)) {
    if (!memb_dim %in% names(dim(data_cor))) {
      stop("Parameter 'memb_dim' is not found in 'data_cor' dimension.")
    } 
  }
  ## accum
  if (accum > dim(data)[leadtime_dim]) {
    stop(paste0("Cannot compute accumulation of ", accum, " months because ", 
                "loaded data has only ", dim(data)[leadtime_dim], " months."))
  }
  ## ref_period
  if (!is.null(ref_period)) {
    if (length(ref_period) != 2) {
      warning("Parameter 'ref_period' must be of length two indicating the ",
              "first and end years of the reference period. It will not ", 
              "be used.")
      ref_period <- NULL
    } else if (!all(sapply(ref_period, is.numeric))) {
      warning("Parameter 'ref_period' must be a numeric vector indicating the ", 
              "'start' and 'end' years of the reference period. It will not ",
              "be used.")
      ref_period <- NULL
    } else if (ref_period[[1]] > ref_period[[2]]) {
      warning("In parameter 'ref_period' 'start' cannot be after 'end'. It ",
              "will not be used.")
      ref_period <- NULL
    } else if (!all(unlist(ref_period) %in% year(dates_exp))) {
      warning("Parameter 'ref_period' contain years outside the dates. ", 
              "It will not be used.")
      ref_period <- NULL
    } else {
      years <- year(ClimProjDiags::Subset(dates_exp, along = leadtime_dim, 
                                          indices = 1))
      ref_period[[1]] <- which(ref_period[[1]] == years)
      ref_period[[2]] <- which(ref_period[[2]] == years)
    }
  }
  ## handle_infinity
  if (!is.logical(handle_infinity)) {
    stop("Parameter 'handle_infinity' must be a logical value.")
  }
  ## method
  if (!(method %in% c('parametric', 'non-parametric'))) {
    stop("Parameter 'method' must be a character string containing one of ", 
         "the following methods: 'parametric' or 'non-parametric'.")
  }
  ## distribution
  if (!(distribution %in% c('log-Logistic', 'Gamma', 'PearsonIII'))) {
    stop("Parameter 'distribution' must be a character string containing one ", 
         "of the following distributions: 'log-Logistic', 'Gamma' or ", 
         "'PearsonIII'.")
  }
  ## na.rm
  if (!is.logical(na.rm)) {
    stop("Parameter 'na.rm' must be logical.")
  }
  ## ncores
  if (!is.null(ncores)) {
    if (!is.numeric(ncores) | any(ncores %% 1 != 0) | any(ncores < 0) |
        length(ncores) > 1) {
      stop("Parameter 'ncores' must be a positive integer.")
    }
  }

  target_dims <- c(leadtime_dim, time_dim, memb_dim)

  if (is.null(ref_period)) {
    ref_start <- NULL
    ref_end <- NULL
  } else {
    ref_start <- ref_period[[1]]
    ref_end <- ref_period[[2]]
  }

  # Standardization
  if (is.null(data_cor)) {
    spei <- Apply(data = list(data), 
                  target_dims = target_dims,
                  fun = .standardization, 
                  leadtime_dim = leadtime_dim, 
                  ref_period = ref_period, handle_infinity = handle_infinity,
                  param_error = param_error, 
                  method = method, distribution = distribution, 
                  na.rm = na.rm, ncores = ncores)$output1
  } else {
    spei <- Apply(data = list(data, data_cor), target_dims = target_dims,
                  fun = .standardization,
                  leadtime_dim = leadtime_dim, 
                  ref_period = ref_period, handle_infinity = handle_infinity,
                  param_error = param_error, 
                  method = method, distribution = distribution, 
                  na.rm = na.rm, ncores = ncores)$output1
  }

  # add NA
  if (!is.null(accum)) {
    spei <- Apply(data = list(spei), target_dims = leadtime_dim,
                  output_dims = leadtime_dim,
                  fun = function(x, accum, leadtime_dim) {
                    res <- c(rep(NA, accum-1), x)
                    return(res)
                  }, accum = accum, leadtime_dim = leadtime_dim)$output1
  }
  if (is.null(data_cor)) {
    pos <- match(names(dim(data)), names(dim(spei)))
    spei <- aperm(spei, pos)
  } else {
    pos <- match(names(dim(data_cor)), names(dim(spei)))
    spei <- aperm(spei, pos)
  }
  return(spei)
}

.standardization <- function(data, data_cor = NULL, leadtime_dim = 'time', 
                             ref_period = NULL, handle_infinity = FALSE, param_error = -9999,
                             method = 'parametric', distribution = 'log-Logistic',
                             na.rm = FALSE) {
  # data: [leadtime_dim, time_dim, memb_dim]
  dims <- dim(data)[-1]
  fit = 'ub-pwm'

  coef = switch(distribution,
                "Gamma" = array(NA, dim = 2, dimnames = list(c('alpha', 'beta'))),
                "log-Logistic" = array(NA, dim = 3, dimnames = list(c('xi', 'alpha', 'kappa'))),
                "PearsonIII" = array(NA, dim = 3, dimnames = list(c('mu', 'sigma', 'gamma'))))

  if (is.null(data_cor)) {
    # cross_val = TRUE
    spei_mod <- data*NA
    for (ff in 1:dim(data)[leadtime_dim]) {
      data2 <- data[ff, , ]
      dim(data2) <- dims
      if (method == 'non-parametric') {
        bp <- matrix(0, length(data2), 1)
        for (i in 1:length(data2)) {
          bp[i,1] = sum(data2[] <= data2[i], na.rm = na.rm);  # Writes the rank of the data
        }
        std_index <- qnorm((bp - 0.44)/(length(data2) + 0.12))
        dim(std_index) <- dims
        spei_mod[ff, , ] <- std_index
      } else {
        if (!is.null(ref_start) && !is.null(ref_end)) {
          data_fit <- window(data2, ref_start, ref_end)	
        } else {
          data_fit <- data2
        }
        for (nsd in 1:dim(data)[time_dim]) {
          acu <- as.vector(data_fit[-nsd, ])
          acu_sorted <- sort.default(acu, method = "quick")
          if (na.rm) {
            acu_sorted <- acu_sorted[!is.na(acu_sorted)]
          }
          if (!any(is.na(acu_sorted)) & length(acu_sorted) != 0) {
            acu_sd <- sd(acu_sorted)
            if (!is.na(acu_sd) & acu_sd != 0) {
              if (distribution != "log-Logistic") {
                acu_sorted <- acu_sorted[acu_sorted > 0]
              }
              if (length(acu_sorted) >= 4) {
                f_params <- .std(acu_sorted, fit, distribution)
              }
              if (all(is.na(f_params))) {
                cdf_res <- NA
              } else {
                f_params <- f_params[which(!is.na(f_params))]
                cdf_res = switch(distribution,
                                "log-Logistic" = lmom::cdfglo(data2, f_params),
                                "Gamma" = lmom::cdfgam(data2, f_params),
                                "PearsonIII" = lmom::cdfpe3(data2, f_params))
              }
              std_index_cv <- array(qnorm(cdf_res), dim = dims)
              spei_mod[ff, nsd, ] <- std_index_cv[nsd, ]
            }
          }
        }
      }
    }
  } else {
    # cross_val = FALSE
    spei_mod <- data_cor*NA
    dimscor <- dim(data_cor)[-1]
    for (ff in 1:dim(data)[leadtime_dim]) {
      data_cor2 <- data_cor[ff, , ]
      dim(data_cor2) <- dimscor
      if (method == 'non-parametric') {
        bp <- matrix(0, length(data_cor2), 1)
        for (i in 1:length(data_cor2)) {
          bp[i,1] = sum(data_cor2[] <= data_cor2[i], na.rm = na.rm);  # Writes the rank of the data
        }
        std_index <- qnorm((bp - 0.44)/(length(data_cor2) + 0.12))
        dim(std_index) <- dimscor
        spei_mod[ff, , ] <- std_index
      } else {
        data2 <- data[ff, , ]
        dim(data2) <- dims
        if (!is.null(ref_start) && !is.null(ref_end)) {
          data_fit <- window(data2, ref_start, ref_end)	
        } else {
          data_fit <- data2
        }
        acu <- as.vector(data_fit)
        acu_sorted <- sort.default(acu, method = "quick")
        if (na.rm) {
          acu_sorted <- acu_sorted[!is.na(acu_sorted)]
        }
        if (!any(is.na(acu_sorted)) & length(acu_sorted) != 0) {
          acu_sd <- sd(acu_sorted)
          if (!is.na(acu_sd) & acu_sd != 0) {
            if (distribution != "log-Logistic") {
              acu_sorted <- acu_sorted[acu_sorted > 0]
            }
            if (length(acu_sorted) >= 4) {
              f_params <- .std(data = acu_sorted, fit = fit, 
                               distribution = distribution)
            }
            if (all(is.na(f_params))) {
              cdf_res <- NA
            } else {
              f_params <- f_params[which(!is.na(f_params))]
              cdf_res = switch(distribution,
                              "log-Logistic" = lmom::cdfglo(data_cor2, f_params),
                              "Gamma" = lmom::cdfgam(data_cor2, f_params),
                              "PearsonIII" = lmom::cdfpe3(data_cor2, f_params))
            }
            std_index_cv <- array(qnorm(cdf_res), dim = dimscor)
            spei_mod[ff, , ] <- std_index_cv
          }
        }
      }
    }
  }
  if (handle_infinity) { 
    # could also use "param_error" ?; we are giving it the min/max value of the grid point
    spei_mod[is.infinite(spei_mod) & spei_mod < 0] <- min(spei_mod[!is.infinite(spei_mod)])
    spei_mod[is.infinite(spei_mod) & spei_mod > 0] <- max(spei_mod[!is.infinite(spei_mod)]) 
  }
  return(spei_mod)
}

.std <- function(data, fit, distribution) {
  pwm = switch(fit,
               "pp-pwm" = pwm.pp(data, -0.35, 0, nmom = 3),
               pwm.ub(data, nmom = 3)
               # TLMoments::PWM(data, order = 0:2)
               )
  lmom <- pwm2lmom(pwm) 
  if (!(!are.lmom.valid(lmom) || anyNA(lmom[[1]]) || any(is.nan(lmom[[1]])))) {
    fortran_vec = c(lmom$lambdas[1:2], lmom$ratios[3])
    params = switch(distribution,
                    "log-Logistic" = tryCatch(lmom::pelglo(fortran_vec), 
                                              error = function(e){parglo(lmom)$para}),
                    "Gamma" = tryCatch(lmom::pelgam(fortran_vec), 
                                       error = function(e){pargam(lmom)$para}),
                    "PearsonIII" = tryCatch(lmom::pelpe3(fortran_vec), 
                                            error = function(e){parpe3(lmom)$para}))
    if (distribution == 'log-Logistic' && fit == 'max-lik') {
      params = parglo.maxlik(data, params)$para
    }
    return(params)
  } else {
    return(NA)
  }
}