#' @rdname CST_WeatherRegimes
#' @title Function for Calculating the Cluster analysis
#'
#' @author Verónica Torralba - BSC, \email{veronica.torralba@bsc.es}
#'
#' @description This function computes the weather regimes from a cluster analysis.
#'It is applied to the array \code{data} of a 's2dv_cube' object. The dimensionality of this object
#'can also be reduced by using the PCs obtained from an EOF analysis to filter the dataset.
#'The cluster analysis can be performed with the traditional k-means or with any of the methods
#'included in hclust (stats package).
#'
#'@references Cortesi, N., V. Torralba, N. González-Reviriego, A. Soret, and F. J. Doblas-Reyes (2019).
#' Characterization of European wind speed variability using weather regimes. Climate Dynamics, 53,
#' 4961–4976, doi:10.1007/s00382-019-04839-5.
#'@references Torralba, V. (2019) Seasonal climate prediction for the wind energy sector: methods and tools
#' for the development of a climate service. Thesis. Available online: \url{https://eprints.ucm.es/56841/}
#'
#'@param data A 's2dv_cube' object.
#'@param EOFS Whether to compute the EOFs (default = TRUE) or not (FALSE) over the data.
#'@param neofs Number of modes to be kept (default = 30). Only used if EOFS = TRUE.
#'@param varThreshold Value with the percentage of variance to be explained by the PCs.
#' Only sufficient PCs to explain this much variance will be used in the clustering.
#'@param ncenters Number of clusters to be calculated with the clustering function.
#'@param method Different options to estimate the clusters. The most traditional approach is the k-means analysis (default = 'kmeans'),
#' but the function also supports the methods included in hclust. These methods are:
#'"ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).
#' For more details about these methods see the hclust function documentation included in the stats package.
#'@param nstart Parameter for the cluster analysis determining how many random sets to choose. Only used if method = 'kmeans'.
#'@param iter.max Parameter to select the maximum number of iterations allowed. Only used if method = 'kmeans'.
#'@param ncores The number of multicore threads to use for parallel computation.
#'@return A 's2dv_cube' object with \code{$data} containing the composites of each cluster (lat, lon, cluster) and
#'        \code{$statistics} containing \code{$pvalue} (the p-value of the composites obtained through a t-test
#'        that accounts for the serial dependence of the data, with the same structure as the composites),
#'        \code{$cluster} (a time series of integers from 1 to ncenters indicating the cluster to which each
#'        time step is allocated), \code{$persistence} (the mean number of consecutive time steps spent in each
#'        cluster; only if method = "kmeans" has been selected) and \code{$frequency} (the percentage of days in
#'        a month/season belonging to each cluster; only if method = "kmeans" has been selected).
#'@import s2dverification
#'@import multiApply
#'@examples
#'res1 <- CST_WeatherRegimes(data = lonlat_data$obs, EOFS = FALSE, ncenters = 4)
#'res2 <- CST_WeatherRegimes(data = lonlat_data$obs, EOFS = TRUE, ncenters = 3)
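#'# An illustrative sketch (not run): pre-filter the data with EOFs, retaining
#'# only the PCs needed to explain 80% of the variance, and cluster with an
#'# hclust method; this assumes the CSTools sample data 'lonlat_data' is available.
#'\dontrun{
#'res3 <- CST_WeatherRegimes(data = lonlat_data$obs, EOFS = TRUE,
#'                           varThreshold = 80, ncenters = 4,
#'                           method = "ward.D2")
#'# The composites are returned in res3$data and the cluster series and
#'# p-values in res3$statistics
#'str(res3$statistics)
#'}
#'@export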
CST_WeatherRegimes <- function(data, ncenters = NULL,
                               EOFS = TRUE, neofs = 30,
                               varThreshold = NULL,
                               method = "kmeans",
                               iter.max = 100, nstart = 30,
                               ncores = NULL) {
  if (!inherits(data, 's2dv_cube')) {
    stop("Parameter 'data' must be of the class 's2dv_cube', ",
         "as output by CSTools::CST_Load.")
  }
  if ('lon' %in% names(data)) {
    lon <- data$lon
  } else {
    lon <- NULL
  }
  result <- WeatherRegime(data$data, ncenters = ncenters,
                          EOFS = EOFS, neofs = neofs,
                          varThreshold = varThreshold, lon = lon,
                          lat = data$lat, method = method,
                          iter.max = iter.max, nstart = nstart,
                          ncores = ncores)
  data$data <- result$composite
  data$statistics <- result[-1]
  return(data)
}

#' @rdname WeatherRegimes
#' @title Function for Calculating the Cluster analysis
#'
#' @author Verónica Torralba - BSC, \email{veronica.torralba@bsc.es}
#'
#' @description This function computes the weather regimes from a cluster analysis.
#'It can be applied over a dataset with dimensions
#'c(year/month, month/day, lon, lat), or to the PCs obtained from an EOF analysis
#'used to filter the dataset.
#'The cluster analysis can be performed with the traditional k-means or with any of the methods
#'included in hclust (stats package).
#'
#'@references Cortesi, N., V. Torralba, N. González-Reviriego, A. Soret, and F. J. Doblas-Reyes (2019).
#' Characterization of European wind speed variability using weather regimes. Climate Dynamics, 53,
#' 4961–4976, doi:10.1007/s00382-019-04839-5.
#'@references Torralba, V. (2019) Seasonal climate prediction for the wind energy sector: methods and tools
#' for the development of a climate service. Thesis. Available online: \url{https://eprints.ucm.es/56841/}
#'
#'@param data An array containing anomalies with named dimensions, including at least the start date 'sdate',
#' forecast time 'ftime', latitude 'lat' and longitude 'lon' dimensions.
#'@param EOFS Whether to compute the EOFs (default = TRUE) or not (FALSE) over the data.
#'@param neofs Number of modes to be kept (default = 30). Only used if EOFS = TRUE.
#'@param varThreshold Value with the percentage of variance to be explained by the PCs.
#' Only sufficient PCs to explain this much variance will be used in the clustering.
#'@param lon Vector of longitudes. Only needed if EOFS = TRUE.
#'@param lat Vector of latitudes.
#'@param ncenters Number of clusters to be calculated with the clustering function.
#'@param method Different options to estimate the clusters. The most traditional approach is the k-means analysis (default = 'kmeans'),
#' but the function also supports the methods included in hclust. These methods are:
#'"ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).
#' For more details about these methods see the hclust function documentation included in the stats package.
#'@param nstart Parameter for the cluster analysis determining how many random sets to choose. Only used if method = 'kmeans'.
#'@param iter.max Parameter to select the maximum number of iterations allowed. Only used if method = 'kmeans'.
#'@param ncores The number of multicore threads to use for parallel computation.
#'@return A list with elements \code{$composite} (3-d array (lat, lon, ncenters) containing the composites of each cluster),
#'         \code{$pvalue} (3-d array (lat, lon, ncenters) containing the p-value of the composites obtained through a t-test
#'         that accounts for the serial dependence of the data, with the same structure as the composites),
#'         \code{$cluster} (a time series of integers (from 1 to ncenters) indicating the cluster to which each time step is allocated),
#'         \code{$persistence} (the mean number of consecutive time steps spent in each cluster; only if method = "kmeans" has been selected),
#'         \code{$frequency} (the percentage of days in a month/season belonging to each cluster; only if method = "kmeans" has been selected).
#'@import s2dverification
#'@import multiApply
#'@examples
#'res <- WeatherRegime(data = lonlat_data$obs$data, lat = lonlat_data$obs$lat,
#'                     EOFS = FALSE, ncenters = 4)
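#'# An illustrative sketch (not run) of the EOF-filtered clustering; it assumes
#'# the CSTools sample data 'lonlat_data' is available.
#'\dontrun{
#'res_eof <- WeatherRegime(data = lonlat_data$obs$data,
#'                         lat = lonlat_data$obs$lat,
#'                         lon = lonlat_data$obs$lon,
#'                         EOFS = TRUE, neofs = 10, ncenters = 3)
#'}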
#'@export

WeatherRegime <- function(data, ncenters = NULL,
                          EOFS = TRUE, neofs = 30,
                          varThreshold = NULL, lon = NULL,
                          lat = NULL, method = "kmeans",
                          iter.max = 100, nstart = 30,
                          ncores = NULL) {
  
  if (is.null(names(dim(data)))) {
    stop("Parameter 'data' must be an array with named dimensions.")
  }
  if (is.null(lat)) {
    stop("Parameter 'lat' must be specified.")
  }
  dimData <- names(dim(data))

  if ('sdate' %in% dimData && 'ftime' %in% dimData) {
    nsdates <- dim(data)['sdate']
    nftimes <- dim(data)['ftime']
    # Merge the forecast time and start date dimensions into a single 'time' dimension
    data <- MergeDims(data,
                      merge_dims = c('ftime', 'sdate'),
                      rename_dim = 'time')
  } else if ('sdate' %in% dimData | 'ftime' %in% dimData) {
    names(dim(data))[which(dimData == 'sdate' | dimData == 'ftime')] <- 'time'
  } else {
    if (!('time' %in% dimData)) {
      stop("Parameter 'data' must have temporal dimensions.")
    }
  }

  output <- Apply(data = list(data),
                  target_dims = c('time', 'lat', 'lon'),
                  fun = .WeatherRegime,
                  EOFS = EOFS, neofs = neofs,
                  varThreshold = varThreshold,
                  lon = lon, lat = lat,
                  ncenters = ncenters,
                  method = method,
                  iter.max = iter.max, nstart = nstart,
                  ncores = ncores)
  
  if (method == 'kmeans' && 'sdate' %in% dimData && 'ftime' %in% dimData) {
    # The frequency and the persistence are computed since they are useful
    # parameters in the cluster analysis
    extra_output <- Apply(data = output$cluster,
                          target_dims = 'time',
                          fun = .freqPer,
                          nsdates = nsdates,
                          nftimes = nftimes,
                          ncenters = ncenters)

    output <- list(composite = output$composite,
                   pvalue = output$pvalue,
                   cluster = output$cluster,
                   frequency = extra_output$frequency,
                   persistence = extra_output$persistence)
  }
  return(output)
}

.WeatherRegime <- function(data, ncenters = NULL, EOFS = TRUE, neofs = 30,
                           varThreshold = NULL, lon = NULL,
                           lat = NULL, method = "kmeans",
                           iter.max = 100, nstart = 30) {

  if (is.null(names(dim(data)))) {
    stop("Parameter 'data' must be an array with 'time', 'lat' and 'lon' dimensions.")
  }
  if (!is.null(lat) && dim(data)['lat'] != length(lat)) {
    stop("The length of the parameter 'lat' does not match with the ['lat'] dimension of ",
         "the parameter 'data'.")
  }
  if (is.null(ncenters)) {
    stop("Parameter 'ncenters' must be specified.")
  }
  if (EOFS == TRUE && is.null(lon)) {
    stop("Parameter 'lon' must be specified.")
  }
  if (is.null(lat)) {
    stop("Parameter 'lat' must be specified.")
  }

  nlat <- dim(data)['lat']
  nlon <- dim(data)['lon']

  if (any(is.na(data))) {
    # NAs are only accepted when they correspond to grid points that can be
    # removed before the clustering; otherwise stop
    nas_test <- MergeDims(data, merge_dims = c('lat', 'lon'),
                          rename_dim = 'space', na.rm = TRUE)
    if (dim(nas_test)['space'] == (nlat * nlon)) {
      stop("Parameter 'data' contains NAs in the 'time' dimension.")
    }
  }

  if (EOFS == TRUE) {
    # Reduce the dimensionality of the data with an EOF analysis and use the
    # resulting PCs as input for the cluster analysis
    dataPC <- EOF(data,
                  lat = as.vector(lat),
                  lon = as.vector(lon),
                  neofs = neofs)
    if (is.null(varThreshold)) {
      cluster_input <- dataPC$PC
    } else {
      # Keep only the first PCs needed to explain 'varThreshold' % of the variance
      minPC <- head(as.numeric(which(cumsum(dataPC$var) > varThreshold)), 1)
      cluster_input <- dataPC$PC[, 1:minPC]
    }
  } else {
    # Weight the anomalies by the cosine of the latitude before clustering
    dataW <- aperm(Apply(data, target_dims = 'lat',
                         function(x, la) {
                           x * cos(la * pi / 180)
                         },
                         la = lat)[[1]], c(2, 1, 3))

    cluster_input <- MergeDims(dataW, merge_dims = c('lat', 'lon'),
                               rename_dim = 'space', na.rm = TRUE)
  }

  if (method == "kmeans") {
    clust <- kmeans(cluster_input,
                    centers = ncenters,
                    iter.max = iter.max,
                    nstart = nstart,
                    trace = FALSE)

    # The order of the data dimensions is changed to ('lat', 'lon', 'time')
    # before computing the composites of each cluster
    result <- Composite(aperm(data, c(2, 3, 1)), clust$cluster)
  } else {
    result <- hclust(dist(cluster_input), method = method)
    clusterCut <- cutree(result, ncenters)
    result <- Composite(aperm(data, c(2, 3, 1)), clusterCut)
  }
  result <- lapply(1:length(result),
                   function (n) {
                     names(dim(result[[n]])) <- c("lat", "lon", "cluster")
                     return (result[[n]])
                   })
  
  names(result) <- c('composite','pvalue')

  if (method == "kmeans") {
    clust <- as.array(clust$cluster)
    names(dim(clust)) <- 'time'
    return(list(
      composite = result$composite,
      pvalue = result$pvalue,
      cluster = clust))
  } else {
    clust <- as.array(clusterCut)
    names(dim(clust)) <- 'time'
    return(list(
      composite = result$composite,
      pvalue = result$pvalue,
      cluster = clust))
  }
}
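
# A minimal sketch (comments only, not executed) of a direct call to the internal
# worker on a single anomaly array with named dimensions ('time', 'lat', 'lon'),
# where 'anom' and 'lats' stand for hypothetical user-provided objects:
#   .WeatherRegime(data = anom, ncenters = 4, EOFS = FALSE,
#                  lat = lats, method = "kmeans")
# This returns a list with $composite and $pvalue of dimensions
# ('lat', 'lon', 'cluster') and $cluster, the series of cluster assignments.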

.freqPer <- function(clust, nsdates, nftimes, ncenters) {
  frequency <- persistence <- matrix(NA, nsdates, ncenters)
  x <- as.vector(clust)
  for (i in 1:nsdates) {
    # Run-length encoding of the cluster series of each start date
    occurences <- rle(x[((i * nftimes) + 1 - nftimes):(i * nftimes)])
    for (j in 1:ncenters) {
      # Percentage of time steps assigned to cluster j
      frequency[i, j] <- (sum(occurences$lengths[occurences$values == j]) / nftimes) * 100
      # Mean length of the consecutive runs of cluster j
      persistence[i, j] <- mean(occurences$lengths[occurences$values == j])
    }
  }
  return(list(frequency = frequency,
              persistence = persistence))
}
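
# A worked sketch (comments only, not executed): for a single start date with
# nftimes = 10 and the cluster series clust = c(1, 1, 2, 2, 2, 1, 3, 3, 1, 1),
#   .freqPer(clust, nsdates = 1, nftimes = 10, ncenters = 3)
# gives frequency[1, ] = c(50, 30, 20) (% of time steps in each cluster) and
# persistence[1, ] = c(5/3, 3, 2) (mean length of the consecutive runs).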