Cluster.R 8.07 KB
Newer Older
#'K-means Clustering
#'
#'Compute cluster centers and their time series of occurrences, with the 
#'K-means clustering method using Euclidean distance, of an array of input data
#'with any number of dimensions that at least contain time_dim. 
#'Specifically, it partitions the array along time axis in K groups or clusters
#'in which each space vector/array belongs to (i.e., is a member of) the 
#'cluster with the nearest center or centroid. This function relies on the 
#'NbClust package (Charrad et al., 2014 JSS). 
#'
#'@param data A numeric array with named dimensions that at least have 
#'  'time_dim' corresponding to time and the dimensions of 'weights'
#'  corresponding to either area-averages over a series of domains or the grid 
#'  points for any sptial grid structure.
#'@param weights A numeric array with named dimension of multiplicative weights
#'  based on the areas covering each domain/region or grid-cell of 'data'. The 
#'  dimensions must also be part of the 'data' dimensions.
#'@param time_dim A character string indicating the name of time dimension in 
#'  'data'. The default value is 'sdate'.
#'@param nclusters A positive integer K that must be bigger than 1 indicating
#'  the number of clusters to be computed, or K initial cluster centers to be 
#'  used in the method. The default is NULL, and users have to specify which
#'  index from NbClust and the associated criteria for selecting the optimal
#'  number of clusters will be used for K-means clustering of 'data'.
#'@param index A character string of the validity index from NbClust package 
#'  that can be used to determine optimal K if K is not specified with 
#'  'nclusters'. The default value is 'sdindex' (Halkidi et al. 2001, JIIS). 
#'  Other indices available in NBClust are "kl", "ch", "hartigan", "ccc", 
#'  "scott", "marriot", "trcovw", "tracew", "friedman", "rubin", "cindex", "db",
#'  "silhouette", "duda", "pseudot2", "beale", "ratkowsky", "ball", 
#'  "ptbiserial", "gap", "frey", "mcclain", "gamma", "gplus", "tau", "dunn", 
aho's avatar
aho committed
#'  "hubert", "sdindex", and "sdbw".
#'  One can also use all of them with the option 'alllong' or almost all indices
#   except gap, gamma, gplus and tau with 'all', when the optimal number of
#'  clusters K is detremined by the majority rule (the maximum of histogram of 
#'  the results of all indices with finite solutions). Use of some indices on 
#'  a big and/or unstructured dataset can be computationally intense and/or 
#'  could lead to numerical singularity. 
#'@param ncores An integer indicating the number of cores to use for parallel 
#'  computation. The default value is NULL.
#'
#'@return
#'A list containing:
#'\item{$cluster}{
#'  An integer vector of the occurrence of a cluster along time, i.e., when
#'  certain data member in time is allocated to a specific cluster.
#'}
#'\item{$centers}{
#'  A matrix of cluster centres or centroids (e.g. [1:K, 1:spatial degrees of freedom]).
#'}
#'\item{$totss}{
#'  A number of the total sum of squares.
#'}
#'\item{$withinss}{
#'  A vector of within-cluster sum of squares, one component per cluster.
#'}
#'\item{$tot.withinss}{
#'  A number of the total within-cluster sum of squares, i.e., sum(withinss).
#'}
#'\item{$betweenss}{
#'  A number of the between-cluster sum of squares, i.e. totss-tot.withinss.
#'}
#'\item{$size}{
#'  A vector of the number of points in each cluster.
#'}
#'\item{$iter}{
#'  An interger as the number of (outer) iterations.
#'}
#'\item{$ifault}{
#'  An integer as an indicator of a possible algorithm problem.
#'}
#'
#'@references
#'Wilks, 2011, Statistical Methods in the Atmospheric Sciences, 3rd ed., Elsevire, pp 676.
#'
#'@examples
#'# Generating synthetic data
#'a1 <- array(dim = c(200, 4))
#'mean1 <- 0
#'sd1 <- 0.3 
#'
#'c0 <- seq(1, 200)
#'c1 <- sort(sample(x = 1:200, size = sample(x = 50:150, size = 1), replace = FALSE))
#'x1 <- c(1, 1, 1, 1)
#'for (i1 in c1) {
#'  a1[i1, ] <- x1 + rnorm(4, mean = mean1, sd = sd1)
#'}
#'
#'c1p5 <- c0[!(c0 %in% c1)]
#'c2 <- c1p5[seq(1, length(c1p5), 2)] 
#'x2 <- c(2, 2, 4, 4)
#'for (i2 in c2) {
#'  a1[i2, ] <- x2 + rnorm(4, mean = mean1, sd = sd1)
#'}
#'
#'c3 <- c1p5[seq(2, length(c1p5), 2)]
#'x3 <- c(3, 3, 1, 1)
#'for (i3 in c3) {
#'  a1[i3, ] <- x3 + rnorm(4, mean = mean1, sd = sd1)
#'}
#'
#'# Computing the clusters
aho's avatar
aho committed
#'names(dim(a1)) <- c('sdate', 'space')
aho's avatar
aho committed
#'res1 <- Cluster(data = a1, weights = array(1, dim = dim(a1)[2]), nclusters = 3)
#'res2 <- Cluster(data = a1, weights = array(1, dim = dim(a1)[2]))
#'
#'@import NbClust multiApply
#'@importFrom abind abind
#'@importFrom stats kmeans
#'@importFrom grDevices pdf dev.off 
#'@export
Cluster <- function(data, weights, time_dim = 'sdate', 
                    nclusters = NULL, index = 'sdindex', ncores = NULL) {
  # Check inputs 
  ## data
  if (is.null(data)) {
    stop("Parameter 'data' cannot be NULL.")
  }
  if (!is.numeric(data)) {
    stop("Parameter 'data' must be a numeric array.")
  }
  if (is.null(dim(data))) {  #is vector
    dim(data) <- c(length(data))
    names(dim(data)) <- time_dim
  }
  if(any(is.null(names(dim(data))))| any(nchar(names(dim(data))) == 0)) {
    stop("Parameter 'data' must have dimension names.")
  }

  ## weights
  if (is.null(weights)) {
    stop("Parameter 'weights' cannot be NULL.")
  }
  if (!is.numeric(weights)) {
    stop("Parameter 'weights' must be a numeric array.")
  }
  if (is.null(dim(weights))) {  #is vector
    dim(weights) <- c(length(weights))
  }
  if(any(is.null(names(dim(weights))))| any(nchar(names(dim(weights))) == 0)) {
    stop("Parameter 'weights' must have dimension names.")
  }
  if (any(!names(dim(weights)) %in% names(dim(data)) |
          !dim(weights) %in% dim(data))) {
    stop("Parameter 'weights' must have dimensions that can be found in 'data' dimensions.")
  }

  ## time_dim
  if (!is.character(time_dim) | length(time_dim) > 1) {
    stop("Parameter 'time_dim' must be a character string.")
  }
  if (!time_dim %in% names(dim(data))) {
    stop("Parameter 'time_dim' is not found in 'data' dimensions.")
  }

  ## nclusters
  if (!is.null(nclusters)) {
    if (!is.numeric(nclusters) | length(nclusters) != 1) {
      stop("Parameter 'nclusters' must be an integer bigger than 1.") 
    } else if (nclusters <= 1) {
      stop("Parameter 'nclusters' must be an integer bigger than 1.")      
    }
  }

  ## index
  if (!is.character(index) | length(index) > 1) {
    stop("Parameter 'index' should be a character strings accepted as 'index' by the function NbClust::NbClust.")
  }

  ## ncores
  if (!is.null(ncores)) {
    if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 |
      length(ncores) > 1) {
      stop("Parameter 'ncores' must be a positive integer.")
    }
  }

  ###############################
  # Calculate Cluster

  output <- Apply(list(data),
                  target_dims = c(time_dim, names(dim(weights))),
                  fun = .Cluster,
                  weights = weights, nclusters = nclusters, index = index,
                  ncores = ncores)

  return(output)
}

.Cluster <- function(data, weights, nclusters = NULL, index = 'sdindex') {
  # data: [time, lat, lon]
  dat_dim <- dim(data)

  # Reshape data into two dims
  dim(data) <- c(dat_dim[1], prod(dat_dim[-1]))
  dim(weights) <- prod(dim(weights))  # a vector

  # weights
  data_list <- lapply(1:dat_dim[1], 
                      function(x) { data[x, ] * weights })
  data <- do.call(abind::abind, c(data_list, along = 0))

  if (!is.null(nclusters)) {
    kmeans.results <- kmeans(data, centers = nclusters, iter.max = 300, 
                             nstart = 30) 
  } else {
    pdf(file = NULL)
    nbclust.results <- NbClust::NbClust(data, distance = 'euclidean', 
                                        min.nc = 2, max.nc = 20, 
                                        method = 'kmeans', index = index)
    dev.off()

    if (index == 'all' || index == 'alllong') {
      kmc  <- hist(nbclust.results$Best.nc[1, ], breaks = seq(0, 20), 
                   plot = FALSE)$counts
      kmc1 <- which(kmc == max(kmc))
    } else {
      kmc1 <- nbclust.results$Best.nc[1]
    }

    kmeans.results <- kmeans(data, centers = kmc1, iter.max = 300, 
                             nstart = 30)
  }

  invisible(kmeans.results)
}