#'K-means Clustering #' #'Compute cluster centers and their time series of occurrences, with the #'K-means clustering method using Euclidean distance, of an array of input data #'with any number of dimensions that at least contain time_dim. #'Specifically, it partitions the array along time axis in K groups or clusters #'in which each space vector/array belongs to (i.e., is a member of) the #'cluster with the nearest center or centroid. This function relies on the #'NbClust package (Charrad et al., 2014 JSS). #' #'@param data A numeric array with named dimensions that at least have #' 'time_dim' corresponding to time and the dimensions of 'weights' #' corresponding to either area-averages over a series of domains or the grid #' points for any sptial grid structure. #'@param weights A numeric array with named dimension of multiplicative weights #' based on the areas covering each domain/region or grid-cell of 'data'. The #' dimensions must also be part of the 'data' dimensions. #'@param time_dim A character string indicating the name of time dimension in #' 'data'. The default value is 'sdate'. #'@param nclusters A positive integer K that must be bigger than 1 indicating #' the number of clusters to be computed, or K initial cluster centers to be #' used in the method. The default is NULL, and users have to specify which #' index from NbClust and the associated criteria for selecting the optimal #' number of clusters will be used for K-means clustering of 'data'. #'@param index A character string of the validity index from NbClust package #' that can be used to determine optimal K if K is not specified with #' 'nclusters'. The default value is 'sdindex' (Halkidi et al. 2001, JIIS). #' Other indices available in NBClust are "kl", "ch", "hartigan", "ccc", #' "scott", "marriot", "trcovw", "tracew", "friedman", "rubin", "cindex", "db", #' "silhouette", "duda", "pseudot2", "beale", "ratkowsky", "ball", #' "ptbiserial", "gap", "frey", "mcclain", "gamma", "gplus", "tau", "dunn", #' "hubert", "sdindex", and "sdbw".\n #' One can also use all of them with the option 'alllong' or almost all indices # except gap, gamma, gplus and tau with 'all', when the optimal number of #' clusters K is detremined by the majority rule (the maximum of histogram of #' the results of all indices with finite solutions). Use of some indices on #' a big and/or unstructured dataset can be computationally intense and/or #' could lead to numerical singularity. #'@param ncores An integer indicating the number of cores to use for parallel #' computation. The default value is NULL. #' #'@return #'A list containing: #'\item{$cluster}{ #' An integer vector of the occurrence of a cluster along time, i.e., when #' certain data member in time is allocated to a specific cluster. #'} #'\item{$centers}{ #' A matrix of cluster centres or centroids (e.g. [1:K, 1:spatial degrees of freedom]). #'} #'\item{$totss}{ #' A number of the total sum of squares. #'} #'\item{$withinss}{ #' A vector of within-cluster sum of squares, one component per cluster. #'} #'\item{$tot.withinss}{ #' A number of the total within-cluster sum of squares, i.e., sum(withinss). #'} #'\item{$betweenss}{ #' A number of the between-cluster sum of squares, i.e. totss-tot.withinss. #'} #'\item{$size}{ #' A vector of the number of points in each cluster. #'} #'\item{$iter}{ #' An interger as the number of (outer) iterations. #'} #'\item{$ifault}{ #' An integer as an indicator of a possible algorithm problem. #'} #' #'@references #'Wilks, 2011, Statistical Methods in the Atmospheric Sciences, 3rd ed., Elsevire, pp 676. #' #'@examples #'# Generating synthetic data #'a1 <- array(dim = c(200, 4)) #'mean1 <- 0 #'sd1 <- 0.3 #' #'c0 <- seq(1, 200) #'c1 <- sort(sample(x = 1:200, size = sample(x = 50:150, size = 1), replace = FALSE)) #'x1 <- c(1, 1, 1, 1) #'for (i1 in c1) { #' a1[i1, ] <- x1 + rnorm(4, mean = mean1, sd = sd1) #'} #' #'c1p5 <- c0[!(c0 %in% c1)] #'c2 <- c1p5[seq(1, length(c1p5), 2)] #'x2 <- c(2, 2, 4, 4) #'for (i2 in c2) { #' a1[i2, ] <- x2 + rnorm(4, mean = mean1, sd = sd1) #'} #' #'c3 <- c1p5[seq(2, length(c1p5), 2)] #'x3 <- c(3, 3, 1, 1) #'for (i3 in c3) { #' a1[i3, ] <- x3 + rnorm(4, mean = mean1, sd = sd1) #'} #' #'# Computing the clusters #'res1 <- Cluster(var = a1, weights = array(1, dim = dim(a1)[2]), nclusters = 3) #'print(res1$cluster) #'print(res1$centers) #' #'res2 <- Cluster(var = a1, weights = array(1, dim = dim(a1)[2])) #'print(res2$cluster) #'print(res2$centers) #' #'@import NbClust multiApply #'@importFrom abind abind #'@importFrom stats kmeans #'@importFrom grDevices pdf dev.off #'@export Cluster <- function(data, weights, time_dim = 'sdate', nclusters = NULL, index = 'sdindex', ncores = NULL) { # Check inputs ## data if (is.null(data)) { stop("Parameter 'data' cannot be NULL.") } if (!is.numeric(data)) { stop("Parameter 'data' must be a numeric array.") } if (is.null(dim(data))) { #is vector dim(data) <- c(length(data)) names(dim(data)) <- time_dim } if(any(is.null(names(dim(data))))| any(nchar(names(dim(data))) == 0)) { stop("Parameter 'data' must have dimension names.") } ## weights if (is.null(weights)) { stop("Parameter 'weights' cannot be NULL.") } if (!is.numeric(weights)) { stop("Parameter 'weights' must be a numeric array.") } if (is.null(dim(weights))) { #is vector dim(weights) <- c(length(weights)) } if(any(is.null(names(dim(weights))))| any(nchar(names(dim(weights))) == 0)) { stop("Parameter 'weights' must have dimension names.") } if (any(!names(dim(weights)) %in% names(dim(data)) | !dim(weights) %in% dim(data))) { stop("Parameter 'weights' must have dimensions that can be found in 'data' dimensions.") } ## time_dim if (!is.character(time_dim) | length(time_dim) > 1) { stop("Parameter 'time_dim' must be a character string.") } if (!time_dim %in% names(dim(data))) { stop("Parameter 'time_dim' is not found in 'data' dimensions.") } ## nclusters if (!is.null(nclusters)) { if (!is.numeric(nclusters) | length(nclusters) != 1) { stop("Parameter 'nclusters' must be an integer bigger than 1.") } else if (nclusters <= 1) { stop("Parameter 'nclusters' must be an integer bigger than 1.") } } ## index if (!is.character(index) | length(index) > 1) { stop("Parameter 'index' should be a character strings accepted as 'index' by the function NbClust::NbClust.") } ## ncores if (!is.null(ncores)) { if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | length(ncores) > 1) { stop("Parameter 'ncores' must be a positive integer.") } } ############################### # Calculate Cluster output <- Apply(list(data), target_dims = c(time_dim, names(dim(weights))), fun = .Cluster, weights = weights, nclusters = nclusters, index = index, ncores = ncores) return(output) } .Cluster <- function(data, weights, nclusters = NULL, index = 'sdindex') { # data: [time, lat, lon] dat_dim <- dim(data) # Reshape data into two dims dim(data) <- c(dat_dim[1], prod(dat_dim[-1])) dim(weights) <- prod(dim(weights)) # a vector # weights data_list <- lapply(1:dat_dim[1], function(x) { data[x, ] * weights }) data <- do.call(abind::abind, c(data_list, along = 0)) if (!is.null(nclusters)) { kmeans.results <- kmeans(data, centers = nclusters, iter.max = 300, nstart = 30) } else { pdf(file = NULL) nbclust.results <- NbClust::NbClust(data, distance = 'euclidean', min.nc = 2, max.nc = 20, method = 'kmeans', index = index) dev.off() if (index == 'all' || index == 'alllong') { kmc <- hist(nbclust.results$Best.nc[1, ], breaks = seq(0, 20), plot = FALSE)$counts kmc1 <- which(kmc == max(kmc)) } else { kmc1 <- nbclust.results$Best.nc[1] } kmeans.results <- kmeans(data, centers = kmc1, iter.max = 300, nstart = 30) } invisible(kmeans.results) }