Newer
Older
#'K-means Clustering
#'
#'Compute cluster centers and their time series of occurrences, with the
#'K-means clustering method using Euclidean distance, of an array of input data
#'with any number of dimensions that at least contain time_dim.
#'Specifically, it partitions the array along time axis in K groups or clusters
#'in which each space vector/array belongs to (i.e., is a member of) the
#'cluster with the nearest center or centroid. This function is a wrapper of
#'kmeans() and relies on the NbClust package (Charrad et al., 2014 JSS) to
#'determine the optimal number of clusters used for K-means clustering if it is
#'not provided by users.
#'
#'@param data A numeric array with named dimensions that at least have
#' 'time_dim' corresponding to time and 'space_dim' (optional) corresponding
#' to either area-averages over a series of domains or the grid points for any
#' sptial grid structure.
#'@param weights A numeric array with named dimension of multiplicative weights
#' based on the areas covering each domain/region or grid-cell of 'data'. The
#' dimensions must be equal to the 'space_dim' in 'data'. The default value is
#' NULL which means no weighting is applied.
#'@param time_dim A character string indicating the name of time dimension in
#' 'data'. The default value is 'sdate'.
#'@param space_dim A character vector indicating the names of spatial dimensions
#' in 'data'. The default value is NULL.
#'@param nclusters A positive integer K that must be bigger than 1 indicating
#' the number of clusters to be computed, or K initial cluster centers to be
#' used in the method. The default value is NULL, which means that the number
#' of clusters will be determined by NbClust(). The parameter 'index'
#' therefore needs to be specified for NbClust() to find the optimal number of
#' clusters to be used for K-means clustering calculation.
#'@param index A character string of the validity index from NbClust package
#' that can be used to determine optimal K if K is not specified with
#' 'nclusters'. The default value is 'sdindex' (Halkidi et al. 2001, JIIS).
#' Other indices available in NBClust are "kl", "ch", "hartigan", "ccc",
#' "scott", "marriot", "trcovw", "tracew", "friedman", "rubin", "cindex", "db",
#' "silhouette", "duda", "pseudot2", "beale", "ratkowsky", "ball",
#' "ptbiserial", "gap", "frey", "mcclain", "gamma", "gplus", "tau", "dunn",
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#' One can also use all of them with the option 'alllong' or almost all indices
# except gap, gamma, gplus and tau with 'all', when the optimal number of
#' clusters K is detremined by the majority rule (the maximum of histogram of
#' the results of all indices with finite solutions). Use of some indices on
#' a big and/or unstructured dataset can be computationally intense and/or
#' could lead to numerical singularity.
#'@param ncores An integer indicating the number of cores to use for parallel
#' computation. The default value is NULL.
#'
#'@return
#'A list containing:
#'\item{$cluster}{
#' An integer vector of the occurrence of a cluster along time, i.e., when
#' certain data member in time is allocated to a specific cluster.
#'}
#'\item{$centers}{
#' A matrix of cluster centres or centroids (e.g. [1:K, 1:spatial degrees of freedom]).
#'}
#'\item{$totss}{
#' A number of the total sum of squares.
#'}
#'\item{$withinss}{
#' A vector of within-cluster sum of squares, one component per cluster.
#'}
#'\item{$tot.withinss}{
#' A number of the total within-cluster sum of squares, i.e., sum(withinss).
#'}
#'\item{$betweenss}{
#' A number of the between-cluster sum of squares, i.e. totss-tot.withinss.
#'}
#'\item{$size}{
#' A vector of the number of points in each cluster.
#'}
#'\item{$iter}{
#' An interger as the number of (outer) iterations.
#'}
#'\item{$ifault}{
#' An integer as an indicator of a possible algorithm problem.
#'}
#'
#'@references
#'Wilks, 2011, Statistical Methods in the Atmospheric Sciences, 3rd ed., Elsevire, pp 676.
#'
#'@examples
#'# Generating synthetic data
#'a1 <- array(dim = c(200, 4))
#'mean1 <- 0
#'sd1 <- 0.3
#'
#'c0 <- seq(1, 200)
#'c1 <- sort(sample(x = 1:200, size = sample(x = 50:150, size = 1), replace = FALSE))
#'x1 <- c(1, 1, 1, 1)
#'for (i1 in c1) {
#' a1[i1, ] <- x1 + rnorm(4, mean = mean1, sd = sd1)
#'}
#'
#'c1p5 <- c0[!(c0 %in% c1)]
#'c2 <- c1p5[seq(1, length(c1p5), 2)]
#'x2 <- c(2, 2, 4, 4)
#'for (i2 in c2) {
#' a1[i2, ] <- x2 + rnorm(4, mean = mean1, sd = sd1)
#'}
#'
#'c3 <- c1p5[seq(2, length(c1p5), 2)]
#'x3 <- c(3, 3, 1, 1)
#'for (i3 in c3) {
#' a1[i3, ] <- x3 + rnorm(4, mean = mean1, sd = sd1)
#'}
#'
#'# Computing the clusters
#'res1 <- Cluster(data = a1, weights = array(1, dim = dim(a1)[2]), nclusters = 3)
#'res2 <- Cluster(data = a1, weights = array(1, dim = dim(a1)[2]))
#'
#'@import NbClust multiApply
#'@importFrom abind abind
#'@importFrom stats kmeans
#'@importFrom grDevices pdf dev.off
#'@export
Cluster <- function(data, weights = NULL, time_dim = 'sdate', space_dim = NULL,
nclusters = NULL, index = 'sdindex', ncores = NULL) {
# Check inputs
## data
if (is.null(data)) {
stop("Parameter 'data' cannot be NULL.")
}
if (!is.numeric(data)) {
stop("Parameter 'data' must be a numeric array.")
}
if (is.null(dim(data))) { #is vector
dim(data) <- c(length(data))
names(dim(data)) <- time_dim
}
if(any(is.null(names(dim(data))))| any(nchar(names(dim(data))) == 0)) {
stop("Parameter 'data' must have dimension names.")
}
## weights
if (!is.null(weights)) {
if (!is.numeric(weights)) {
stop("Parameter 'weights' must be a numeric array.")
}
if (is.null(dim(weights))) { #is vector
dim(weights) <- c(length(weights))
}
if (any(is.null(names(dim(weights))))| any(nchar(names(dim(weights))) == 0)) {
stop("Parameter 'weights' must have dimension names.")
}
if (any(!names(dim(weights)) %in% names(dim(data)) |
!dim(weights) %in% dim(data))) {
stop("Parameter 'weights' must have dimensions that can be found in 'data' dimensions.")
}
}
## time_dim
if (!is.character(time_dim) | length(time_dim) > 1) {
stop("Parameter 'time_dim' must be a character string.")
}
if (!time_dim %in% names(dim(data))) {
stop("Parameter 'time_dim' is not found in 'data' dimensions.")
}
## space_dim
if (!is.null(space_dim)) {
if (!is.character(space_dim)) {
stop("Parameter 'space_dim' must be a character vector.")
}
if (any(!space_dim %in% names(dim(data)))) {
stop("Parameter 'space_dim' is not found in 'data' dimensions.")
}
if (!is.null(weights)) {
if (!(length(space_dim) == length(dim(weights)) & all(space_dim %in% names(dim(weights))))) {
stop("Parameter 'weights' must have dimension names the same as 'space_dim'.")
}
if (space_dim != names(dim(weights))) {
space_dim <- names(dim(weights))
}
}
}
if (is.null(space_dim) & !is.null(weights)) {
space_dim <- names(dim(weights))
.warning(paste0("Parameter 'weights' is assigned but not 'space_dim'. Define 'space_dim' ",
"by the dimensions of 'weights'."))
}
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
## nclusters
if (!is.null(nclusters)) {
if (!is.numeric(nclusters) | length(nclusters) != 1) {
stop("Parameter 'nclusters' must be an integer bigger than 1.")
} else if (nclusters <= 1) {
stop("Parameter 'nclusters' must be an integer bigger than 1.")
}
}
## index
if (!is.character(index) | length(index) > 1) {
stop("Parameter 'index' should be a character strings accepted as 'index' by the function NbClust::NbClust.")
}
## ncores
if (!is.null(ncores)) {
if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 |
length(ncores) > 1) {
stop("Parameter 'ncores' must be a positive integer.")
}
}
###############################
# Calculate Cluster
output <- Apply(list(data),
target_dims = c(time_dim, space_dim),
fun = .Cluster,
weights = weights, nclusters = nclusters, index = index,
ncores = ncores)
return(output)
}
.Cluster <- function(data, weights = NULL, nclusters = NULL, index = 'sdindex') {
# data: [time, (lat, lon)]
if (length(dim(data)) != 1) {
# Reshape data into two dims
dim(data) <- c(dat_dim[1], prod(dat_dim[-1]))
# weights
if (!is.null(weights)) {
dim(weights) <- prod(dim(weights)) # a vector
data_list <- lapply(1:dat_dim[1],
function(x) { data[x, ] * weights })
data <- do.call(abind::abind, c(data_list, along = 0))
}
}
if (!is.null(nclusters)) {
kmeans.results <- kmeans(data, centers = nclusters, iter.max = 300,
nstart = 30)
} else {
pdf(file = NULL)
nbclust.results <- NbClust::NbClust(data, distance = 'euclidean',
min.nc = 2, max.nc = 20,
method = 'kmeans', index = index)
dev.off()
if (index == 'all' || index == 'alllong') {
kmc <- hist(nbclust.results$Best.nc[1, ], breaks = seq(0, 20),
plot = FALSE)$counts
kmc1 <- which(kmc == max(kmc))
} else {
kmc1 <- nbclust.results$Best.nc[1]
}
kmeans.results <- kmeans(data, centers = kmc1, iter.max = 300,
nstart = 30)
}
invisible(kmeans.results)
}