Newer
Older
#'K-means Clustering
#'
#'Compute cluster centers and their time series of occurrences, with the
#'K-means clustering method using Euclidean distance, of an array of input data
#'with any number of dimensions that at least contain time_dim.
#'Specifically, it partitions the array along time axis in K groups or clusters
#'in which each space vector/array belongs to (i.e., is a member of) the
#'cluster with the nearest center or centroid. This function relies on the
#'NbClust package (Charrad et al., 2014 JSS).
#'
#'@param data A numeric array with named dimensions that at least have
#' 'time_dim' corresponding to time and the dimensions of 'weights'
#' corresponding to either area-averages over a series of domains or the grid
#' points for any sptial grid structure.
#'@param weights A numeric array with named dimension of multiplicative weights
#' based on the areas covering each domain/region or grid-cell of 'data'. The
#' dimensions must also be part of the 'data' dimensions.
#'@param time_dim A character string indicating the name of time dimension in
#' 'data'. The default value is 'sdate'.
#'@param nclusters A positive integer K that must be bigger than 1 indicating
#' the number of clusters to be computed, or K initial cluster centers to be
#' used in the method. The default is NULL, and users have to specify which
#' index from NbClust and the associated criteria for selecting the optimal
#' number of clusters will be used for K-means clustering of 'data'.
#'@param index A character string of the validity index from NbClust package
#' that can be used to determine optimal K if K is not specified with
#' 'nclusters'. The default value is 'sdindex' (Halkidi et al. 2001, JIIS).
#' Other indices available in NBClust are "kl", "ch", "hartigan", "ccc",
#' "scott", "marriot", "trcovw", "tracew", "friedman", "rubin", "cindex", "db",
#' "silhouette", "duda", "pseudot2", "beale", "ratkowsky", "ball",
#' "ptbiserial", "gap", "frey", "mcclain", "gamma", "gplus", "tau", "dunn",
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
#' One can also use all of them with the option 'alllong' or almost all indices
# except gap, gamma, gplus and tau with 'all', when the optimal number of
#' clusters K is detremined by the majority rule (the maximum of histogram of
#' the results of all indices with finite solutions). Use of some indices on
#' a big and/or unstructured dataset can be computationally intense and/or
#' could lead to numerical singularity.
#'@param ncores An integer indicating the number of cores to use for parallel
#' computation. The default value is NULL.
#'
#'@return
#'A list containing:
#'\item{$cluster}{
#' An integer vector of the occurrence of a cluster along time, i.e., when
#' certain data member in time is allocated to a specific cluster.
#'}
#'\item{$centers}{
#' A matrix of cluster centres or centroids (e.g. [1:K, 1:spatial degrees of freedom]).
#'}
#'\item{$totss}{
#' A number of the total sum of squares.
#'}
#'\item{$withinss}{
#' A vector of within-cluster sum of squares, one component per cluster.
#'}
#'\item{$tot.withinss}{
#' A number of the total within-cluster sum of squares, i.e., sum(withinss).
#'}
#'\item{$betweenss}{
#' A number of the between-cluster sum of squares, i.e. totss-tot.withinss.
#'}
#'\item{$size}{
#' A vector of the number of points in each cluster.
#'}
#'\item{$iter}{
#' An interger as the number of (outer) iterations.
#'}
#'\item{$ifault}{
#' An integer as an indicator of a possible algorithm problem.
#'}
#'
#'@references
#'Wilks, 2011, Statistical Methods in the Atmospheric Sciences, 3rd ed., Elsevire, pp 676.
#'
#'@examples
#'# Generating synthetic data
#'a1 <- array(dim = c(200, 4))
#'mean1 <- 0
#'sd1 <- 0.3
#'
#'c0 <- seq(1, 200)
#'c1 <- sort(sample(x = 1:200, size = sample(x = 50:150, size = 1), replace = FALSE))
#'x1 <- c(1, 1, 1, 1)
#'for (i1 in c1) {
#' a1[i1, ] <- x1 + rnorm(4, mean = mean1, sd = sd1)
#'}
#'
#'c1p5 <- c0[!(c0 %in% c1)]
#'c2 <- c1p5[seq(1, length(c1p5), 2)]
#'x2 <- c(2, 2, 4, 4)
#'for (i2 in c2) {
#' a1[i2, ] <- x2 + rnorm(4, mean = mean1, sd = sd1)
#'}
#'
#'c3 <- c1p5[seq(2, length(c1p5), 2)]
#'x3 <- c(3, 3, 1, 1)
#'for (i3 in c3) {
#' a1[i3, ] <- x3 + rnorm(4, mean = mean1, sd = sd1)
#'}
#'
#'# Computing the clusters
#'res1 <- Cluster(var = a1, weights = array(1, dim = dim(a1)[2]), nclusters = 3)
#'print(res1$cluster)
#'print(res1$centers)
#'
#'res2 <- Cluster(var = a1, weights = array(1, dim = dim(a1)[2]))
#'print(res2$cluster)
#'print(res2$centers)
#'
#'@import NbClust multiApply
#'@importFrom abind abind
#'@importFrom stats kmeans
#'@importFrom grDevices pdf dev.off
#'@export
Cluster <- function(data, weights, time_dim = 'sdate',
nclusters = NULL, index = 'sdindex', ncores = NULL) {
# Check inputs
## data
if (is.null(data)) {
stop("Parameter 'data' cannot be NULL.")
}
if (!is.numeric(data)) {
stop("Parameter 'data' must be a numeric array.")
}
if (is.null(dim(data))) { #is vector
dim(data) <- c(length(data))
names(dim(data)) <- time_dim
}
if(any(is.null(names(dim(data))))| any(nchar(names(dim(data))) == 0)) {
stop("Parameter 'data' must have dimension names.")
}
## weights
if (is.null(weights)) {
stop("Parameter 'weights' cannot be NULL.")
}
if (!is.numeric(weights)) {
stop("Parameter 'weights' must be a numeric array.")
}
if (is.null(dim(weights))) { #is vector
dim(weights) <- c(length(weights))
}
if(any(is.null(names(dim(weights))))| any(nchar(names(dim(weights))) == 0)) {
stop("Parameter 'weights' must have dimension names.")
}
if (any(!names(dim(weights)) %in% names(dim(data)) |
!dim(weights) %in% dim(data))) {
stop("Parameter 'weights' must have dimensions that can be found in 'data' dimensions.")
}
## time_dim
if (!is.character(time_dim) | length(time_dim) > 1) {
stop("Parameter 'time_dim' must be a character string.")
}
if (!time_dim %in% names(dim(data))) {
stop("Parameter 'time_dim' is not found in 'data' dimensions.")
}
## nclusters
if (!is.null(nclusters)) {
if (!is.numeric(nclusters) | length(nclusters) != 1) {
stop("Parameter 'nclusters' must be an integer bigger than 1.")
} else if (nclusters <= 1) {
stop("Parameter 'nclusters' must be an integer bigger than 1.")
}
}
## index
if (!is.character(index) | length(index) > 1) {
stop("Parameter 'index' should be a character strings accepted as 'index' by the function NbClust::NbClust.")
}
## ncores
if (!is.null(ncores)) {
if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 |
length(ncores) > 1) {
stop("Parameter 'ncores' must be a positive integer.")
}
}
###############################
# Calculate Cluster
output <- Apply(list(data),
target_dims = c(time_dim, names(dim(weights))),
fun = .Cluster,
weights = weights, nclusters = nclusters, index = index,
ncores = ncores)
return(output)
}
.Cluster <- function(data, weights, nclusters = NULL, index = 'sdindex') {
# data: [time, lat, lon]
dat_dim <- dim(data)
# Reshape data into two dims
dim(data) <- c(dat_dim[1], prod(dat_dim[-1]))
dim(weights) <- prod(dim(weights)) # a vector
# weights
data_list <- lapply(1:dat_dim[1],
function(x) { data[x, ] * weights })
data <- do.call(abind::abind, c(data_list, along = 0))
if (!is.null(nclusters)) {
kmeans.results <- kmeans(data, centers = nclusters, iter.max = 300,
nstart = 30)
} else {
pdf(file = NULL)
nbclust.results <- NbClust::NbClust(data, distance = 'euclidean',
min.nc = 2, max.nc = 20,
method = 'kmeans', index = index)
dev.off()
if (index == 'all' || index == 'alllong') {
kmc <- hist(nbclust.results$Best.nc[1, ], breaks = seq(0, 20),
plot = FALSE)$counts
kmc1 <- which(kmc == max(kmc))
} else {
kmc1 <- nbclust.results$Best.nc[1]
}
kmeans.results <- kmeans(data, centers = kmc1, iter.max = 300,
nstart = 30)
}
invisible(kmeans.results)
}