Commit 911bdc20 authored by aho's avatar aho
Browse files

Transform Cluster from s2dverification

parent 84613b95
Pipeline #5245 failed with stage
in 1 minute and 44 seconds
......@@ -5,6 +5,7 @@ export(AMV)
export(AnimateMap)
export(Ano)
export(Clim)
export(Cluster)
export(ColorBar)
export(Composite)
export(ConfigAddEntry)
......@@ -51,6 +52,7 @@ export(Trend)
export(clim.colors)
export(clim.palette)
import(GEOmap)
import(NbClust)
import(bigmemory)
import(geomapdata)
import(graphics)
......@@ -85,6 +87,7 @@ importFrom(stats,acf)
importFrom(stats,anova)
importFrom(stats,confint)
importFrom(stats,cor)
importFrom(stats,kmeans)
importFrom(stats,lm)
importFrom(stats,median)
importFrom(stats,na.fail)
......
#'K-means Clustering
#'
#'Compute cluster centers and their time series of occurrences, with the
#'K-means clustering method using Euclidean distance, of an array of input data
#'with any number of dimensions that at least contain time_dim.
#'Specifically, it partitions the array along time axis in K groups or clusters
#'in which each space vector/array belongs to (i.e., is a member of) the
#'cluster with the nearest center or centroid. This function relies on the
#'NbClust package (Charrad et al., 2014 JSS).
#'
#'@param data A numeric array with named dimensions that at least have
#' 'time_dim' corresponding to time and the dimensions of 'weights'
#' corresponding to either area-averages over a series of domains or the grid
#' points for any sptial grid structure.
#'@param weights A numeric array with named dimension of multiplicative weights
#' based on the areas covering each domain/region or grid-cell of 'data'. The
#' dimensions must also be part of the 'data' dimensions.
#'@param time_dim A character string indicating the name of time dimension in
#' 'data'. The default value is 'sdate'.
#'@param nclusters A positive integer K that must be bigger than 1 indicating
#' the number of clusters to be computed, or K initial cluster centers to be
#' used in the method. The default is NULL, and users have to specify which
#' index from NbClust and the associated criteria for selecting the optimal
#' number of clusters will be used for K-means clustering of 'data'.
#'@param index A character string of the validity index from NbClust package
#' that can be used to determine optimal K if K is not specified with
#' 'nclusters'. The default value is 'sdindex' (Halkidi et al. 2001, JIIS).
#' Other indices available in NBClust are "kl", "ch", "hartigan", "ccc",
#' "scott", "marriot", "trcovw", "tracew", "friedman", "rubin", "cindex", "db",
#' "silhouette", "duda", "pseudot2", "beale", "ratkowsky", "ball",
#' "ptbiserial", "gap", "frey", "mcclain", "gamma", "gplus", "tau", "dunn",
#' "hubert", "sdindex", and "sdbw".\n
#' One can also use all of them with the option 'alllong' or almost all indices
# except gap, gamma, gplus and tau with 'all', when the optimal number of
#' clusters K is detremined by the majority rule (the maximum of histogram of
#' the results of all indices with finite solutions). Use of some indices on
#' a big and/or unstructured dataset can be computationally intense and/or
#' could lead to numerical singularity.
#'@param ncores An integer indicating the number of cores to use for parallel
#' computation. The default value is NULL.
#'
#'@return
#'A list containing:
#'\item{$cluster}{
#' An integer vector of the occurrence of a cluster along time, i.e., when
#' certain data member in time is allocated to a specific cluster.
#'}
#'\item{$centers}{
#' A matrix of cluster centres or centroids (e.g. [1:K, 1:spatial degrees of freedom]).
#'}
#'\item{$totss}{
#' A number of the total sum of squares.
#'}
#'\item{$withinss}{
#' A vector of within-cluster sum of squares, one component per cluster.
#'}
#'\item{$tot.withinss}{
#' A number of the total within-cluster sum of squares, i.e., sum(withinss).
#'}
#'\item{$betweenss}{
#' A number of the between-cluster sum of squares, i.e. totss-tot.withinss.
#'}
#'\item{$size}{
#' A vector of the number of points in each cluster.
#'}
#'\item{$iter}{
#' An interger as the number of (outer) iterations.
#'}
#'\item{$ifault}{
#' An integer as an indicator of a possible algorithm problem.
#'}
#'
#'@references
#'Wilks, 2011, Statistical Methods in the Atmospheric Sciences, 3rd ed., Elsevire, pp 676.
#'
#'@examples
#'# Generating synthetic data
#'a1 <- array(dim = c(200, 4))
#'mean1 <- 0
#'sd1 <- 0.3
#'
#'c0 <- seq(1, 200)
#'c1 <- sort(sample(x = 1:200, size = sample(x = 50:150, size = 1), replace = FALSE))
#'x1 <- c(1, 1, 1, 1)
#'for (i1 in c1) {
#' a1[i1, ] <- x1 + rnorm(4, mean = mean1, sd = sd1)
#'}
#'
#'c1p5 <- c0[!(c0 %in% c1)]
#'c2 <- c1p5[seq(1, length(c1p5), 2)]
#'x2 <- c(2, 2, 4, 4)
#'for (i2 in c2) {
#' a1[i2, ] <- x2 + rnorm(4, mean = mean1, sd = sd1)
#'}
#'
#'c3 <- c1p5[seq(2, length(c1p5), 2)]
#'x3 <- c(3, 3, 1, 1)
#'for (i3 in c3) {
#' a1[i3, ] <- x3 + rnorm(4, mean = mean1, sd = sd1)
#'}
#'
#'# Computing the clusters
#'res1 <- Cluster(var = a1, weights = array(1, dim = dim(a1)[2]), nclusters = 3)
#'print(res1$cluster)
#'print(res1$centers)
#'
#'res2 <- Cluster(var = a1, weights = array(1, dim = dim(a1)[2]))
#'print(res2$cluster)
#'print(res2$centers)
#'
#'@import NbClust multiApply
#'@importFrom abind abind
#'@importFrom stats kmeans
#'@importFrom grDevices pdf dev.off
#'@export
Cluster <- function(data, weights, time_dim = 'sdate',
nclusters = NULL, index = 'sdindex', ncores = NULL) {
# Check inputs
## data
if (is.null(data)) {
stop("Parameter 'data' cannot be NULL.")
}
if (!is.numeric(data)) {
stop("Parameter 'data' must be a numeric array.")
}
if (is.null(dim(data))) { #is vector
dim(data) <- c(length(data))
names(dim(data)) <- time_dim
}
if(any(is.null(names(dim(data))))| any(nchar(names(dim(data))) == 0)) {
stop("Parameter 'data' must have dimension names.")
}
## weights
if (is.null(weights)) {
stop("Parameter 'weights' cannot be NULL.")
}
if (!is.numeric(weights)) {
stop("Parameter 'weights' must be a numeric array.")
}
if (is.null(dim(weights))) { #is vector
dim(weights) <- c(length(weights))
}
if(any(is.null(names(dim(weights))))| any(nchar(names(dim(weights))) == 0)) {
stop("Parameter 'weights' must have dimension names.")
}
if (any(!names(dim(weights)) %in% names(dim(data)) |
!dim(weights) %in% dim(data))) {
stop("Parameter 'weights' must have dimensions that can be found in 'data' dimensions.")
}
## time_dim
if (!is.character(time_dim) | length(time_dim) > 1) {
stop("Parameter 'time_dim' must be a character string.")
}
if (!time_dim %in% names(dim(data))) {
stop("Parameter 'time_dim' is not found in 'data' dimensions.")
}
## nclusters
if (!is.null(nclusters)) {
if (!is.numeric(nclusters) | length(nclusters) != 1) {
stop("Parameter 'nclusters' must be an integer bigger than 1.")
} else if (nclusters <= 1) {
stop("Parameter 'nclusters' must be an integer bigger than 1.")
}
}
## index
if (!is.character(index) | length(index) > 1) {
stop("Parameter 'index' should be a character strings accepted as 'index' by the function NbClust::NbClust.")
}
## ncores
if (!is.null(ncores)) {
if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 |
length(ncores) > 1) {
stop("Parameter 'ncores' must be a positive integer.")
}
}
###############################
# Calculate Cluster
output <- Apply(list(data),
target_dims = c(time_dim, names(dim(weights))),
fun = .Cluster,
weights = weights, nclusters = nclusters, index = index,
ncores = ncores)
return(output)
}
.Cluster <- function(data, weights, nclusters = NULL, index = 'sdindex') {
# data: [time, lat, lon]
dat_dim <- dim(data)
# Reshape data into two dims
dim(data) <- c(dat_dim[1], prod(dat_dim[-1]))
dim(weights) <- prod(dim(weights)) # a vector
# weights
data_list <- lapply(1:dat_dim[1],
function(x) { data[x, ] * weights })
data <- do.call(abind::abind, c(data_list, along = 0))
if (!is.null(nclusters)) {
kmeans.results <- kmeans(data, centers = nclusters, iter.max = 300,
nstart = 30)
} else {
pdf(file = NULL)
nbclust.results <- NbClust::NbClust(data, distance = 'euclidean',
min.nc = 2, max.nc = 20,
method = 'kmeans', index = index)
dev.off()
if (index == 'all' || index == 'alllong') {
kmc <- hist(nbclust.results$Best.nc[1, ], breaks = seq(0, 20),
plot = FALSE)$counts
kmc1 <- which(kmc == max(kmc))
} else {
kmc1 <- nbclust.results$Best.nc[1]
}
kmeans.results <- kmeans(data, centers = kmc1, iter.max = 300,
nstart = 30)
}
invisible(kmeans.results)
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/Cluster.R
\name{Cluster}
\alias{Cluster}
\title{K-means Clustering}
\usage{
Cluster(
data,
weights,
time_dim = "sdate",
nclusters = NULL,
index = "sdindex",
ncores = NULL
)
}
\arguments{
\item{data}{A numeric array with named dimensions that at least have
'time_dim' corresponding to time and the dimensions of 'weights'
corresponding to either area-averages over a series of domains or the grid
points for any sptial grid structure.}
\item{weights}{A numeric array with named dimension of multiplicative weights
based on the areas covering each domain/region or grid-cell of 'data'. The
dimensions must also be part of the 'data' dimensions.}
\item{time_dim}{A character string indicating the name of time dimension in
'data'. The default value is 'sdate'.}
\item{nclusters}{A positive integer K that must be bigger than 1 indicating
the number of clusters to be computed, or K initial cluster centers to be
used in the method. The default is NULL, and users have to specify which
index from NbClust and the associated criteria for selecting the optimal
number of clusters will be used for K-means clustering of 'data'.}
\item{index}{A character string of the validity index from NbClust package
that can be used to determine optimal K if K is not specified with
'nclusters'. The default value is 'sdindex' (Halkidi et al. 2001, JIIS).
Other indices available in NBClust are "kl", "ch", "hartigan", "ccc",
"scott", "marriot", "trcovw", "tracew", "friedman", "rubin", "cindex", "db",
"silhouette", "duda", "pseudot2", "beale", "ratkowsky", "ball",
"ptbiserial", "gap", "frey", "mcclain", "gamma", "gplus", "tau", "dunn",
"hubert", "sdindex", and "sdbw".\n
One can also use all of them with the option 'alllong' or almost all indices
clusters K is detremined by the majority rule (the maximum of histogram of
the results of all indices with finite solutions). Use of some indices on
a big and/or unstructured dataset can be computationally intense and/or
could lead to numerical singularity.}
\item{ncores}{An integer indicating the number of cores to use for parallel
computation. The default value is NULL.}
}
\value{
A list containing:
\item{$cluster}{
An integer vector of the occurrence of a cluster along time, i.e., when
certain data member in time is allocated to a specific cluster.
}
\item{$centers}{
A matrix of cluster centres or centroids (e.g. [1:K, 1:spatial degrees of freedom]).
}
\item{$totss}{
A number of the total sum of squares.
}
\item{$withinss}{
A vector of within-cluster sum of squares, one component per cluster.
}
\item{$tot.withinss}{
A number of the total within-cluster sum of squares, i.e., sum(withinss).
}
\item{$betweenss}{
A number of the between-cluster sum of squares, i.e. totss-tot.withinss.
}
\item{$size}{
A vector of the number of points in each cluster.
}
\item{$iter}{
An interger as the number of (outer) iterations.
}
\item{$ifault}{
An integer as an indicator of a possible algorithm problem.
}
}
\description{
Compute cluster centers and their time series of occurrences, with the
K-means clustering method using Euclidean distance, of an array of input data
with any number of dimensions that at least contain time_dim.
Specifically, it partitions the array along time axis in K groups or clusters
in which each space vector/array belongs to (i.e., is a member of) the
cluster with the nearest center or centroid. This function relies on the
NbClust package (Charrad et al., 2014 JSS).
}
\examples{
# Generating synthetic data
a1 <- array(dim = c(200, 4))
mean1 <- 0
sd1 <- 0.3
c0 <- seq(1, 200)
c1 <- sort(sample(x = 1:200, size = sample(x = 50:150, size = 1), replace = FALSE))
x1 <- c(1, 1, 1, 1)
for (i1 in c1) {
a1[i1, ] <- x1 + rnorm(4, mean = mean1, sd = sd1)
}
c1p5 <- c0[!(c0 \%in\% c1)]
c2 <- c1p5[seq(1, length(c1p5), 2)]
x2 <- c(2, 2, 4, 4)
for (i2 in c2) {
a1[i2, ] <- x2 + rnorm(4, mean = mean1, sd = sd1)
}
c3 <- c1p5[seq(2, length(c1p5), 2)]
x3 <- c(3, 3, 1, 1)
for (i3 in c3) {
a1[i3, ] <- x3 + rnorm(4, mean = mean1, sd = sd1)
}
# Computing the clusters
res1 <- Cluster(var = a1, weights = array(1, dim = dim(a1)[2]), nclusters = 3)
print(res1$cluster)
print(res1$centers)
res2 <- Cluster(var = a1, weights = array(1, dim = dim(a1)[2]))
print(res2$cluster)
print(res2$centers)
}
\references{
Wilks, 2011, Statistical Methods in the Atmospheric Sciences, 3rd ed., Elsevire, pp 676.
}
context("s2dv::Cluster tests")
##############################################
# dat1
set.seed(1)
dat1 <- array(rnorm(100),
dim = c(sdate = 50, space = 2))
weights1 <- array(c(0.9, 1.1), dim = c(space = 2))
# dat2
set.seed(2)
dat2 <- array(rnorm(300),
dim = c(sdate = 50, lat = 2, lon = 3))
weights2 <- array(c(0.9, 1.1), dim = c(lat = 2, lon = 3))
##############################################
test_that("1. Input checks", {
# data
expect_error(
Cluster(c()),
"Parameter 'data' cannot be NULL."
)
expect_error(
Cluster(c(NA, NA)),
"Parameter 'data' must be a numeric array."
)
expect_error(
Cluster(array(1:10, dim = c(2, 5))),
"Parameter 'data' must have dimension names."
)
# weights
expect_error(
Cluster(dat1, weights = c()),
"Parameter 'weights' cannot be NULL."
)
expect_error(
Cluster(dat1, weights = 'lat'),
"Parameter 'weights' must be a numeric array."
)
expect_error(
Cluster(dat1, weights = 2),
"Parameter 'weights' must have dimension names."
)
expect_error(
Cluster(dat1, weights = array(2, dim = c(lat = 2))),
"Parameter 'weights' must have dimensions that can be found in 'data' dimensions."
)
# time_dim
expect_error(
Cluster(dat1, weights1, time_dim = 'a'),
"Parameter 'time_dim' is not found in 'data' dimension."
)
expect_error(
Cluster(array(c(1:25), dim = c(dat = 1, time = 5, space = 2)), weights1),
"Parameter 'time_dim' is not found in 'data' dimension."
)
expect_error(
Cluster(dat1, weights1, time_dim = 2),
"Parameter 'time_dim' must be a character string."
)
expect_error(
Cluster(dat1, weights1, time_dim = c('a', 'sdate')),
"Parameter 'time_dim' must be a character string."
)
# nclusters
expect_error(
Cluster(dat1, weights1, ncluster = 1),
"Parameter 'nclusters' must be an integer bigger than 1."
)
# index
expect_error(
Cluster(dat1, weights1, index = 1),
"Parameter 'index' should be a character strings accepted as 'index' by the function NbClust::NbClust."
)
# ncores
expect_error(
Cluster(dat1, weights1, ncore = 0),
"Parameter 'ncores' must be a positive integer."
)
})
##############################################
test_that("2. Output checks: dat1", {
# The output is random. Only check dimensions.
expect_equal(
length(Cluster(dat1, weights1)$cluster),
50
)
expect_equal(
dim(Cluster(dat1, weights1)$centers),
c(8, 2)
)
expect_equal(
dim(Cluster(dat1, weights1, nclusters = 3)$centers),
c(3, 2)
)
})
##############################################
test_that("3. Output checks: dat2", {
expect_equal(
length(Cluster(dat2, weights2)$cluster),
50
)
expect_equal(
dim(Cluster(dat2, weights2)$centers),
c(7, 6)
)
expect_equal(
dim(Cluster(dat2, weights2, nclusters = 5)$centers),
c(5, 6)
)
})
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment