From 911bdc204bbc4762b742ffc79d8d5c7a9ad29762 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 17 Mar 2021 13:49:35 +0100 Subject: [PATCH 1/9] Transform Cluster from s2dverification --- NAMESPACE | 3 + R/Cluster.R | 230 ++++++++++++++++++++++++++++++++++ man/Cluster.Rd | 130 +++++++++++++++++++ tests/testthat/test-Cluster.R | 120 ++++++++++++++++++ 4 files changed, 483 insertions(+) create mode 100644 R/Cluster.R create mode 100644 man/Cluster.Rd create mode 100644 tests/testthat/test-Cluster.R diff --git a/NAMESPACE b/NAMESPACE index b12cd82..23ed4ea 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ export(AMV) export(AnimateMap) export(Ano) export(Clim) +export(Cluster) export(ColorBar) export(Composite) export(ConfigAddEntry) @@ -51,6 +52,7 @@ export(Trend) export(clim.colors) export(clim.palette) import(GEOmap) +import(NbClust) import(bigmemory) import(geomapdata) import(graphics) @@ -85,6 +87,7 @@ importFrom(stats,acf) importFrom(stats,anova) importFrom(stats,confint) importFrom(stats,cor) +importFrom(stats,kmeans) importFrom(stats,lm) importFrom(stats,median) importFrom(stats,na.fail) diff --git a/R/Cluster.R b/R/Cluster.R new file mode 100644 index 0000000..5ccce7d --- /dev/null +++ b/R/Cluster.R @@ -0,0 +1,230 @@ +#'K-means Clustering +#' +#'Compute cluster centers and their time series of occurrences, with the +#'K-means clustering method using Euclidean distance, of an array of input data +#'with any number of dimensions that at least contain time_dim. +#'Specifically, it partitions the array along time axis in K groups or clusters +#'in which each space vector/array belongs to (i.e., is a member of) the +#'cluster with the nearest center or centroid. This function relies on the +#'NbClust package (Charrad et al., 2014 JSS). +#' +#'@param data A numeric array with named dimensions that at least have +#' 'time_dim' corresponding to time and the dimensions of 'weights' +#' corresponding to either area-averages over a series of domains or the grid +#' points for any sptial grid structure. +#'@param weights A numeric array with named dimension of multiplicative weights +#' based on the areas covering each domain/region or grid-cell of 'data'. The +#' dimensions must also be part of the 'data' dimensions. +#'@param time_dim A character string indicating the name of time dimension in +#' 'data'. The default value is 'sdate'. +#'@param nclusters A positive integer K that must be bigger than 1 indicating +#' the number of clusters to be computed, or K initial cluster centers to be +#' used in the method. The default is NULL, and users have to specify which +#' index from NbClust and the associated criteria for selecting the optimal +#' number of clusters will be used for K-means clustering of 'data'. +#'@param index A character string of the validity index from NbClust package +#' that can be used to determine optimal K if K is not specified with +#' 'nclusters'. The default value is 'sdindex' (Halkidi et al. 2001, JIIS). +#' Other indices available in NBClust are "kl", "ch", "hartigan", "ccc", +#' "scott", "marriot", "trcovw", "tracew", "friedman", "rubin", "cindex", "db", +#' "silhouette", "duda", "pseudot2", "beale", "ratkowsky", "ball", +#' "ptbiserial", "gap", "frey", "mcclain", "gamma", "gplus", "tau", "dunn", +#' "hubert", "sdindex", and "sdbw".\n +#' One can also use all of them with the option 'alllong' or almost all indices +# except gap, gamma, gplus and tau with 'all', when the optimal number of +#' clusters K is detremined by the majority rule (the maximum of histogram of +#' the results of all indices with finite solutions). Use of some indices on +#' a big and/or unstructured dataset can be computationally intense and/or +#' could lead to numerical singularity. +#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. +#' +#'@return +#'A list containing: +#'\item{$cluster}{ +#' An integer vector of the occurrence of a cluster along time, i.e., when +#' certain data member in time is allocated to a specific cluster. +#'} +#'\item{$centers}{ +#' A matrix of cluster centres or centroids (e.g. [1:K, 1:spatial degrees of freedom]). +#'} +#'\item{$totss}{ +#' A number of the total sum of squares. +#'} +#'\item{$withinss}{ +#' A vector of within-cluster sum of squares, one component per cluster. +#'} +#'\item{$tot.withinss}{ +#' A number of the total within-cluster sum of squares, i.e., sum(withinss). +#'} +#'\item{$betweenss}{ +#' A number of the between-cluster sum of squares, i.e. totss-tot.withinss. +#'} +#'\item{$size}{ +#' A vector of the number of points in each cluster. +#'} +#'\item{$iter}{ +#' An interger as the number of (outer) iterations. +#'} +#'\item{$ifault}{ +#' An integer as an indicator of a possible algorithm problem. +#'} +#' +#'@references +#'Wilks, 2011, Statistical Methods in the Atmospheric Sciences, 3rd ed., Elsevire, pp 676. +#' +#'@examples +#'# Generating synthetic data +#'a1 <- array(dim = c(200, 4)) +#'mean1 <- 0 +#'sd1 <- 0.3 +#' +#'c0 <- seq(1, 200) +#'c1 <- sort(sample(x = 1:200, size = sample(x = 50:150, size = 1), replace = FALSE)) +#'x1 <- c(1, 1, 1, 1) +#'for (i1 in c1) { +#' a1[i1, ] <- x1 + rnorm(4, mean = mean1, sd = sd1) +#'} +#' +#'c1p5 <- c0[!(c0 %in% c1)] +#'c2 <- c1p5[seq(1, length(c1p5), 2)] +#'x2 <- c(2, 2, 4, 4) +#'for (i2 in c2) { +#' a1[i2, ] <- x2 + rnorm(4, mean = mean1, sd = sd1) +#'} +#' +#'c3 <- c1p5[seq(2, length(c1p5), 2)] +#'x3 <- c(3, 3, 1, 1) +#'for (i3 in c3) { +#' a1[i3, ] <- x3 + rnorm(4, mean = mean1, sd = sd1) +#'} +#' +#'# Computing the clusters +#'res1 <- Cluster(var = a1, weights = array(1, dim = dim(a1)[2]), nclusters = 3) +#'print(res1$cluster) +#'print(res1$centers) +#' +#'res2 <- Cluster(var = a1, weights = array(1, dim = dim(a1)[2])) +#'print(res2$cluster) +#'print(res2$centers) +#' +#'@import NbClust multiApply +#'@importFrom abind abind +#'@importFrom stats kmeans +#'@importFrom grDevices pdf dev.off +#'@export +Cluster <- function(data, weights, time_dim = 'sdate', + nclusters = NULL, index = 'sdindex', ncores = NULL) { + # Check inputs + ## data + if (is.null(data)) { + stop("Parameter 'data' cannot be NULL.") + } + if (!is.numeric(data)) { + stop("Parameter 'data' must be a numeric array.") + } + if (is.null(dim(data))) { #is vector + dim(data) <- c(length(data)) + names(dim(data)) <- time_dim + } + if(any(is.null(names(dim(data))))| any(nchar(names(dim(data))) == 0)) { + stop("Parameter 'data' must have dimension names.") + } + + ## weights + if (is.null(weights)) { + stop("Parameter 'weights' cannot be NULL.") + } + if (!is.numeric(weights)) { + stop("Parameter 'weights' must be a numeric array.") + } + if (is.null(dim(weights))) { #is vector + dim(weights) <- c(length(weights)) + } + if(any(is.null(names(dim(weights))))| any(nchar(names(dim(weights))) == 0)) { + stop("Parameter 'weights' must have dimension names.") + } + if (any(!names(dim(weights)) %in% names(dim(data)) | + !dim(weights) %in% dim(data))) { + stop("Parameter 'weights' must have dimensions that can be found in 'data' dimensions.") + } + + ## time_dim + if (!is.character(time_dim) | length(time_dim) > 1) { + stop("Parameter 'time_dim' must be a character string.") + } + if (!time_dim %in% names(dim(data))) { + stop("Parameter 'time_dim' is not found in 'data' dimensions.") + } + + ## nclusters + if (!is.null(nclusters)) { + if (!is.numeric(nclusters) | length(nclusters) != 1) { + stop("Parameter 'nclusters' must be an integer bigger than 1.") + } else if (nclusters <= 1) { + stop("Parameter 'nclusters' must be an integer bigger than 1.") + } + } + + ## index + if (!is.character(index) | length(index) > 1) { + stop("Parameter 'index' should be a character strings accepted as 'index' by the function NbClust::NbClust.") + } + + ## ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | + length(ncores) > 1) { + stop("Parameter 'ncores' must be a positive integer.") + } + } + + ############################### + # Calculate Cluster + + output <- Apply(list(data), + target_dims = c(time_dim, names(dim(weights))), + fun = .Cluster, + weights = weights, nclusters = nclusters, index = index, + ncores = ncores) + + return(output) +} + +.Cluster <- function(data, weights, nclusters = NULL, index = 'sdindex') { + # data: [time, lat, lon] + dat_dim <- dim(data) + + # Reshape data into two dims + dim(data) <- c(dat_dim[1], prod(dat_dim[-1])) + dim(weights) <- prod(dim(weights)) # a vector + + # weights + data_list <- lapply(1:dat_dim[1], + function(x) { data[x, ] * weights }) + data <- do.call(abind::abind, c(data_list, along = 0)) + + if (!is.null(nclusters)) { + kmeans.results <- kmeans(data, centers = nclusters, iter.max = 300, + nstart = 30) + } else { + pdf(file = NULL) + nbclust.results <- NbClust::NbClust(data, distance = 'euclidean', + min.nc = 2, max.nc = 20, + method = 'kmeans', index = index) + dev.off() + + if (index == 'all' || index == 'alllong') { + kmc <- hist(nbclust.results$Best.nc[1, ], breaks = seq(0, 20), + plot = FALSE)$counts + kmc1 <- which(kmc == max(kmc)) + } else { + kmc1 <- nbclust.results$Best.nc[1] + } + + kmeans.results <- kmeans(data, centers = kmc1, iter.max = 300, + nstart = 30) + } + + invisible(kmeans.results) +} diff --git a/man/Cluster.Rd b/man/Cluster.Rd new file mode 100644 index 0000000..2750e6a --- /dev/null +++ b/man/Cluster.Rd @@ -0,0 +1,130 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Cluster.R +\name{Cluster} +\alias{Cluster} +\title{K-means Clustering} +\usage{ +Cluster( + data, + weights, + time_dim = "sdate", + nclusters = NULL, + index = "sdindex", + ncores = NULL +) +} +\arguments{ +\item{data}{A numeric array with named dimensions that at least have +'time_dim' corresponding to time and the dimensions of 'weights' +corresponding to either area-averages over a series of domains or the grid +points for any sptial grid structure.} + +\item{weights}{A numeric array with named dimension of multiplicative weights +based on the areas covering each domain/region or grid-cell of 'data'. The +dimensions must also be part of the 'data' dimensions.} + +\item{time_dim}{A character string indicating the name of time dimension in +'data'. The default value is 'sdate'.} + +\item{nclusters}{A positive integer K that must be bigger than 1 indicating +the number of clusters to be computed, or K initial cluster centers to be +used in the method. The default is NULL, and users have to specify which +index from NbClust and the associated criteria for selecting the optimal +number of clusters will be used for K-means clustering of 'data'.} + +\item{index}{A character string of the validity index from NbClust package +that can be used to determine optimal K if K is not specified with +'nclusters'. The default value is 'sdindex' (Halkidi et al. 2001, JIIS). +Other indices available in NBClust are "kl", "ch", "hartigan", "ccc", +"scott", "marriot", "trcovw", "tracew", "friedman", "rubin", "cindex", "db", +"silhouette", "duda", "pseudot2", "beale", "ratkowsky", "ball", +"ptbiserial", "gap", "frey", "mcclain", "gamma", "gplus", "tau", "dunn", +"hubert", "sdindex", and "sdbw".\n +One can also use all of them with the option 'alllong' or almost all indices +clusters K is detremined by the majority rule (the maximum of histogram of +the results of all indices with finite solutions). Use of some indices on +a big and/or unstructured dataset can be computationally intense and/or +could lead to numerical singularity.} + +\item{ncores}{An integer indicating the number of cores to use for parallel +computation. The default value is NULL.} +} +\value{ +A list containing: +\item{$cluster}{ + An integer vector of the occurrence of a cluster along time, i.e., when + certain data member in time is allocated to a specific cluster. +} +\item{$centers}{ + A matrix of cluster centres or centroids (e.g. [1:K, 1:spatial degrees of freedom]). +} +\item{$totss}{ + A number of the total sum of squares. +} +\item{$withinss}{ + A vector of within-cluster sum of squares, one component per cluster. +} +\item{$tot.withinss}{ + A number of the total within-cluster sum of squares, i.e., sum(withinss). +} +\item{$betweenss}{ + A number of the between-cluster sum of squares, i.e. totss-tot.withinss. +} +\item{$size}{ + A vector of the number of points in each cluster. +} +\item{$iter}{ + An interger as the number of (outer) iterations. +} +\item{$ifault}{ + An integer as an indicator of a possible algorithm problem. +} +} +\description{ +Compute cluster centers and their time series of occurrences, with the +K-means clustering method using Euclidean distance, of an array of input data +with any number of dimensions that at least contain time_dim. +Specifically, it partitions the array along time axis in K groups or clusters +in which each space vector/array belongs to (i.e., is a member of) the +cluster with the nearest center or centroid. This function relies on the +NbClust package (Charrad et al., 2014 JSS). +} +\examples{ +# Generating synthetic data +a1 <- array(dim = c(200, 4)) +mean1 <- 0 +sd1 <- 0.3 + +c0 <- seq(1, 200) +c1 <- sort(sample(x = 1:200, size = sample(x = 50:150, size = 1), replace = FALSE)) +x1 <- c(1, 1, 1, 1) +for (i1 in c1) { + a1[i1, ] <- x1 + rnorm(4, mean = mean1, sd = sd1) +} + +c1p5 <- c0[!(c0 \%in\% c1)] +c2 <- c1p5[seq(1, length(c1p5), 2)] +x2 <- c(2, 2, 4, 4) +for (i2 in c2) { + a1[i2, ] <- x2 + rnorm(4, mean = mean1, sd = sd1) +} + +c3 <- c1p5[seq(2, length(c1p5), 2)] +x3 <- c(3, 3, 1, 1) +for (i3 in c3) { + a1[i3, ] <- x3 + rnorm(4, mean = mean1, sd = sd1) +} + +# Computing the clusters +res1 <- Cluster(var = a1, weights = array(1, dim = dim(a1)[2]), nclusters = 3) +print(res1$cluster) +print(res1$centers) + +res2 <- Cluster(var = a1, weights = array(1, dim = dim(a1)[2])) +print(res2$cluster) +print(res2$centers) + +} +\references{ +Wilks, 2011, Statistical Methods in the Atmospheric Sciences, 3rd ed., Elsevire, pp 676. +} diff --git a/tests/testthat/test-Cluster.R b/tests/testthat/test-Cluster.R new file mode 100644 index 0000000..be44f3f --- /dev/null +++ b/tests/testthat/test-Cluster.R @@ -0,0 +1,120 @@ +context("s2dv::Cluster tests") + +############################################## + # dat1 + set.seed(1) + dat1 <- array(rnorm(100), + dim = c(sdate = 50, space = 2)) + weights1 <- array(c(0.9, 1.1), dim = c(space = 2)) + + # dat2 + set.seed(2) + dat2 <- array(rnorm(300), + dim = c(sdate = 50, lat = 2, lon = 3)) + weights2 <- array(c(0.9, 1.1), dim = c(lat = 2, lon = 3)) + + +############################################## + +test_that("1. Input checks", { + # data + expect_error( + Cluster(c()), + "Parameter 'data' cannot be NULL." + ) + expect_error( + Cluster(c(NA, NA)), + "Parameter 'data' must be a numeric array." + ) + expect_error( + Cluster(array(1:10, dim = c(2, 5))), + "Parameter 'data' must have dimension names." + ) + # weights + expect_error( + Cluster(dat1, weights = c()), + "Parameter 'weights' cannot be NULL." + ) + expect_error( + Cluster(dat1, weights = 'lat'), + "Parameter 'weights' must be a numeric array." + ) + expect_error( + Cluster(dat1, weights = 2), + "Parameter 'weights' must have dimension names." + ) + expect_error( + Cluster(dat1, weights = array(2, dim = c(lat = 2))), + "Parameter 'weights' must have dimensions that can be found in 'data' dimensions." + ) + # time_dim + expect_error( + Cluster(dat1, weights1, time_dim = 'a'), + "Parameter 'time_dim' is not found in 'data' dimension." + ) + expect_error( + Cluster(array(c(1:25), dim = c(dat = 1, time = 5, space = 2)), weights1), + "Parameter 'time_dim' is not found in 'data' dimension." + ) + expect_error( + Cluster(dat1, weights1, time_dim = 2), + "Parameter 'time_dim' must be a character string." + ) + expect_error( + Cluster(dat1, weights1, time_dim = c('a', 'sdate')), + "Parameter 'time_dim' must be a character string." + ) + # nclusters + expect_error( + Cluster(dat1, weights1, ncluster = 1), + "Parameter 'nclusters' must be an integer bigger than 1." + ) + # index + expect_error( + Cluster(dat1, weights1, index = 1), + "Parameter 'index' should be a character strings accepted as 'index' by the function NbClust::NbClust." + ) + # ncores + expect_error( + Cluster(dat1, weights1, ncore = 0), + "Parameter 'ncores' must be a positive integer." + ) +}) + +############################################## +test_that("2. Output checks: dat1", { +# The output is random. Only check dimensions. + expect_equal( + length(Cluster(dat1, weights1)$cluster), + 50 + ) + expect_equal( + dim(Cluster(dat1, weights1)$centers), + c(8, 2) + ) + expect_equal( + dim(Cluster(dat1, weights1, nclusters = 3)$centers), + c(3, 2) + ) + +}) + + +############################################## +test_that("3. Output checks: dat2", { + + expect_equal( + length(Cluster(dat2, weights2)$cluster), + 50 + ) + expect_equal( + dim(Cluster(dat2, weights2)$centers), + c(7, 6) + ) + expect_equal( + dim(Cluster(dat2, weights2, nclusters = 5)$centers), + c(5, 6) + ) + +}) + -- GitLab From 684720af5dc960da1c9193a53cf67e0f1b895225 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 17 Mar 2021 13:53:12 +0100 Subject: [PATCH 2/9] Fix syntax --- R/Cluster.R | 2 +- man/Cluster.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/Cluster.R b/R/Cluster.R index 5ccce7d..4e57fbf 100644 --- a/R/Cluster.R +++ b/R/Cluster.R @@ -29,7 +29,7 @@ #' "scott", "marriot", "trcovw", "tracew", "friedman", "rubin", "cindex", "db", #' "silhouette", "duda", "pseudot2", "beale", "ratkowsky", "ball", #' "ptbiserial", "gap", "frey", "mcclain", "gamma", "gplus", "tau", "dunn", -#' "hubert", "sdindex", and "sdbw".\n +#' "hubert", "sdindex", and "sdbw". #' One can also use all of them with the option 'alllong' or almost all indices # except gap, gamma, gplus and tau with 'all', when the optimal number of #' clusters K is detremined by the majority rule (the maximum of histogram of diff --git a/man/Cluster.Rd b/man/Cluster.Rd index 2750e6a..bca9872 100644 --- a/man/Cluster.Rd +++ b/man/Cluster.Rd @@ -39,7 +39,7 @@ Other indices available in NBClust are "kl", "ch", "hartigan", "ccc", "scott", "marriot", "trcovw", "tracew", "friedman", "rubin", "cindex", "db", "silhouette", "duda", "pseudot2", "beale", "ratkowsky", "ball", "ptbiserial", "gap", "frey", "mcclain", "gamma", "gplus", "tau", "dunn", -"hubert", "sdindex", and "sdbw".\n +"hubert", "sdindex", and "sdbw". One can also use all of them with the option 'alllong' or almost all indices clusters K is detremined by the majority rule (the maximum of histogram of the results of all indices with finite solutions). Use of some indices on -- GitLab From f4ae59590c0f6485a93e8584d237b9b3d8de3f5f Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 17 Mar 2021 13:57:01 +0100 Subject: [PATCH 3/9] Fix bad format --- R/Persistence.R | 14 +++++++------- man/Persistence.Rd | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/R/Persistence.R b/R/Persistence.R index f569e63..1840927 100644 --- a/R/Persistence.R +++ b/R/Persistence.R @@ -39,32 +39,32 @@ #' #'@return #'A list containing: -#'\item{$persistence} { +#'\item{$persistence}{ #' A numeric array with dimensions 'memb', time (start dates), latitudes and longitudes #' containing the persistence forecast. #'} -#'\item{$persistence.mean} { +#'\item{$persistence.mean}{ #' A numeric array with same dimensions as 'persistence', except the 'memb' dimension #' which is of length 1, containing the ensemble mean persistence forecast. #'} -#'\item{$persistence.predint} { +#'\item{$persistence.predint}{ #' A numeric array with same dimensions as 'persistence', except the 'memb' dimension #' which is of length 1, containing the prediction interval of the persistence forecast. #'} -#'\item{$AR.slope} { +#'\item{$AR.slope}{ #' A numeric array with same dimensions as 'persistence', except the 'memb' dimension #' which is of length 1, containing the slope coefficient of the autoregression. #'} -#'\item{$AR.intercept} { +#'\item{$AR.intercept}{ #' A numeric array with same dimensions as 'persistence', except the 'memb' dimension #' which is of length 1, containing the intercept coefficient of the autoregression. #'} -#'\item{$AR.lowCI} { +#'\item{$AR.lowCI}{ #' A numeric array with same dimensions as 'persistence', except the 'memb' dimension #' which is of length 1, containing the lower value of the confidence interval of the #' autoregression. #'} -#'\item{$AR.highCI} { +#'\item{$AR.highCI}{ #' A numeric array with same dimensions as 'persistence', except the 'memb' dimension #' which is of length 1, containing the upper value of the confidence interval of the #' autoregression. diff --git a/man/Persistence.Rd b/man/Persistence.Rd index 84b8464..b2bd276 100644 --- a/man/Persistence.Rd +++ b/man/Persistence.Rd @@ -64,32 +64,32 @@ computation. The default value is NULL.} } \value{ A list containing: -\item{$persistence} { +\item{$persistence}{ A numeric array with dimensions 'memb', time (start dates), latitudes and longitudes containing the persistence forecast. } -\item{$persistence.mean} { +\item{$persistence.mean}{ A numeric array with same dimensions as 'persistence', except the 'memb' dimension which is of length 1, containing the ensemble mean persistence forecast. } -\item{$persistence.predint} { +\item{$persistence.predint}{ A numeric array with same dimensions as 'persistence', except the 'memb' dimension which is of length 1, containing the prediction interval of the persistence forecast. } -\item{$AR.slope} { +\item{$AR.slope}{ A numeric array with same dimensions as 'persistence', except the 'memb' dimension which is of length 1, containing the slope coefficient of the autoregression. } -\item{$AR.intercept} { +\item{$AR.intercept}{ A numeric array with same dimensions as 'persistence', except the 'memb' dimension which is of length 1, containing the intercept coefficient of the autoregression. } -\item{$AR.lowCI} { +\item{$AR.lowCI}{ A numeric array with same dimensions as 'persistence', except the 'memb' dimension which is of length 1, containing the lower value of the confidence interval of the autoregression. } -\item{$AR.highCI} { +\item{$AR.highCI}{ A numeric array with same dimensions as 'persistence', except the 'memb' dimension which is of length 1, containing the upper value of the confidence interval of the autoregression. -- GitLab From b74b50a759c1f34c1920efdec31adbc67eabc71b Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 17 Mar 2021 14:06:43 +0100 Subject: [PATCH 4/9] Add NbClust --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 30fd237..f8f5567 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -35,6 +35,7 @@ Imports: stats, plyr, ncdf4, + NbClust, multiApply (>= 2.1.1) Suggests: easyVerification, -- GitLab From 2679bbb364d664041473506b00418474e09d859b Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 17 Mar 2021 14:06:54 +0100 Subject: [PATCH 5/9] Revise example --- R/Cluster.R | 9 ++------- man/Cluster.Rd | 9 ++------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/R/Cluster.R b/R/Cluster.R index 4e57fbf..24f2d86 100644 --- a/R/Cluster.R +++ b/R/Cluster.R @@ -100,13 +100,8 @@ #'} #' #'# Computing the clusters -#'res1 <- Cluster(var = a1, weights = array(1, dim = dim(a1)[2]), nclusters = 3) -#'print(res1$cluster) -#'print(res1$centers) -#' -#'res2 <- Cluster(var = a1, weights = array(1, dim = dim(a1)[2])) -#'print(res2$cluster) -#'print(res2$centers) +#'res1 <- Cluster(data = a1, weights = array(1, dim = dim(a1)[2]), nclusters = 3) +#'res2 <- Cluster(data = a1, weights = array(1, dim = dim(a1)[2])) #' #'@import NbClust multiApply #'@importFrom abind abind diff --git a/man/Cluster.Rd b/man/Cluster.Rd index bca9872..2449137 100644 --- a/man/Cluster.Rd +++ b/man/Cluster.Rd @@ -116,13 +116,8 @@ for (i3 in c3) { } # Computing the clusters -res1 <- Cluster(var = a1, weights = array(1, dim = dim(a1)[2]), nclusters = 3) -print(res1$cluster) -print(res1$centers) - -res2 <- Cluster(var = a1, weights = array(1, dim = dim(a1)[2])) -print(res2$cluster) -print(res2$centers) +res1 <- Cluster(data = a1, weights = array(1, dim = dim(a1)[2]), nclusters = 3) +res2 <- Cluster(data = a1, weights = array(1, dim = dim(a1)[2])) } \references{ -- GitLab From bfa18f5eb3f087599cf123154fe5764a72cb4a72 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 17 Mar 2021 14:13:29 +0100 Subject: [PATCH 6/9] Fix example --- R/Cluster.R | 1 + man/Cluster.Rd | 1 + 2 files changed, 2 insertions(+) diff --git a/R/Cluster.R b/R/Cluster.R index 24f2d86..015b65c 100644 --- a/R/Cluster.R +++ b/R/Cluster.R @@ -100,6 +100,7 @@ #'} #' #'# Computing the clusters +#'names(dim(a1)) <- c('sdate', 'space') #'res1 <- Cluster(data = a1, weights = array(1, dim = dim(a1)[2]), nclusters = 3) #'res2 <- Cluster(data = a1, weights = array(1, dim = dim(a1)[2])) #' diff --git a/man/Cluster.Rd b/man/Cluster.Rd index 2449137..ce0e5ad 100644 --- a/man/Cluster.Rd +++ b/man/Cluster.Rd @@ -116,6 +116,7 @@ for (i3 in c3) { } # Computing the clusters +names(dim(a1)) <- c('sdate', 'space') res1 <- Cluster(data = a1, weights = array(1, dim = dim(a1)[2]), nclusters = 3) res2 <- Cluster(data = a1, weights = array(1, dim = dim(a1)[2])) -- GitLab From 08ba27ce3e14a3627d1ca6d7e2a62812e7310331 Mon Sep 17 00:00:00 2001 From: aho Date: Tue, 6 Apr 2021 19:51:40 +0200 Subject: [PATCH 7/9] Improve documentation --- R/Cluster.R | 13 ++++++++----- man/Cluster.Rd | 13 ++++++++----- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/R/Cluster.R b/R/Cluster.R index 015b65c..af0901d 100644 --- a/R/Cluster.R +++ b/R/Cluster.R @@ -5,8 +5,10 @@ #'with any number of dimensions that at least contain time_dim. #'Specifically, it partitions the array along time axis in K groups or clusters #'in which each space vector/array belongs to (i.e., is a member of) the -#'cluster with the nearest center or centroid. This function relies on the -#'NbClust package (Charrad et al., 2014 JSS). +#'cluster with the nearest center or centroid. This function is a wrapper of +#'kmeans() and relies on the NbClust package (Charrad et al., 2014 JSS) to +#'determine the optimal number of clusters used for K-means clustering if it is +#'not provided by users. #' #'@param data A numeric array with named dimensions that at least have #' 'time_dim' corresponding to time and the dimensions of 'weights' @@ -19,9 +21,10 @@ #' 'data'. The default value is 'sdate'. #'@param nclusters A positive integer K that must be bigger than 1 indicating #' the number of clusters to be computed, or K initial cluster centers to be -#' used in the method. The default is NULL, and users have to specify which -#' index from NbClust and the associated criteria for selecting the optimal -#' number of clusters will be used for K-means clustering of 'data'. +#' used in the method. The default value is NULL, which means that the number +#' of clusters will be determined by NbClust(). The parameter 'index' +#' therefore needs to be specified for NbClust() to find the optimal number of +#' clusters to be used for K-means clustering calculation. #'@param index A character string of the validity index from NbClust package #' that can be used to determine optimal K if K is not specified with #' 'nclusters'. The default value is 'sdindex' (Halkidi et al. 2001, JIIS). diff --git a/man/Cluster.Rd b/man/Cluster.Rd index ce0e5ad..5a05c5d 100644 --- a/man/Cluster.Rd +++ b/man/Cluster.Rd @@ -28,9 +28,10 @@ dimensions must also be part of the 'data' dimensions.} \item{nclusters}{A positive integer K that must be bigger than 1 indicating the number of clusters to be computed, or K initial cluster centers to be -used in the method. The default is NULL, and users have to specify which -index from NbClust and the associated criteria for selecting the optimal -number of clusters will be used for K-means clustering of 'data'.} +used in the method. The default value is NULL, which means that the number +of clusters will be determined by NbClust(). The parameter 'index' +therefore needs to be specified for NbClust() to find the optimal number of +clusters to be used for K-means clustering calculation.} \item{index}{A character string of the validity index from NbClust package that can be used to determine optimal K if K is not specified with @@ -86,8 +87,10 @@ K-means clustering method using Euclidean distance, of an array of input data with any number of dimensions that at least contain time_dim. Specifically, it partitions the array along time axis in K groups or clusters in which each space vector/array belongs to (i.e., is a member of) the -cluster with the nearest center or centroid. This function relies on the -NbClust package (Charrad et al., 2014 JSS). +cluster with the nearest center or centroid. This function is a wrapper of +kmeans() and relies on the NbClust package (Charrad et al., 2014 JSS) to +determine the optimal number of clusters used for K-means clustering if it is +not provided by users. } \examples{ # Generating synthetic data -- GitLab From 127814c33ea70eb837f4fccbb22e68061293ce12 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 7 Apr 2021 19:05:34 +0200 Subject: [PATCH 8/9] Set 'weights' default to NULL and add parameter 'space_dim' --- R/Cluster.R | 93 ++++++++++++++++++++++------------- man/Cluster.Rd | 15 ++++-- tests/testthat/test-Cluster.R | 17 +++++-- 3 files changed, 81 insertions(+), 44 deletions(-) diff --git a/R/Cluster.R b/R/Cluster.R index af0901d..36534a2 100644 --- a/R/Cluster.R +++ b/R/Cluster.R @@ -11,14 +11,17 @@ #'not provided by users. #' #'@param data A numeric array with named dimensions that at least have -#' 'time_dim' corresponding to time and the dimensions of 'weights' -#' corresponding to either area-averages over a series of domains or the grid -#' points for any sptial grid structure. +#' 'time_dim' corresponding to time and 'space_dim' (optional) corresponding +#' to either area-averages over a series of domains or the grid points for any +#' sptial grid structure. #'@param weights A numeric array with named dimension of multiplicative weights #' based on the areas covering each domain/region or grid-cell of 'data'. The -#' dimensions must also be part of the 'data' dimensions. +#' dimensions must be equal to the 'space_dim' in 'data'. The default value is +#' NULL which means no weighting is applied. #'@param time_dim A character string indicating the name of time dimension in #' 'data'. The default value is 'sdate'. +#'@param space_dim A character vector indicating the names of spatial dimensions +#' in 'data'. The default value is NULL. #'@param nclusters A positive integer K that must be bigger than 1 indicating #' the number of clusters to be computed, or K initial cluster centers to be #' used in the method. The default value is NULL, which means that the number @@ -112,7 +115,7 @@ #'@importFrom stats kmeans #'@importFrom grDevices pdf dev.off #'@export -Cluster <- function(data, weights, time_dim = 'sdate', +Cluster <- function(data, weights = NULL, time_dim = 'sdate', space_dim = NULL, nclusters = NULL, index = 'sdindex', ncores = NULL) { # Check inputs ## data @@ -131,23 +134,21 @@ Cluster <- function(data, weights, time_dim = 'sdate', } ## weights - if (is.null(weights)) { - stop("Parameter 'weights' cannot be NULL.") - } - if (!is.numeric(weights)) { - stop("Parameter 'weights' must be a numeric array.") - } - if (is.null(dim(weights))) { #is vector - dim(weights) <- c(length(weights)) - } - if(any(is.null(names(dim(weights))))| any(nchar(names(dim(weights))) == 0)) { - stop("Parameter 'weights' must have dimension names.") - } - if (any(!names(dim(weights)) %in% names(dim(data)) | - !dim(weights) %in% dim(data))) { - stop("Parameter 'weights' must have dimensions that can be found in 'data' dimensions.") + if (!is.null(weights)) { + if (!is.numeric(weights)) { + stop("Parameter 'weights' must be a numeric array.") + } + if (is.null(dim(weights))) { #is vector + dim(weights) <- c(length(weights)) + } + if (any(is.null(names(dim(weights))))| any(nchar(names(dim(weights))) == 0)) { + stop("Parameter 'weights' must have dimension names.") + } + if (any(!names(dim(weights)) %in% names(dim(data)) | + !dim(weights) %in% dim(data))) { + stop("Parameter 'weights' must have dimensions that can be found in 'data' dimensions.") + } } - ## time_dim if (!is.character(time_dim) | length(time_dim) > 1) { stop("Parameter 'time_dim' must be a character string.") @@ -155,7 +156,28 @@ Cluster <- function(data, weights, time_dim = 'sdate', if (!time_dim %in% names(dim(data))) { stop("Parameter 'time_dim' is not found in 'data' dimensions.") } - + ## space_dim + if (!is.null(space_dim)) { + if (!is.character(space_dim)) { + stop("Parameter 'space_dim' must be a character vector.") + } + if (any(!space_dim %in% names(dim(data)))) { + stop("Parameter 'space_dim' is not found in 'data' dimensions.") + } + if (!is.null(weights)) { + if (!(length(space_dim) == length(dim(weights)) & all(space_dim %in% names(dim(weights))))) { + stop("Parameter 'weights' must have dimension names the same as 'space_dim'.") + } + if (space_dim != names(dim(weights))) { + space_dim <- names(dim(weights)) + } + } + } + if (is.null(space_dim) & !is.null(weights)) { + space_dim <- names(dim(weights)) + .warning(paste0("Parameter 'weights' is assigned but not 'space_dim'. Define 'space_dim' ", + "by the dimensions of 'weights'.")) + } ## nclusters if (!is.null(nclusters)) { if (!is.numeric(nclusters) | length(nclusters) != 1) { @@ -182,7 +204,7 @@ Cluster <- function(data, weights, time_dim = 'sdate', # Calculate Cluster output <- Apply(list(data), - target_dims = c(time_dim, names(dim(weights))), + target_dims = c(time_dim, space_dim), fun = .Cluster, weights = weights, nclusters = nclusters, index = index, ncores = ncores) @@ -190,18 +212,22 @@ Cluster <- function(data, weights, time_dim = 'sdate', return(output) } -.Cluster <- function(data, weights, nclusters = NULL, index = 'sdindex') { - # data: [time, lat, lon] +.Cluster <- function(data, weights = NULL, nclusters = NULL, index = 'sdindex') { + # data: [time, (lat, lon)] dat_dim <- dim(data) - # Reshape data into two dims - dim(data) <- c(dat_dim[1], prod(dat_dim[-1])) - dim(weights) <- prod(dim(weights)) # a vector - - # weights - data_list <- lapply(1:dat_dim[1], - function(x) { data[x, ] * weights }) - data <- do.call(abind::abind, c(data_list, along = 0)) + if (length(dim(data)) != 1) { + # Reshape data into two dims + dim(data) <- c(dat_dim[1], prod(dat_dim[-1])) + + # weights + if (!is.null(weights)) { + dim(weights) <- prod(dim(weights)) # a vector + data_list <- lapply(1:dat_dim[1], + function(x) { data[x, ] * weights }) + data <- do.call(abind::abind, c(data_list, along = 0)) + } + } if (!is.null(nclusters)) { kmeans.results <- kmeans(data, centers = nclusters, iter.max = 300, @@ -224,6 +250,5 @@ Cluster <- function(data, weights, time_dim = 'sdate', kmeans.results <- kmeans(data, centers = kmc1, iter.max = 300, nstart = 30) } - invisible(kmeans.results) } diff --git a/man/Cluster.Rd b/man/Cluster.Rd index 5a05c5d..0c1f59c 100644 --- a/man/Cluster.Rd +++ b/man/Cluster.Rd @@ -6,8 +6,9 @@ \usage{ Cluster( data, - weights, + weights = NULL, time_dim = "sdate", + space_dim = NULL, nclusters = NULL, index = "sdindex", ncores = NULL @@ -15,17 +16,21 @@ Cluster( } \arguments{ \item{data}{A numeric array with named dimensions that at least have -'time_dim' corresponding to time and the dimensions of 'weights' -corresponding to either area-averages over a series of domains or the grid -points for any sptial grid structure.} +'time_dim' corresponding to time and 'space_dim' (optional) corresponding +to either area-averages over a series of domains or the grid points for any +sptial grid structure.} \item{weights}{A numeric array with named dimension of multiplicative weights based on the areas covering each domain/region or grid-cell of 'data'. The -dimensions must also be part of the 'data' dimensions.} +dimensions must be equal to the 'space_dim' in 'data'. The default value is +NULL which means no weighting is applied.} \item{time_dim}{A character string indicating the name of time dimension in 'data'. The default value is 'sdate'.} +\item{space_dim}{A character vector indicating the names of spatial dimensions +in 'data'. The default value is NULL.} + \item{nclusters}{A positive integer K that must be bigger than 1 indicating the number of clusters to be computed, or K initial cluster centers to be used in the method. The default value is NULL, which means that the number diff --git a/tests/testthat/test-Cluster.R b/tests/testthat/test-Cluster.R index be44f3f..9071cd8 100644 --- a/tests/testthat/test-Cluster.R +++ b/tests/testthat/test-Cluster.R @@ -13,7 +13,6 @@ context("s2dv::Cluster tests") dim = c(sdate = 50, lat = 2, lon = 3)) weights2 <- array(c(0.9, 1.1), dim = c(lat = 2, lon = 3)) - ############################################## test_that("1. Input checks", { @@ -32,10 +31,6 @@ test_that("1. Input checks", { ) # weights expect_error( - Cluster(dat1, weights = c()), - "Parameter 'weights' cannot be NULL." - ) - expect_error( Cluster(dat1, weights = 'lat'), "Parameter 'weights' must be a numeric array." ) @@ -88,6 +83,10 @@ test_that("2. Output checks: dat1", { length(Cluster(dat1, weights1)$cluster), 50 ) + expect_equal( + length(Cluster(dat1)$cluster), + 100 + ) expect_equal( dim(Cluster(dat1, weights1)$centers), c(8, 2) @@ -107,6 +106,14 @@ test_that("3. Output checks: dat2", { length(Cluster(dat2, weights2)$cluster), 50 ) + expect_equal( + length(Cluster(dat2)$cluster), + 300 + ) + expect_equal( + length(Cluster(dat2, space_dim = c('lon', 'lat'))$cluster), + 50 + ) expect_equal( dim(Cluster(dat2, weights2)$centers), c(7, 6) -- GitLab From b503ce0b0550b6bba40c8494838bc0b0b28b19e6 Mon Sep 17 00:00:00 2001 From: aho Date: Wed, 7 Apr 2021 20:02:57 +0200 Subject: [PATCH 9/9] Revise the documentation about return values --- R/Cluster.R | 33 +++++++++++++++++++++++---------- man/Cluster.Rd | 33 +++++++++++++++++++++++---------- 2 files changed, 46 insertions(+), 20 deletions(-) diff --git a/R/Cluster.R b/R/Cluster.R index 36534a2..33527ae 100644 --- a/R/Cluster.R +++ b/R/Cluster.R @@ -48,32 +48,45 @@ #'@return #'A list containing: #'\item{$cluster}{ -#' An integer vector of the occurrence of a cluster along time, i.e., when -#' certain data member in time is allocated to a specific cluster. +#' An integer array of the occurrence of a cluster along time, i.e., when +#' certain data member in time is allocated to a specific cluster. The dimensions +#' are same as 'data' without 'space_dim'. #'} #'\item{$centers}{ -#' A matrix of cluster centres or centroids (e.g. [1:K, 1:spatial degrees of freedom]). +#' A nemeric array of cluster centres or centroids (e.g. [1:K, 1:spatial degrees +#' of freedom]). The rest dimensions are same as 'data' except 'time_dim' +#' and 'space_dim'. #'} #'\item{$totss}{ -#' A number of the total sum of squares. +#' A numeric array of the total sum of squares. The dimensions are same as 'data' +#' except 'time_dim' and 'space_dim'. #'} #'\item{$withinss}{ -#' A vector of within-cluster sum of squares, one component per cluster. +#' A numeric array of within-cluster sum of squares, one component per cluster. +#' The first dimenion is the number of cluster, and the rest dimensions are +#' same as 'data' except 'time_dim' and 'space_dim'. #'} #'\item{$tot.withinss}{ -#' A number of the total within-cluster sum of squares, i.e., sum(withinss). +#' A numeric array of the total within-cluster sum of squares, i.e., +#' sum(withinss). The dimensions are same as 'data' except 'time_dim' and +#' 'space_dim'. #'} #'\item{$betweenss}{ -#' A number of the between-cluster sum of squares, i.e. totss-tot.withinss. +#' A numeric array of the between-cluster sum of squares, i.e. totss-tot.withinss. +#' The dimensions are same as 'data' except 'time_dim' and 'space_dim'. #'} #'\item{$size}{ -#' A vector of the number of points in each cluster. +#' A numeric array of the number of points in each cluster. The first dimenion +#' is the number of cluster, and the rest dimensions are same as 'data' except +#' 'time_dim' and 'space_dim'. #'} #'\item{$iter}{ -#' An interger as the number of (outer) iterations. +#' A numeric array of the number of (outer) iterations. The dimensions are +#' same as 'data' except 'time_dim' and 'space_dim'. #'} #'\item{$ifault}{ -#' An integer as an indicator of a possible algorithm problem. +#' A numeric array of an indicator of a possible algorithm problem. The +#' dimensions are same as 'data' except 'time_dim' and 'space_dim'. #'} #' #'@references diff --git a/man/Cluster.Rd b/man/Cluster.Rd index 0c1f59c..7ea25de 100644 --- a/man/Cluster.Rd +++ b/man/Cluster.Rd @@ -58,32 +58,45 @@ computation. The default value is NULL.} \value{ A list containing: \item{$cluster}{ - An integer vector of the occurrence of a cluster along time, i.e., when - certain data member in time is allocated to a specific cluster. + An integer array of the occurrence of a cluster along time, i.e., when + certain data member in time is allocated to a specific cluster. The dimensions + are same as 'data' without 'space_dim'. } \item{$centers}{ - A matrix of cluster centres or centroids (e.g. [1:K, 1:spatial degrees of freedom]). + A nemeric array of cluster centres or centroids (e.g. [1:K, 1:spatial degrees + of freedom]). The rest dimensions are same as 'data' except 'time_dim' + and 'space_dim'. } \item{$totss}{ - A number of the total sum of squares. + A numeric array of the total sum of squares. The dimensions are same as 'data' + except 'time_dim' and 'space_dim'. } \item{$withinss}{ - A vector of within-cluster sum of squares, one component per cluster. + A numeric array of within-cluster sum of squares, one component per cluster. + The first dimenion is the number of cluster, and the rest dimensions are + same as 'data' except 'time_dim' and 'space_dim'. } \item{$tot.withinss}{ - A number of the total within-cluster sum of squares, i.e., sum(withinss). + A numeric array of the total within-cluster sum of squares, i.e., + sum(withinss). The dimensions are same as 'data' except 'time_dim' and + 'space_dim'. } \item{$betweenss}{ - A number of the between-cluster sum of squares, i.e. totss-tot.withinss. + A numeric array of the between-cluster sum of squares, i.e. totss-tot.withinss. + The dimensions are same as 'data' except 'time_dim' and 'space_dim'. } \item{$size}{ - A vector of the number of points in each cluster. + A numeric array of the number of points in each cluster. The first dimenion + is the number of cluster, and the rest dimensions are same as 'data' except + 'time_dim' and 'space_dim'. } \item{$iter}{ - An interger as the number of (outer) iterations. + A numeric array of the number of (outer) iterations. The dimensions are + same as 'data' except 'time_dim' and 'space_dim'. } \item{$ifault}{ - An integer as an indicator of a possible algorithm problem. + A numeric array of an indicator of a possible algorithm problem. The + dimensions are same as 'data' except 'time_dim' and 'space_dim'. } } \description{ -- GitLab