diff --git a/R/Cluster.R b/R/Cluster.R index d6c5dff46659edd89f114470cd6dcfc8819cc5c7..2fac687a3aa1dc4afcde425fa170a938247f535d 100644 --- a/R/Cluster.R +++ b/R/Cluster.R @@ -37,7 +37,7 @@ #' "ptbiserial", "gap", "frey", "mcclain", "gamma", "gplus", "tau", "dunn", #' "hubert", "sdindex", and "sdbw". #' One can also use all of them with the option 'alllong' or almost all indices -# except gap, gamma, gplus and tau with 'all', when the optimal number of +#' except gap, gamma, gplus and tau with 'all', when the optimal number of #' clusters K is detremined by the majority rule (the maximum of histogram of #' the results of all indices with finite solutions). Use of some indices on #' a big and/or unstructured dataset can be computationally intense and/or @@ -214,8 +214,23 @@ Cluster <- function(data, weights = NULL, time_dim = 'sdate', space_dim = NULL, } ############################### - # Calculate Cluster + # Compute nclusters + if (is.null(nclusters)) { + pdf(file = NULL) + nbclust.results <- NbClust::NbClust(data, distance = 'euclidean', + min.nc = 2, max.nc = 20, + method = 'kmeans', index = index) + dev.off() + if (index == 'all' || index == 'alllong') { + kmc <- hist(nbclust.results$Best.nc[1, ], breaks = seq(0, 20), + plot = FALSE)$counts + nclusters <- which(kmc == max(kmc)) + } else { + nclusters <- nbclust.results$Best.nc[1] + } + } + # Calculate Cluster output <- Apply(list(data), target_dims = c(time_dim, space_dim), fun = .Cluster, @@ -225,7 +240,7 @@ Cluster <- function(data, weights = NULL, time_dim = 'sdate', space_dim = NULL, return(output) } -.Cluster <- function(data, weights = NULL, nclusters = NULL, index = 'sdindex') { +.Cluster <- function(data, weights = NULL, nclusters, index = 'sdindex') { # data: [time, (lat, lon)] dat_dim <- dim(data) @@ -241,27 +256,22 @@ Cluster <- function(data, weights = NULL, time_dim = 'sdate', space_dim = NULL, data <- do.call(abind::abind, c(data_list, along = 0)) } } + + kmeans.results <- kmeans(data, centers = nclusters, iter.max = 300, + nstart = 30) - if (!is.null(nclusters)) { - kmeans.results <- kmeans(data, centers = nclusters, iter.max = 300, - nstart = 30) - } else { - pdf(file = NULL) - nbclust.results <- NbClust::NbClust(data, distance = 'euclidean', - min.nc = 2, max.nc = 20, - method = 'kmeans', index = index) - dev.off() - - if (index == 'all' || index == 'alllong') { - kmc <- hist(nbclust.results$Best.nc[1, ], breaks = seq(0, 20), - plot = FALSE)$counts - kmc1 <- which(kmc == max(kmc)) - } else { - kmc1 <- nbclust.results$Best.nc[1] - } +#---------------NEW--------------- + # Add dimension names and shape space_dim back + kmeans.results$cluster <- as.array(kmeans.results$cluster) + names(dim(kmeans.results$cluster)) <- names(dat_dim)[1] + kmeans.results$centers <- array(kmeans.results$centers, + dim = c(nclusters, dat_dim[-1])) + names(dim(kmeans.results$centers)) <- c('K', names(dat_dim)[-1]) + kmeans.results$withinss <- as.array(kmeans.results$withinss) + names(dim(kmeans.results$withinss)) <- 'K' + kmeans.results$size <- as.array(kmeans.results$size) + names(dim(kmeans.results$size)) <- 'K' - kmeans.results <- kmeans(data, centers = kmc1, iter.max = 300, - nstart = 30) - } +#----------NEW_END---------------- invisible(kmeans.results) } diff --git a/man/Cluster.Rd b/man/Cluster.Rd index 8c46b3d7aa8cdf1ecdfb540a2af13ab991d777e4..1fa6a1ff5e5e1819d76306fb1b95a32e2beadd71 100644 --- a/man/Cluster.Rd +++ b/man/Cluster.Rd @@ -47,6 +47,7 @@ Other indices available in NBClust are "kl", "ch", "hartigan", "ccc", "ptbiserial", "gap", "frey", "mcclain", "gamma", "gplus", "tau", "dunn", "hubert", "sdindex", and "sdbw". One can also use all of them with the option 'alllong' or almost all indices + except gap, gamma, gplus and tau with 'all', when the optimal number of clusters K is detremined by the majority rule (the maximum of histogram of the results of all indices with finite solutions). Use of some indices on a big and/or unstructured dataset can be computationally intense and/or diff --git a/tests/testthat/test-Cluster.R b/tests/testthat/test-Cluster.R index 4e0a7b171d25ec559afbef8b97dafb45e87cff6e..13297d80850198518b3baba264f4dcfe900989a8 100644 --- a/tests/testthat/test-Cluster.R +++ b/tests/testthat/test-Cluster.R @@ -10,7 +10,7 @@ context("s2dv::Cluster tests") # dat2 set.seed(2) dat2 <- array(rnorm(300), - dim = c(sdate = 50, lat = 2, lon = 3)) + dim = c(dat = 1, sdate = 50, lat = 2, lon = 3)) weights2 <- array(c(0.9, 1.1), dim = c(lat = 2, lon = 3)) ############################################## @@ -78,24 +78,76 @@ test_that("1. Input checks", { ############################################## test_that("2. Output checks: dat1", { -# The output is random. Only check dimensions. expect_equal( - length(Cluster(dat1, weights1, space_dim = 'space')$cluster), - 50 + names(Cluster(dat1, weights1, space_dim = 'space')), + c("cluster", "centers", "totss", "withinss", "tot.withinss", + "betweenss", "size", "iter", "ifault") ) expect_equal( - length(Cluster(dat1)$cluster), - 100 + dim(Cluster(dat1, weights1, space_dim = 'space')$cluster), + c(sdate = 50) + ) + expect_equal( + dim(Cluster(dat1)$cluster), + c(sdate = 50, space = 2) ) expect_equal( dim(Cluster(dat1, weights1, space_dim = 'space')$centers), - c(8, 2) + c(K = 7, space = 2) ) expect_equal( dim(Cluster(dat1, weights1, nclusters = 3, space_dim = 'space')$centers), - c(3, 2) + c(K = 3, space = 2) + ) + expect_equal( + dim(Cluster(dat1, weights1, space_dim = 'space')$withinss), + c(K = 7) + ) + expect_equal( + dim(Cluster(dat1, weights1, space_dim = 'space')$size), + c(K = 7) + ) + expect_equal( + as.vector(Cluster(dat1, weights1, space_dim = 'space')$cluster)[1:5], + c(7, 5, 7, 5, 4) + ) + expect_equal( + as.vector(Cluster(dat1, weights1, space_dim = 'space')$centers[1:5, 2]), + c(-0.08553708, -1.55834000, 1.60550527, 1.79873789, -0.74919031), + tolerance = 0.0001 + ) + expect_equal( + as.vector(Cluster(dat1, weights1, space_dim = 'space')$totss), + 83.08558, + tolerance = 0.0001 + ) + expect_equal( + as.vector(Cluster(dat1, weights1, space_dim = 'space')$withinss)[1:3], + c(1.8509071, 0.6242121, 0.7800697), + tolerance = 0.0001 + ) + expect_equal( + as.vector(Cluster(dat1, weights1, space_dim = 'space')$tot.withinss), + 11.71979, + tolerance = 0.0001 + ) + expect_equal( + as.vector(Cluster(dat1, weights1, space_dim = 'space')$betweenss), + 71.36579, + tolerance = 0.0001 + ) + expect_equal( + as.vector(Cluster(dat1, weights1, space_dim = 'space')$size), + c(4, 5, 4, 6, 10, 8, 13) + ) + expect_equal( + as.vector(Cluster(dat1, weights1, space_dim = 'space')$iter), + 2 + ) + expect_equal( + as.vector(Cluster(dat1, weights1, space_dim = 'space')$ifault), + 0 ) - }) @@ -103,24 +155,37 @@ test_that("2. Output checks: dat1", { test_that("3. Output checks: dat2", { expect_equal( - length(Cluster(dat2, weights2, space_dim = c('lat', 'lon'))$cluster), - 50 + names(Cluster(dat2, weights2, space_dim = c('lat', 'lon'))), + c("cluster", "centers", "totss", "withinss", "tot.withinss", + "betweenss", "size", "iter", "ifault") ) expect_equal( - length(Cluster(dat2)$cluster), - 300 + dim(Cluster(dat2, weights2, space_dim = c('lat', 'lon'))$cluster), + c(sdate = 50, dat = 1) ) expect_equal( - length(Cluster(dat2, space_dim = c('lon', 'lat'))$cluster), - 50 + dim(Cluster(dat2)$cluster), + c(sdate = 50, dat = 1, lat = 2, lon = 3) ) expect_equal( dim(Cluster(dat2, weights2, space_dim = c('lat', 'lon'))$centers), - c(7, 6) + c(K = 6, lat = 2, lon = 3, dat = 1) + ) + expect_equal( + dim(Cluster(dat2, weights2, nclusters = 3, space_dim = c('lat', 'lon'))$centers), + c(K = 3, lat = 2, lon = 3, dat = 1) + ) + expect_equal( + dim(Cluster(dat2, weights2, space_dim = c('lat', 'lon'))$withinss), + c(K = 6, dat = 1) + ) + expect_equal( + dim(Cluster(dat2, weights2, space_dim = c('lat', 'lon'))$size), + c(K = 6, dat = 1) ) expect_equal( - dim(Cluster(dat2, weights2, nclusters = 5, space_dim = c('lat', 'lon'))$centers), - c(5, 6) + as.vector(Cluster(dat2, weights2, space_dim = c('lat', 'lon'))$cluster)[1:5], + c(5, 1, 2, 1, 5) ) })