From 07de22d56c2832db72911f66b7eb22cdcb767ddc Mon Sep 17 00:00:00 2001
From: aho <an.ho@bsc.es>
Date: Fri, 13 May 2022 17:05:18 +0200
Subject: [PATCH] Fix the bug of calculating nclusters

---
 R/Cluster.R                   |  56 +++++++++++--------
 man/Cluster.Rd                |   1 +
 tests/testthat/test-Cluster.R | 101 ++++++++++++++++++++++++++++------
 3 files changed, 117 insertions(+), 41 deletions(-)

diff --git a/R/Cluster.R b/R/Cluster.R
index d6c5dff..2fac687 100644
--- a/R/Cluster.R
+++ b/R/Cluster.R
@@ -37,7 +37,7 @@
 #'  "ptbiserial", "gap", "frey", "mcclain", "gamma", "gplus", "tau", "dunn", 
 #'  "hubert", "sdindex", and "sdbw".
 #'  One can also use all of them with the option 'alllong' or almost all indices
-#   except gap, gamma, gplus and tau with 'all', when the optimal number of
+#'   except gap, gamma, gplus and tau with 'all', when the optimal number of
 #'  clusters K is detremined by the majority rule (the maximum of histogram of 
 #'  the results of all indices with finite solutions). Use of some indices on 
 #'  a big and/or unstructured dataset can be computationally intense and/or 
@@ -214,8 +214,23 @@ Cluster <- function(data, weights = NULL, time_dim = 'sdate', space_dim = NULL,
   }
 
   ###############################
-  # Calculate Cluster
+  # Compute nclusters
+  if (is.null(nclusters)) {
+    pdf(file = NULL)
+     nbclust.results <- NbClust::NbClust(data, distance = 'euclidean', 
+                                         min.nc = 2, max.nc = 20, 
+                                         method = 'kmeans', index = index)
+    dev.off()
+    if (index == 'all' || index == 'alllong') {
+      kmc  <- hist(nbclust.results$Best.nc[1, ], breaks = seq(0, 20), 
+                   plot = FALSE)$counts
+      nclusters <- which(kmc == max(kmc))
+    } else {
+      nclusters <- nbclust.results$Best.nc[1]
+    }
+  }
 
+  # Calculate Cluster
   output <- Apply(list(data),
                   target_dims = c(time_dim, space_dim),
                   fun = .Cluster,
@@ -225,7 +240,7 @@ Cluster <- function(data, weights = NULL, time_dim = 'sdate', space_dim = NULL,
   return(output)
 }
 
-.Cluster <- function(data, weights = NULL, nclusters = NULL, index = 'sdindex') {
+.Cluster <- function(data, weights = NULL, nclusters, index = 'sdindex') {
   # data: [time, (lat, lon)]
   dat_dim <- dim(data)
 
@@ -241,27 +256,22 @@ Cluster <- function(data, weights = NULL, time_dim = 'sdate', space_dim = NULL,
       data <- do.call(abind::abind, c(data_list, along = 0))
     }
   }
+  
+  kmeans.results <- kmeans(data, centers = nclusters, iter.max = 300, 
+                           nstart = 30) 
 
-  if (!is.null(nclusters)) {
-    kmeans.results <- kmeans(data, centers = nclusters, iter.max = 300, 
-                             nstart = 30) 
-  } else {
-    pdf(file = NULL)
-    nbclust.results <- NbClust::NbClust(data, distance = 'euclidean', 
-                                        min.nc = 2, max.nc = 20, 
-                                        method = 'kmeans', index = index)
-    dev.off()
-
-    if (index == 'all' || index == 'alllong') {
-      kmc  <- hist(nbclust.results$Best.nc[1, ], breaks = seq(0, 20), 
-                   plot = FALSE)$counts
-      kmc1 <- which(kmc == max(kmc))
-    } else {
-      kmc1 <- nbclust.results$Best.nc[1]
-    }
+#---------------NEW---------------
+  # Add dimension names and shape space_dim back
+  kmeans.results$cluster <- as.array(kmeans.results$cluster)
+  names(dim(kmeans.results$cluster)) <- names(dat_dim)[1]
+  kmeans.results$centers <- array(kmeans.results$centers, 
+                                  dim = c(nclusters, dat_dim[-1]))
+  names(dim(kmeans.results$centers)) <- c('K', names(dat_dim)[-1])
+  kmeans.results$withinss <- as.array(kmeans.results$withinss)
+  names(dim(kmeans.results$withinss)) <- 'K'
+  kmeans.results$size <- as.array(kmeans.results$size)
+  names(dim(kmeans.results$size)) <- 'K'
 
-    kmeans.results <- kmeans(data, centers = kmc1, iter.max = 300, 
-                             nstart = 30)
-  }
+#----------NEW_END----------------
   invisible(kmeans.results)
 }
diff --git a/man/Cluster.Rd b/man/Cluster.Rd
index 8c46b3d..1fa6a1f 100644
--- a/man/Cluster.Rd
+++ b/man/Cluster.Rd
@@ -47,6 +47,7 @@ Other indices available in NBClust are "kl", "ch", "hartigan", "ccc",
 "ptbiserial", "gap", "frey", "mcclain", "gamma", "gplus", "tau", "dunn", 
 "hubert", "sdindex", and "sdbw".
 One can also use all of them with the option 'alllong' or almost all indices
+ except gap, gamma, gplus and tau with 'all', when the optimal number of
 clusters K is detremined by the majority rule (the maximum of histogram of 
 the results of all indices with finite solutions). Use of some indices on 
 a big and/or unstructured dataset can be computationally intense and/or 
diff --git a/tests/testthat/test-Cluster.R b/tests/testthat/test-Cluster.R
index 4e0a7b1..13297d8 100644
--- a/tests/testthat/test-Cluster.R
+++ b/tests/testthat/test-Cluster.R
@@ -10,7 +10,7 @@ context("s2dv::Cluster tests")
   # dat2
   set.seed(2)
   dat2 <- array(rnorm(300),
-                dim = c(sdate = 50, lat = 2, lon = 3))
+                dim = c(dat = 1, sdate = 50, lat = 2, lon = 3))
   weights2 <- array(c(0.9, 1.1), dim = c(lat = 2, lon = 3))
 
 ##############################################
@@ -78,24 +78,76 @@ test_that("1. Input checks", {
 
 ##############################################
 test_that("2. Output checks: dat1", {
-# The output is random. Only check dimensions.
   expect_equal(
-    length(Cluster(dat1, weights1, space_dim = 'space')$cluster),
-    50
+    names(Cluster(dat1, weights1, space_dim = 'space')),
+    c("cluster", "centers", "totss", "withinss", "tot.withinss",
+      "betweenss", "size", "iter", "ifault")
   )
   expect_equal(
-    length(Cluster(dat1)$cluster),
-    100
+    dim(Cluster(dat1, weights1, space_dim = 'space')$cluster),
+    c(sdate = 50)
+  )
+  expect_equal(
+    dim(Cluster(dat1)$cluster),
+    c(sdate = 50, space = 2)
   )
   expect_equal(
     dim(Cluster(dat1, weights1, space_dim = 'space')$centers),
-    c(8, 2)
+    c(K = 7, space = 2)
   )
   expect_equal(
     dim(Cluster(dat1, weights1, nclusters = 3, space_dim = 'space')$centers),
-    c(3, 2)
+    c(K = 3, space = 2)
+  )
+  expect_equal(
+    dim(Cluster(dat1, weights1, space_dim = 'space')$withinss),
+    c(K = 7)
+  )
+  expect_equal(
+    dim(Cluster(dat1, weights1, space_dim = 'space')$size),
+    c(K = 7)
+  )
+  expect_equal(
+  as.vector(Cluster(dat1, weights1, space_dim = 'space')$cluster)[1:5],
+  c(7, 5, 7, 5, 4)
+  )
+  expect_equal(
+  as.vector(Cluster(dat1, weights1, space_dim = 'space')$centers[1:5, 2]),
+  c(-0.08553708, -1.55834000, 1.60550527, 1.79873789, -0.74919031),
+  tolerance = 0.0001
+  )
+  expect_equal(
+  as.vector(Cluster(dat1, weights1, space_dim = 'space')$totss),
+  83.08558,
+  tolerance = 0.0001
+  )
+  expect_equal(
+  as.vector(Cluster(dat1, weights1, space_dim = 'space')$withinss)[1:3],
+  c(1.8509071, 0.6242121, 0.7800697),
+  tolerance = 0.0001
+  )
+  expect_equal(
+  as.vector(Cluster(dat1, weights1, space_dim = 'space')$tot.withinss),
+  11.71979,
+  tolerance = 0.0001
+  )
+  expect_equal(
+  as.vector(Cluster(dat1, weights1, space_dim = 'space')$betweenss),
+  71.36579,
+  tolerance = 0.0001
+  )
+  expect_equal(
+  as.vector(Cluster(dat1, weights1, space_dim = 'space')$size),
+  c(4, 5, 4, 6, 10, 8, 13)
+  )
+  expect_equal(
+  as.vector(Cluster(dat1, weights1, space_dim = 'space')$iter),
+  2
+  )
+  expect_equal(
+  as.vector(Cluster(dat1, weights1, space_dim = 'space')$ifault),
+  0
   )
-
 })
 
 
@@ -103,24 +155,37 @@ test_that("2. Output checks: dat1", {
 test_that("3. Output checks: dat2", {
 
   expect_equal(
-    length(Cluster(dat2, weights2, space_dim = c('lat', 'lon'))$cluster),
-    50
+    names(Cluster(dat2, weights2, space_dim = c('lat', 'lon'))),
+    c("cluster", "centers", "totss", "withinss", "tot.withinss",
+      "betweenss", "size", "iter", "ifault")
   )
   expect_equal(
-    length(Cluster(dat2)$cluster),
-    300
+    dim(Cluster(dat2, weights2, space_dim = c('lat', 'lon'))$cluster),
+    c(sdate = 50, dat = 1)
   )
   expect_equal(
-    length(Cluster(dat2, space_dim = c('lon', 'lat'))$cluster),
-    50
+    dim(Cluster(dat2)$cluster),
+    c(sdate = 50, dat = 1, lat = 2, lon = 3)
   )
   expect_equal(
     dim(Cluster(dat2, weights2, space_dim = c('lat', 'lon'))$centers),
-    c(7, 6)
+    c(K = 6, lat = 2, lon = 3, dat = 1)
+  )
+  expect_equal(
+    dim(Cluster(dat2, weights2, nclusters = 3, space_dim = c('lat', 'lon'))$centers),
+    c(K = 3, lat = 2, lon = 3, dat = 1)
+  )
+  expect_equal(
+    dim(Cluster(dat2, weights2, space_dim = c('lat', 'lon'))$withinss),
+    c(K = 6, dat = 1)
+  )
+  expect_equal(
+    dim(Cluster(dat2, weights2, space_dim = c('lat', 'lon'))$size),
+    c(K = 6, dat = 1)
   )
   expect_equal(
-    dim(Cluster(dat2, weights2, nclusters = 5, space_dim = c('lat', 'lon'))$centers),
-    c(5, 6)
+  as.vector(Cluster(dat2, weights2, space_dim = c('lat', 'lon'))$cluster)[1:5],
+  c(5, 1, 2, 1, 5)
   )
 
 })
-- 
GitLab