From 26f765a3540e375eca14acc40b824ec31cdbaa03 Mon Sep 17 00:00:00 2001 From: Jost von Hardenberg Date: Tue, 21 Apr 2020 22:22:56 +0200 Subject: [PATCH 1/9] expand closest member --- R/CST_EnsClustering.R | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/R/CST_EnsClustering.R b/R/CST_EnsClustering.R index 62d29244..de0733b3 100644 --- a/R/CST_EnsClustering.R +++ b/R/CST_EnsClustering.R @@ -186,6 +186,19 @@ EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = NULL, lon_lim = lon_lim, lat_lim = lat_lim, variance_explained = variance_explained, numpcs = numpcs, verbose = verbose) + + cm=result$closest_member + cml <- vector(mode = "list", length = length(cluster_dim)) + cum <- cm * 0 + dim_cd <- dim(exp)[cluster_dim] + for (i in rev(seq_along(cluster_dim))) { + cml[[i]] <- floor((cm - cum) / prod(dim_cd[-i])) + cum <- cum + cml[[i]] * prod(dim_cd[-i]) + dim_cd <- dim_cd[-i] + } + names(cml) <- cluster_dim + result$closest_member <- cml + return(append(result, list(lat = lat, lon = lon))) } -- GitLab From 8e7ce8f493dacb1236e54e51713ec8b8ee6ced0a Mon Sep 17 00:00:00 2001 From: Jost von Hardenberg Date: Tue, 21 Apr 2020 23:33:21 +0200 Subject: [PATCH 2/9] automatic selection of time dim --- R/CST_EnsClustering.R | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/R/CST_EnsClustering.R b/R/CST_EnsClustering.R index de0733b3..3a539c62 100644 --- a/R/CST_EnsClustering.R +++ b/R/CST_EnsClustering.R @@ -61,7 +61,10 @@ #' @param variance_explained variance (percentage) to be explained by the set of EOFs. #' Defaults to 80. Not used if numpcs is specified. #' @param numpcs Number of EOFs retained in the analysis (optional). -#' @param cluster_dim Dimension along which to cluster. Typically "member" or "sdate". This can also be a list like c("member", "sdate"). +#' @param cluster_dim Dimension along which to cluster. Typically "member" or "sdate". +#' This can also be a list like c("member", "sdate"). +#' @param time_dim String or character array with name(s) of dimension(s) over which to compute statistics. +#' If omitted c("ftime", "sdate", "time") are searched in this order. #' @param verbose Logical for verbose output #' @return A list with elements \code{$cluster} (cluster assigned for each member), #' \code{$freq} (relative frequency of each cluster), \code{$closest_member} @@ -100,7 +103,7 @@ #'@export CST_EnsClustering <- function(exp, time_moment = "mean", numclus = NULL, lon_lim = NULL, lat_lim = NULL, - variance_explained = 80, numpcs = NULL, + variance_explained = 80, numpcs = NULL, time_dim = NULL, time_percentile = 90, cluster_dim = "member", verbose = F) { if (!inherits(exp, "s2dv_cube")) { @@ -112,7 +115,7 @@ CST_EnsClustering <- function(exp, time_moment = "mean", numclus = NULL, time_moment = time_moment, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, variance_explained = variance_explained, numpcs = numpcs, - time_percentile = time_percentile, + time_percentile = time_percentile, time_dim = time_dim, cluster_dim = cluster_dim, verbose = verbose) return(result) @@ -145,7 +148,10 @@ CST_EnsClustering <- function(exp, time_moment = "mean", numclus = NULL, #' @param variance_explained variance (percentage) to be explained by the set of EOFs. #' Defaults to 80. Not used if numpcs is specified. #' @param numpcs Number of EOFs retained in the analysis (optional). -#' @param cluster_dim Dimension along which to cluster. Typically "member" or "sdate". This can also be a list like c("member", "sdate"). +#' @param cluster_dim Dimension along which to cluster. Typically "member" or "sdate". +#' This can also be a list like c("member", "sdate"). +#' @param time_dim String or character array with name(s) of dimension(s) over which to compute statistics. +#' If omitted c("ftime", "sdate", "time") are searched in this order. #' @param verbose Logical for verbose output #' @return A list with elements \code{$cluster} (cluster assigned for each member), #' \code{$freq} (relative frequency of each cluster), \code{$closest_member} @@ -162,20 +168,40 @@ CST_EnsClustering <- function(exp, time_moment = "mean", numclus = NULL, EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = NULL, lon_lim = NULL, lat_lim = NULL, variance_explained = 80, - numpcs = NULL, time_percentile = 90, + numpcs = NULL, time_percentile = 90, time_dim = NULL, cluster_dim = "member", verbose = T) { + # Check/detect time_dim + if (is.null(time_dim)) { + time_dim_names <- c("ftime", "sdate", "time") + time_dim_num <- which(time_dim_names %in% names(dim(data))) + if (length(time_dim_num) > 0) { + # Find time dimension with length > 1 + ilong <- which(dim(data)[time_dim_names[time_dim_num]] > 1) + if (length(ilong) > 0) { + time_dim <- time_dim_names[time_dim_num[ilong[1]]] + } else { + stop("No time dimension longer than one found.") + } + } else { + stop("Could not automatically detect a target time dimension ", + "in the provided data in 'data'.") + } + .printv(paste("Selected time dim:", time_dim)) + } + # Apply time_moment if (time_moment == "mean") { .printv("Considering the time_moment: mean", verbose) - exp <- apply(data, c(1, 2, 3, 5, 6), mean) + exp <- Apply(data, target_dims = time_dim, mean)$output1 } else if (time_moment == "sd") { .printv("Considering the time_moment: sd", verbose) - exp <- apply(data, c(1, 2, 3, 5, 6), sd) + exp <- Apply(data, target_dims = time_dim, sd)$output1 } else if (time_moment == "perc") { .printv(paste0("Considering the time_moment: percentile ", sprintf("%5f", time_percentile)), verbose) - exp <- apply(data, c(1, 2, 3, 5, 6), quantile, time_percentile / 100.) + exp <- Apply(data, target_dims = time_dim, quantile, + time_percentile / 100.)$output1 } else { stop(paste0("Invalid time_moment '", time_moment, "' specified!")) } @@ -187,6 +213,7 @@ EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = NULL, variance_explained = variance_explained, numpcs = numpcs, verbose = verbose) + # Expand result$closest_member into indices in cluster_dim dimensions cm=result$closest_member cml <- vector(mode = "list", length = length(cluster_dim)) cum <- cm * 0 -- GitLab From a34660bc0a8d314f5097c63037e8a9385a1d3b1a Mon Sep 17 00:00:00 2001 From: Jost von Hardenberg Date: Wed, 22 Apr 2020 17:43:40 +0200 Subject: [PATCH 3/9] correct calculation of indices --- R/CST_EnsClustering.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/CST_EnsClustering.R b/R/CST_EnsClustering.R index 3a539c62..171f5711 100644 --- a/R/CST_EnsClustering.R +++ b/R/CST_EnsClustering.R @@ -219,8 +219,8 @@ EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = NULL, cum <- cm * 0 dim_cd <- dim(exp)[cluster_dim] for (i in rev(seq_along(cluster_dim))) { - cml[[i]] <- floor((cm - cum) / prod(dim_cd[-i])) - cum <- cum + cml[[i]] * prod(dim_cd[-i]) + cml[[i]] <- floor((cm - cum - 1) / prod(dim_cd[-i])) + 1 + cum <- cum + (cml[[i]] - 1) * prod(dim_cd[-i]) dim_cd <- dim_cd[-i] } names(cml) <- cluster_dim -- GitLab From ee4954a3307b76423b2297526731276a93306ab4 Mon Sep 17 00:00:00 2001 From: Jost von Hardenberg Date: Thu, 23 Apr 2020 14:47:54 +0200 Subject: [PATCH 4/9] verbose printing of time selection --- R/CST_EnsClustering.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/CST_EnsClustering.R b/R/CST_EnsClustering.R index 171f5711..35a4f375 100644 --- a/R/CST_EnsClustering.R +++ b/R/CST_EnsClustering.R @@ -187,7 +187,7 @@ EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = NULL, stop("Could not automatically detect a target time dimension ", "in the provided data in 'data'.") } - .printv(paste("Selected time dim:", time_dim)) + .printv(paste("Selected time dim:", time_dim), verbose) } # Apply time_moment -- GitLab From 380d384b7e9eeed1fac50ca0a445950f1e70aac6 Mon Sep 17 00:00:00 2001 From: Jost von Hardenberg Date: Thu, 23 Apr 2020 14:50:14 +0200 Subject: [PATCH 5/9] doc fix --- R/CST_EnsClustering.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/CST_EnsClustering.R b/R/CST_EnsClustering.R index 35a4f375..8a7d2c60 100644 --- a/R/CST_EnsClustering.R +++ b/R/CST_EnsClustering.R @@ -52,7 +52,7 @@ #' spatial dimensions named "lon" and "lat", and dimensions "dataset", "member", "ftime", "sdate". #' @param time_moment Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), #' 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). -#' If 'perc' the keyword 'time_percentile' is also needed. +#' If 'perc' the keyword 'time_percentile' is also used. #' @param time_percentile Set the percentile in time you want to analyse (used for `time_moment = "perc"). #' @param numclus Number of clusters (scenarios) to be calculated. #' If set to NULL the number of ensemble members divided by 10 is used, with a minimum of 2 and a maximum of 8. @@ -139,7 +139,7 @@ CST_EnsClustering <- function(exp, time_moment = "mean", numclus = NULL, #' @param lon Vector of longitudes. #' @param time_moment Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), #' 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). -#' If 'perc' the keyword 'time_percentile' is also needed. +#' If 'perc' the keyword 'time_percentile' is also used. #' @param time_percentile Set the percentile in time you want to analyse (used for `time_moment = "perc"). #' @param numclus Number of clusters (scenarios) to be calculated. #' If set to NULL the number of ensemble members divided by 10 is used, with a minimum of 2 and a maximum of 8. -- GitLab From 301d5fc246c6f9e2364cc8a0a528a927acdc3c48 Mon Sep 17 00:00:00 2001 From: Jost von Hardenberg Date: Thu, 23 Apr 2020 15:08:51 +0200 Subject: [PATCH 6/9] fix test --- tests/testthat/test-CST_EnsClustering.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test-CST_EnsClustering.R b/tests/testthat/test-CST_EnsClustering.R index 8c507db3..2e7364d9 100644 --- a/tests/testthat/test-CST_EnsClustering.R +++ b/tests/testthat/test-CST_EnsClustering.R @@ -37,7 +37,8 @@ test_that("Sanity and Functionality tests", { res <- CST_EnsClustering(exp, numclus = 3, cluster_dim = c("member", "sdate")) expect_equivalent(dim(res$cluster), dim(exp$data)[c(2, 3, 1)]) expect_equivalent(dim(res$freq), c(cluster = 3, dim(exp$data)[1])) - expect_equivalent(dim(res$closest_member), c(cluster = 3, dim(exp$data)[1])) + expect_equivalent(dim(res$closest_member$sdate), c(cluster = 3, + dim(exp$data)[1])) expect_equivalent(dim(res$repr_field), c(cluster = 3, dim(exp$data)[c(5, 6)], dim(exp$data)[1])) expect_equivalent(dim(res$composites), c(cluster = 3, @@ -48,13 +49,13 @@ test_that("Sanity and Functionality tests", { cluster_dim = "member") # The closest member of each cluster should be member of that cluster for (i in 1:3) { - expect_equivalent(res$cluster[res$closest_member[i, 1, 1], 1, 1], i) + expect_equivalent(res$cluster[res$closest_member$member[i, 1, 1], 1, 1], i) } res <- CST_EnsClustering(exp, numclus = 3, numpcs = 8, cluster_dim = c("member")) for (i in 1:3) { - expect_equivalent(res$cluster[res$closest_member[i, 1, 1], 1, 1], i) + expect_equivalent(res$cluster[res$closest_member$member[i, 1, 1], 1, 1], i) } res <- CST_EnsClustering(exp, numclus = 3, variance_explained = 80, -- GitLab From b2729c998586338d3c3eb106bb6ef31297518f7f Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Fri, 24 Apr 2020 13:57:09 +0200 Subject: [PATCH 7/9] as.vector for multiApply and quantile --- R/CST_EnsClustering.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/CST_EnsClustering.R b/R/CST_EnsClustering.R index 8a7d2c60..53bf013a 100644 --- a/R/CST_EnsClustering.R +++ b/R/CST_EnsClustering.R @@ -200,8 +200,9 @@ EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = NULL, } else if (time_moment == "perc") { .printv(paste0("Considering the time_moment: percentile ", sprintf("%5f", time_percentile)), verbose) - exp <- Apply(data, target_dims = time_dim, quantile, - time_percentile / 100.)$output1 + exp <- Apply(data, target_dims = time_dim, + function(x) {quantile(as.vector(x), + time_percentile / 100.)})$output1 } else { stop(paste0("Invalid time_moment '", time_moment, "' specified!")) } -- GitLab From f313d00fa76cc66feb0571d2f253bd9c6a5cfdca Mon Sep 17 00:00:00 2001 From: nperez Date: Wed, 3 Jun 2020 19:58:30 +0200 Subject: [PATCH 8/9] functions documentation automatically generated with devtools --- man/CST_EnsClustering.Rd | 9 +++++++-- man/EnsClustering.Rd | 9 +++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/man/CST_EnsClustering.Rd b/man/CST_EnsClustering.Rd index 154541d5..6ee79b7c 100644 --- a/man/CST_EnsClustering.Rd +++ b/man/CST_EnsClustering.Rd @@ -12,6 +12,7 @@ CST_EnsClustering( lat_lim = NULL, variance_explained = 80, numpcs = NULL, + time_dim = NULL, time_percentile = 90, cluster_dim = "member", verbose = F @@ -24,7 +25,7 @@ spatial dimensions named "lon" and "lat", and dimensions "dataset", "member", "f \item{time_moment}{Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). -If 'perc' the keyword 'time_percentile' is also needed.} +If 'perc' the keyword 'time_percentile' is also used.} \item{numclus}{Number of clusters (scenarios) to be calculated. If set to NULL the number of ensemble members divided by 10 is used, with a minimum of 2 and a maximum of 8.} @@ -38,9 +39,13 @@ Defaults to 80. Not used if numpcs is specified.} \item{numpcs}{Number of EOFs retained in the analysis (optional).} +\item{time_dim}{String or character array with name(s) of dimension(s) over which to compute statistics. +If omitted c("ftime", "sdate", "time") are searched in this order.} + \item{time_percentile}{Set the percentile in time you want to analyse (used for `time_moment = "perc").} -\item{cluster_dim}{Dimension along which to cluster. Typically "member" or "sdate". This can also be a list like c("member", "sdate").} +\item{cluster_dim}{Dimension along which to cluster. Typically "member" or "sdate". +This can also be a list like c("member", "sdate").} \item{verbose}{Logical for verbose output} } diff --git a/man/EnsClustering.Rd b/man/EnsClustering.Rd index 2fd8a3f1..30d81f87 100644 --- a/man/EnsClustering.Rd +++ b/man/EnsClustering.Rd @@ -15,6 +15,7 @@ EnsClustering( variance_explained = 80, numpcs = NULL, time_percentile = 90, + time_dim = NULL, cluster_dim = "member", verbose = T ) @@ -28,7 +29,7 @@ EnsClustering( \item{time_moment}{Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). -If 'perc' the keyword 'time_percentile' is also needed.} +If 'perc' the keyword 'time_percentile' is also used.} \item{numclus}{Number of clusters (scenarios) to be calculated. If set to NULL the number of ensemble members divided by 10 is used, with a minimum of 2 and a maximum of 8.} @@ -44,7 +45,11 @@ Defaults to 80. Not used if numpcs is specified.} \item{time_percentile}{Set the percentile in time you want to analyse (used for `time_moment = "perc").} -\item{cluster_dim}{Dimension along which to cluster. Typically "member" or "sdate". This can also be a list like c("member", "sdate").} +\item{time_dim}{String or character array with name(s) of dimension(s) over which to compute statistics. +If omitted c("ftime", "sdate", "time") are searched in this order.} + +\item{cluster_dim}{Dimension along which to cluster. Typically "member" or "sdate". +This can also be a list like c("member", "sdate").} \item{verbose}{Logical for verbose output} } -- GitLab From 2fcb83dbbc3477b3415553297f0fee61113e5ed1 Mon Sep 17 00:00:00 2001 From: nperez Date: Thu, 4 Jun 2020 10:22:51 +0200 Subject: [PATCH 9/9] Changes in EnsClustering listed in NEWS --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 4144a707..aa6de232 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,7 +8,7 @@ + MultiMetric vignette fixed typo text description + RainFARM checks 'slope' is not a vector + DESCRIPTION specifies the minimum multiApply version required - + + EnsClustering has a new parameter 'time_dim' and fixed 'closest_member' output ### CSTools 3.0.0 **Submission date to CRAN: 10-02-2020** -- GitLab