diff --git a/NAMESPACE b/NAMESPACE index 4ec2c8fc079e816e697b78560e44ed6ad8af31e5..22fc489a331107c86b096ad9192a93c5ea3d2003 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,7 @@ export(CST_BEI_Weighting) export(CST_BiasCorrection) export(CST_Calibration) export(CST_CategoricalEnsCombination) +export(CST_EnsClustering) export(CST_Load) export(CST_MultiEOF) export(CST_MultiMetric) @@ -17,6 +18,7 @@ export(CST_RFSlope) export(CST_RFWeights) export(CST_RainFARM) export(CST_SaveExp) +export(EnsClustering) export(MultiEOF) export(PlotCombinedMap) export(PlotForecastPDF) diff --git a/R/CST_EnsClustering.R b/R/CST_EnsClustering.R new file mode 100644 index 0000000000000000000000000000000000000000..83b38ceb2a43e93d079efb82377d587d424115d5 --- /dev/null +++ b/R/CST_EnsClustering.R @@ -0,0 +1,420 @@ +#' @rdname CST_EnsClustering +#' @title Ensemble clustering +#' +#' @author Federico Fabiano - ISAC-CNR, \email{f.fabiano@isac.cnr.it} +#' @author Ignazio Giuntoli - ISAC-CNR, \email{i.giuntoli@isac.cnr.it} +#' @author Danila Volpi - ISAC-CNR, \email{d.volpi@isac.cnr.it} +#' @author Paolo Davini - ISAC-CNR, \email{p.davini@isac.cnr.it} +#' @author Jost von Hardenberg - ISAC-CNR, \email{j.vonhardenberg@isac.cnr.it} +#' +#' @description This function performs a clustering on members/starting dates +#' and returns a number of scenarios, with representative members for each of them. +#' The clustering is performed in a reduced EOF space. +#' +#' Motivation: +#' Ensemble forecasts give a probabilistic insight of average weather conditions +#' on extended timescales, i.e. from sub-seasonal to seasonal and beyond. +#' With large ensembles, it is often an advantage to be able to group members +#' according to similar characteristics and to select the most representative member for each cluster. +#' This can be useful to characterize the most probable forecast scenarios in a multi-model +#' (or single model) ensemble prediction. 
This approach, applied at a regional level, +#' can also be used to identify the subset of ensemble members that best represent the +#' full range of possible solutions for downscaling applications. +#' The choice of the ensemble members is made flexible in order to meet the requirements +#' of specific (regional) climate information products, to be tailored for different regions and user needs. +#' +#' Description of the tool: +#' EnsClustering is a cluster analysis tool, based on the k-means algorithm, for ensemble predictions. +#' The aim is to group ensemble members according to similar characteristics and +#' to select the most representative member for each cluster. +#' The user chooses which feature of the data is used to group the ensemble members by clustering: +#' time mean, maximum, a certain percentile (e.g., 75% as in the examples below), +#' standard deviation and trend over the time period. For each ensemble member this value +#' is computed at each grid point, obtaining N lat-lon maps, where N is the number of ensemble members. +#' The anomaly is computed subtracting the ensemble mean of these maps to each of the single maps. +#' The anomaly is therefore computed with respect to the ensemble members (and not with respect to the time) +#' and the Empirical Orthogonal Function (EOF) analysis is applied to these anomaly maps. +#' Regarding the EOF analysis, the user can choose either how many Principal Components (PCs) +#' to retain or the percentage of explained variance to keep. After reducing dimensionality via +#' EOF analysis, k-means analysis is applied using the desired subset of PCs. +#' +#' The major final outputs are the classification in clusters, i.e. which member belongs +#' to which cluster (in k-means analysis the number k of clusters needs to be defined +#' prior to the analysis) and the most representative member for each cluster, +#' which is the closest member to the cluster centroid. 
+#' Other outputs refer to the statistics of clustering: in the PC space, the minimum and +#' the maximum distance between a member in a cluster and the cluster centroid +#' (i.e. the closest and the furthest member), the intra-cluster standard +#' deviation for each cluster (i.e. how much the cluster is compact). +#' +#' @param exp An object of the class 's2dv_cube', containing the variables to be analysed. +#' Each data object in the list is expected to have an element named \code{$data} with at least two +#' spatial dimensions named "lon" and "lat", and dimensions "dataset", "member", "ftime", "sdate". +#' @param time_moment Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), +#' 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). +#' If 'perc' the keyword 'time_percentile' is also needed. +#' @param time_percentile Set the percentile in time you want to analyse (used for `time_moment = "perc"). +#' @param numclus Number of clusters (scenarios) to be calculated. +#' If set to NULL the number of ensemble members divided by 10 is used, with a minimum of 2 and a maximum of 8. +#' @param lon_lim List with the two longitude margins in `c(-180,180)` format. +#' @param lat_lim List with the two latitude margins. +#' @param variance_explained variance (percentage) to be explained by the set of EOFs. +#' Defaults to 80. Not used if numpcs is specified. +#' @param numpcs Number of EOFs retained in the analysis (optional). +#' @param cluster_dim Dimension along which to cluster. Typically "member" or "sdate". This can also be a list like c("member", "sdate"). 
+#' @param verbose Logical for verbose output +#' @return A list with elements \code{$cluster} (cluster assigned for each member), +#' \code{$freq} (relative frequency of each cluster), \code{$closest_member} +#' (representative member for each cluster), \code{$repr_field} (list of fields +#' for each representative member), \code{composites} (list of mean fields for each cluster), +#' \code{$lon} (selected longitudes of output fields), +#' \code{$lat} (selected longitudes of output fields). +#' @examples +#' exp <- lonlat_data$exp +#' # Example 1: Cluster on all start dates, members and models +#' res <- CST_EnsClustering(exp, numclus = 3, +#' cluster_dim = c("member", "dataset", "sdate")) +#' iclus = res$cluster[2, 1, 3] +#'\donttest{ +#' print(paste("Cluster of 2. member, 1. dataset, 3. sdate:", iclus)) +#' print(paste("Frequency (numerosity) of cluster (", iclus, ") :", res$freq[iclus])) +#' library(s2dverification) +#' PlotEquiMap(res$repr_field[iclus, , ], exp$lon, exp$lat, +#' filled.continents = FALSE, +#' toptitle = paste("Representative field of cluster", iclus)) +#'} +#' # Example 2: Cluster on members retaining 4 EOFs during +#' # preliminary dimensional reduction +#' res <- CST_EnsClustering(exp, numclus = 3, numpcs = 4, cluster_dim = "member") +#' # Example 3: Cluster on members, retain 80% of variance during +#' # preliminary dimensional reduction +#' res <- CST_EnsClustering(exp, numclus = 3, variance_explained = 80, +#' cluster_dim = "member") +#' # Example 4: Compute percentile in time +#' res <- CST_EnsClustering(exp, numclus = 3, time_percentile = 90, +#' time_moment = "perc", cluster_dim = "member") +#'@export +CST_EnsClustering <- function(exp, time_moment = "mean", numclus = NULL, + lon_lim = NULL, lat_lim = NULL, + variance_explained = 80, numpcs = NULL, + time_percentile = 90, cluster_dim = "member", + verbose = F) { + if (!inherits(exp, "s2dv_cube")) { + stop("Parameter 'exp' must be of the class 's2dv_cube', ", + "as output by 
CSTools::CST_Load.") + } + + result <- EnsClustering(exp$data, exp$lat, exp$lon, + time_moment = time_moment, numclus = numclus, + lon_lim = lon_lim, lat_lim = lat_lim, + variance_explained = variance_explained, numpcs = numpcs, + time_percentile = time_percentile, + cluster_dim = cluster_dim, verbose = verbose) + + return(result) +} + +#' @rdname EnsClustering +#' @title Ensemble clustering +#' +#' @author Federico Fabiano - ISAC-CNR, \email{f.fabiano@isac.cnr.it} +#' @author Ignazio Giuntoli - ISAC-CNR, \email{i.giuntoli@isac.cnr.it} +#' @author Danila Volpi - ISAC-CNR, \email{d.volpi@isac.cnr.it} +#' @author Paolo Davini - ISAC-CNR, \email{p.davini@isac.cnr.it} +#' @author Jost von Hardenberg - ISAC-CNR, \email{j.vonhardenberg@isac.cnr.it} +#' +#' @description This function performs a clustering on members/starting dates +#' and returns a number of scenarios, with representative members for each of them. +#' The clustering is performed in a reduced EOF space. +#' +#' @param data A matrix of dimensions 'dataset member sdate ftime lat lon' containing the variables to be analysed. +#' @param lat Vector of latitudes. +#' @param lon Vector of longitudes. +#' @param time_moment Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), +#' 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). +#' If 'perc' the keyword 'time_percentile' is also needed. +#' @param time_percentile Set the percentile in time you want to analyse (used for `time_moment = "perc"). +#' @param numclus Number of clusters (scenarios) to be calculated. +#' If set to NULL the number of ensemble members divided by 10 is used, with a minimum of 2 and a maximum of 8. +#' @param lon_lim List with the two longitude margins in `c(-180,180)` format. +#' @param lat_lim List with the two latitude margins. +#' @param variance_explained variance (percentage) to be explained by the set of EOFs. +#' Defaults to 80. Not used if numpcs is specified. 
+#' @param numpcs Number of EOFs retained in the analysis (optional). +#' @param cluster_dim Dimension along which to cluster. Typically "member" or "sdate". This can also be a list like c("member", "sdate"). +#' @param verbose Logical for verbose output +#' @return A list with elements \code{$cluster} (cluster assigned for each member), +#' \code{$freq} (relative frequency of each cluster), \code{$closest_member} +#' (representative member for each cluster), \code{$repr_field} (list of fields +#' for each representative member), \code{composites} (list of mean fields for each cluster), +#' \code{$lon} (selected longitudes of output fields), +#' \code{$lat} (selected longitudes of output fields). +#' +#' @examples +#' exp <- lonlat_data$exp +#' res <- EnsClustering(exp$data, exp$lat, exp$lon, numclus = 3, +#' cluster_dim = c("member", "dataset", "sdate")) +#'@export + +EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = NULL, + lon_lim = NULL, lat_lim = NULL, variance_explained = 80, + numpcs = NULL, time_percentile = 90, + cluster_dim = "member", verbose = T) { + + # Apply time_moment + if (time_moment == "mean") { + .printv("Considering the time_moment: mean", verbose) + exp <- apply(data, c(1, 2, 3, 5, 6), mean) + } else if (time_moment == "sd") { + .printv("Considering the time_moment: sd", verbose) + exp <- apply(data, c(1, 2, 3, 5, 6), sd) + } else if (time_moment == "perc") { + .printv(paste0("Considering the time_moment: percentile ", + sprintf("%5f", time_percentile)), verbose) + exp <- apply(data, c(1, 2, 3, 5, 6), quantile, time_percentile / 100.) 
+ } else { + stop(paste0("Invalid time_moment '", time_moment, "' specified!")) + } + + # Repeatedly apply .ensclus + result <- Apply(exp, target_dims = c(cluster_dim, "lat", "lon"), .ensclus, + lat, lon, numclus = numclus, + lon_lim = lon_lim, lat_lim = lat_lim, + variance_explained = variance_explained, + numpcs = numpcs, verbose = verbose) + return(append(result, list(lat = lat, lon = lon))) +} + +# Atomic ensclus function +.ensclus <- function(var_ens, lat, lon, numclus = NULL, lon_lim = NULL, + lat_lim = NULL, variance_explained = 80, numpcs = NULL, + verbose = T) { + # Check if more than one dimension has been passed for clustering + sampledims <- NULL + if (length(dim(var_ens)) > 3) { + sampledims <- head(dim(var_ens), -2) + dim(var_ens) <- c(samples = prod(sampledims), + tail(dim(var_ens), 2)) + } + if (length(lat) != dim(var_ens)[2]) { + stop("Incorrect lat length") + } + if (length(lon) != dim(var_ens)[3]) { + stop("Incorrect lon length") + } + n_ens <- dim(var_ens)[1] + if (is.null(numclus)) { + numclus <- min(max(floor(n_ens / 10), 2), 8) + } + .printv(paste("Number of clusters:", numclus), verbose) + + .printv("Calculating ensemble anomalies...", verbose) + ens_mean <- apply(var_ens, c(2, 3), mean) + var_anom <- array(dim = dim(var_ens)) + for (k in seq(1, n_ens)) { + var_anom[k, , ] <- var_ens[k, , ] - ens_mean + } + + # reshaping to give the right input to regimes function + var_anom <- aperm(var_anom, c(3, 2, 1)) + clusters <- .regimes(lon, lat, var_anom, ncluster = numclus, ntime = 1000, + neof = numpcs, lon_lim, lat_lim, + perc = variance_explained, + max_eofs = n_ens - 1, verbose = verbose) + + clus_labels <- as.array(clusters$cluster) + names(dim(clus_labels))[1] <- names(dim(var_ens))[1] + if (!is.null(sampledims)) { + dim(clus_labels) <- c(sampledims, dim(clus_labels)[-1]) + } + frequencies <- as.array(clusters$frequencies) + names(dim(frequencies))[1] <- "cluster" + + clus_centers <- clusters$clus_centers + + closest_member <- array(dim = 
numclus) + dist_closest_member <- array(dim = numclus) + for (iclu in seq(1, numclus)) { + this_clus_labels <- which(clus_labels == iclu) + if (length(this_clus_labels) > 1) { + dist_arr <- apply(clusters$pcs[clus_labels == iclu, ], 1, + .dist_from_center, center = clus_centers[iclu, ]) + .printv(paste0("distance from cluster ", iclu, " center:"), verbose) + .printv(this_clus_labels, verbose) + .printv(dist_arr, verbose) + closest_member[iclu] <- this_clus_labels[which.min(dist_arr)] + dist_closest_member[iclu] <- min(dist_arr) + .printv(paste0("closest member to cluster ", iclu, " center is: ", + closest_member[iclu]), verbose) + } else { + .printv(paste0("distance from cluster ", iclu, " center:"), verbose) + dista <- .dist_from_center(clusters$pcs[clus_labels == iclu, ], + center = clus_centers[iclu, ]) + .printv(this_clus_labels, verbose) + # Fix: this branch computes 'dista' (single-member cluster); 'dist_arr' is + # undefined on this path and would error under verbose = TRUE. + .printv(dista, verbose) + closest_member[iclu] <- this_clus_labels + dist_closest_member[iclu] <- dista + } + } + .printv("EnsClustering completed...", verbose) + + names(dim(closest_member))[1] <- "cluster" + repr_field <- var_ens[closest_member, , ] + names(dim(repr_field))[2:3] <- names(dim(var_ens))[2:3] + names(dim(repr_field))[1] <- "cluster" + # Bring back to original order lat lon + composites <- aperm(clusters$regimes, c(1, 3, 2)) + names(dim(composites))[2:3] <- names(dim(var_ens))[2:3] + names(dim(composites))[1] <- "cluster" + out <- list(cluster = clus_labels, freq = frequencies, + closest_member = closest_member, repr_field = repr_field, + composites = composites) + return(out) + } + + .dist_from_center <- function(y, center = NULL) { + dist <- sqrt(sum((y - center)^2)) + return(dist) + } + + .eofs <- function(lon, lat, field, neof = 4, xlim = NULL, ylim = NULL, + method = "SVD", do_standardize = F, do_regression = F, + verbose = T) { + # R tool for computing EOFs based on + # Singular Value Decomposition ("SVD", default) + # or with the eigenvectors of the covariance matrix ("covariance", slower) + # If
requested, computes linear regressions and standardizes the PCs + # If you want to use the regressions, remember to standardize the PCs + # Take as input a 3D anomaly field. + # Requires "personal" functions area.weight, standardize + + if (exists(".lm.fit")) { + lin.fit <- .lm.fit + } else { + lin.fit <- lm.fit + } + + # area weighting, based on the root of cosine + .printv("Area Weighting...", verbose) + ww <- .area.weight(lon, lat, root = T) + wwfield <- sweep(field, c(1, 2), ww, "*") + + idx <- .selbox(lon, lat, xlim, ylim) + slon <- lon[idx$ilon] + slat <- lat[idx$ilat] + wwfield <- wwfield[idx$ilon, idx$ilat, ] + + # transform 3D field in a matrix + wwfield <- array(wwfield, dim = c(dim(wwfield)[1] * dim(wwfield)[2], + dim(wwfield)[3])) + + # calling SVD + if (method == "SVD") { + .printv("Calling SVD...", verbose) + SVD <- svd(wwfield, nu = neof, nv = neof) + + # extracting EOFs (loading pattern), expansions coefficient + # and variance explained + # Fix: 'box' is undefined here; the loading patterns live on the + # selected subgrid, i.e. length(slon) x length(slat). + pattern <- array(SVD$u, dim = c(length(slon), length(slat), neof)) + coefficient <- SVD$v + variance <- (SVD$d[1:neof])^2 / sum((SVD$d)^2) + if (do_standardize) { + coefficient <- apply(coefficient, c(2), .standardize) + } else { + coefficient <- sweep(coefficient, c(2), sqrt(variance), "*") + } + } + + # calling covariance matrix + if (method == "covariance") { + .printv("Calling eigenvectors of the covariance matrix...", verbose) + covma <- cov(t(wwfield)) + eig <- eigen(covma) + # Fix: use the documented component name 'vectors' (the original + # 'eig$vector' only worked through fragile $ partial matching). + coef <- (t(wwfield) %*% eig$vectors)[, 1:neof] + # Fix: 'box' is undefined; eigenvectors span the subgrid space of + # size dim(covma)[1] = length(slon) * length(slat). + pattern <- array(eig$vectors, dim = c(length(slon), length(slat), + dim(covma)[1]))[, , 1:neof] + variance <- eig$values[1:neof] / sum(eig$values) + if (do_standardize) { + coefficient <- apply(coef, c(2), .standardize) + } else { + coefficient <- coef + } + } + + # linear regressions on anomalies + regression <- NULL + if (do_regression) { + .printv("Linear Regressions (it can take a while)... 
", verbose) + regression <- array(NA, dim = c(length(lon), length(lat), neof)) + for (i in 1:neof) { + regression[, , i] <- apply(field, c(1, 2), + function(x) lin.fit(as.matrix(coefficient[, i], + ncol = 1), x)$coefficients) + } + } + + # preparing output + .printv("Finalize eofs...", verbose) + pattern <- list(x = slon, y = slat, z = pattern) + out <- list(pattern = pattern, coeff = coefficient, variance = variance, + regression = regression) + return(out) +} + + +.regimes <- function(lon, lat, field, ncluster = 4, ntime = 1000, neof = 10, + xlim, ylim, alg = "Hartigan-Wong", + perc = NULL, max_eofs = 50, verbose = T) { + # R tool to compute cluster analysis based on k-means. + # Requires "personal" function eofs + # Take as input a 3D anomaly field + + # Reduce the phase space with EOFs: use SVD and do not standardize PCs + .printv("Launching EOFs...", verbose) + t0 <- proc.time() + if (is.null(neof)) { + reducedspace <- .eofs(lon, lat, field, neof = max_eofs, xlim = xlim, + ylim = ylim, method = "SVD", do_regression = F, + do_standardize = F, verbose = verbose) + neof <- which(cumsum(reducedspace$variance) > perc / 100.)[1] + .printv(paste("Number of EOFs needed for var:", neof), verbose) + PC <- reducedspace$coeff[,1:neof] + } else { + reducedspace <- .eofs(lon, lat, field, neof = neof, xlim = xlim, + ylim = ylim, method = "SVD", do_regression = F, + do_standardize = F, verbose = verbose) + PC <- reducedspace$coeff + } + t1 <- proc.time() - t0 + + # k-means computation repeat for ntime to find best solution. 
+ .printv("Computing k-means...", verbose) + t0 <- proc.time() + regimes <- kmeans(PC, as.numeric(ncluster), nstart = ntime, + iter.max = 1000, algorithm = alg) + t1 <- proc.time() - t0 + + # Extract regimes frequency and timeseries of occupation + cluster <- regimes$cluster + frequencies <- regimes$size / dim(field)[3] * 100 + .printv("Cluster frequencies:", verbose) + .printv(frequencies[order(frequencies, decreasing = T)], verbose) + + .printv("Creating Composites...", verbose) + compose <- apply(field, c(1, 2), by, cluster, mean) + + # sorting from the more frequent to the less frequent + kk <- order(frequencies, decreasing = T) + cluster <- cluster + 100 + for (ss in 1:ncluster) { + cluster[cluster == (ss + 100)] <- which(kk == ss) + } + + # prepare output + .printv("Finalize regimes...", verbose) + out <- list(cluster = cluster, frequencies = frequencies[kk], + regimes = compose[kk, , ], pcs = PC, + clus_centers = regimes$centers[kk, ], + tot.withinss = regimes$tot.withinss) + return(out) +} diff --git a/man/CST_EnsClustering.Rd b/man/CST_EnsClustering.Rd new file mode 100644 index 0000000000000000000000000000000000000000..fe8641b0e699d765b79fe02fd867d29e2dabf976 --- /dev/null +++ b/man/CST_EnsClustering.Rd @@ -0,0 +1,124 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/CST_EnsClustering.R +\name{CST_EnsClustering} +\alias{CST_EnsClustering} +\title{Ensemble clustering} +\usage{ +CST_EnsClustering(exp, time_moment = "mean", numclus = NULL, + lon_lim = NULL, lat_lim = NULL, variance_explained = 80, + numpcs = NULL, time_percentile = 90, cluster_dim = "member", + verbose = F) +} +\arguments{ +\item{exp}{An object of the class 's2dv_cube', containing the variables to be analysed. 
+Each data object in the list is expected to have an element named \code{$data} with at least two +spatial dimensions named "lon" and "lat", and dimensions "dataset", "member", "ftime", "sdate".} + +\item{time_moment}{Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), +'sd' (standard deviation along time) or 'perc' (a selected percentile on time). +If 'perc' the keyword 'time_percentile' is also needed.} + +\item{numclus}{Number of clusters (scenarios) to be calculated. +If set to NULL the number of ensemble members divided by 10 is used, with a minimum of 2 and a maximum of 8.} + +\item{lon_lim}{List with the two longitude margins in `c(-180,180)` format.} + +\item{lat_lim}{List with the two latitude margins.} + +\item{variance_explained}{variance (percentage) to be explained by the set of EOFs. +Defaults to 80. Not used if numpcs is specified.} + +\item{numpcs}{Number of EOFs retained in the analysis (optional).} + +\item{time_percentile}{Set the percentile in time you want to analyse (used for `time_moment = "perc"`).} + +\item{cluster_dim}{Dimension along which to cluster. Typically "member" or "sdate". This can also be a list like c("member", "sdate").} + +\item{verbose}{Logical for verbose output} +} +\value{ +A list with elements \code{$cluster} (cluster assigned for each member), + \code{$freq} (relative frequency of each cluster), \code{$closest_member} + (representative member for each cluster), \code{$repr_field} (list of fields + for each representative member), \code{composites} (list of mean fields for each cluster), + \code{$lon} (selected longitudes of output fields), + \code{$lat} (selected latitudes of output fields). +} +\description{ +This function performs a clustering on members/starting dates +and returns a number of scenarios, with representative members for each of them. +The clustering is performed in a reduced EOF space. 
+ +Motivation: +Ensemble forecasts give a probabilistic insight of average weather conditions +on extended timescales, i.e. from sub-seasonal to seasonal and beyond. +With large ensembles, it is often an advantage to be able to group members +according to similar characteristics and to select the most representative member for each cluster. +This can be useful to characterize the most probable forecast scenarios in a multi-model +(or single model) ensemble prediction. This approach, applied at a regional level, +can also be used to identify the subset of ensemble members that best represent the +full range of possible solutions for downscaling applications. +The choice of the ensemble members is made flexible in order to meet the requirements +of specific (regional) climate information products, to be tailored for different regions and user needs. + +Description of the tool: +EnsClustering is a cluster analysis tool, based on the k-means algorithm, for ensemble predictions. +The aim is to group ensemble members according to similar characteristics and +to select the most representative member for each cluster. +The user chooses which feature of the data is used to group the ensemble members by clustering: +time mean, maximum, a certain percentile (e.g., 75% as in the examples below), +standard deviation and trend over the time period. For each ensemble member this value +is computed at each grid point, obtaining N lat-lon maps, where N is the number of ensemble members. +The anomaly is computed subtracting the ensemble mean of these maps to each of the single maps. +The anomaly is therefore computed with respect to the ensemble members (and not with respect to the time) +and the Empirical Orthogonal Function (EOF) analysis is applied to these anomaly maps. +Regarding the EOF analysis, the user can choose either how many Principal Components (PCs) +to retain or the percentage of explained variance to keep. 
After reducing dimensionality via +EOF analysis, k-means analysis is applied using the desired subset of PCs. + +The major final outputs are the classification in clusters, i.e. which member belongs +to which cluster (in k-means analysis the number k of clusters needs to be defined +prior to the analysis) and the most representative member for each cluster, +which is the closest member to the cluster centroid. +Other outputs refer to the statistics of clustering: in the PC space, the minimum and +the maximum distance between a member in a cluster and the cluster centroid +(i.e. the closest and the furthest member), the intra-cluster standard +deviation for each cluster (i.e. how much the cluster is compact). +} +\examples{ +exp <- lonlat_data$exp +# Example 1: Cluster on all start dates, members and models +res <- CST_EnsClustering(exp, numclus = 3, + cluster_dim = c("member", "dataset", "sdate")) +iclus = res$cluster[2, 1, 3] +\donttest{ +print(paste("Cluster of 2. member, 1. dataset, 3. 
sdate:", iclus)) +print(paste("Frequency (numerosity) of cluster (", iclus, ") :", res$freq[iclus])) +library(s2dverification) +PlotEquiMap(res$repr_field[iclus, , ], exp$lon, exp$lat, + filled.continents = FALSE, + toptitle = paste("Representative field of cluster", iclus)) +} +# Example 2: Cluster on members retaining 4 EOFs during +# preliminary dimensional reduction +res <- CST_EnsClustering(exp, numclus = 3, numpcs = 4, cluster_dim = "member") +# Example 3: Cluster on members, retain 80\% of variance during +# preliminary dimensional reduction +res <- CST_EnsClustering(exp, numclus = 3, variance_explained = 80, + cluster_dim = "member") +# Example 4: Compute percentile in time +res <- CST_EnsClustering(exp, numclus = 3, time_percentile = 90, + time_moment = "perc", cluster_dim = "member") +} +\author{ +Federico Fabiano - ISAC-CNR, \email{f.fabiano@isac.cnr.it} + +Ignazio Giuntoli - ISAC-CNR, \email{i.giuntoli@isac.cnr.it} + +Danila Volpi - ISAC-CNR, \email{d.volpi@isac.cnr.it} + +Paolo Davini - ISAC-CNR, \email{p.davini@isac.cnr.it} + +Jost von Hardenberg - ISAC-CNR, \email{j.vonhardenberg@isac.cnr.it} +} + diff --git a/man/EnsClustering.Rd b/man/EnsClustering.Rd new file mode 100644 index 0000000000000000000000000000000000000000..27aca45312679c56cb32533c1fde0fae4064fc24 --- /dev/null +++ b/man/EnsClustering.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/CST_EnsClustering.R +\name{EnsClustering} +\alias{EnsClustering} +\title{Ensemble clustering} +\usage{ +EnsClustering(data, lat, lon, time_moment = "mean", numclus = NULL, + lon_lim = NULL, lat_lim = NULL, variance_explained = 80, + numpcs = NULL, time_percentile = 90, cluster_dim = "member", + verbose = T) +} +\arguments{ +\item{data}{A matrix of dimensions 'dataset member sdate ftime lat lon' containing the variables to be analysed.} + +\item{lat}{Vector of latitudes.} + +\item{lon}{Vector of longitudes.} + +\item{time_moment}{Decides the moment to be 
applied to the time dimension. Can be either 'mean' (time mean), +'sd' (standard deviation along time) or 'perc' (a selected percentile on time). +If 'perc' the keyword 'time_percentile' is also needed.} + +\item{numclus}{Number of clusters (scenarios) to be calculated. +If set to NULL the number of ensemble members divided by 10 is used, with a minimum of 2 and a maximum of 8.} + +\item{lon_lim}{List with the two longitude margins in `c(-180,180)` format.} + +\item{lat_lim}{List with the two latitude margins.} + +\item{variance_explained}{variance (percentage) to be explained by the set of EOFs. +Defaults to 80. Not used if numpcs is specified.} + +\item{numpcs}{Number of EOFs retained in the analysis (optional).} + +\item{time_percentile}{Set the percentile in time you want to analyse (used for `time_moment = "perc"`).} + +\item{cluster_dim}{Dimension along which to cluster. Typically "member" or "sdate". This can also be a list like c("member", "sdate").} + +\item{verbose}{Logical for verbose output} +} +\value{ +A list with elements \code{$cluster} (cluster assigned for each member), + \code{$freq} (relative frequency of each cluster), \code{$closest_member} + (representative member for each cluster), \code{$repr_field} (list of fields + for each representative member), \code{composites} (list of mean fields for each cluster), + \code{$lon} (selected longitudes of output fields), + \code{$lat} (selected latitudes of output fields). +} +\description{ +This function performs a clustering on members/starting dates +and returns a number of scenarios, with representative members for each of them. +The clustering is performed in a reduced EOF space. 
+} +\examples{ +exp <- lonlat_data$exp +res <- EnsClustering(exp$data, exp$lat, exp$lon, numclus = 3, + cluster_dim = c("member", "dataset", "sdate")) +} +\author{ +Federico Fabiano - ISAC-CNR, \email{f.fabiano@isac.cnr.it} + +Ignazio Giuntoli - ISAC-CNR, \email{i.giuntoli@isac.cnr.it} + +Danila Volpi - ISAC-CNR, \email{d.volpi@isac.cnr.it} + +Paolo Davini - ISAC-CNR, \email{p.davini@isac.cnr.it} + +Jost von Hardenberg - ISAC-CNR, \email{j.vonhardenberg@isac.cnr.it} +} + diff --git a/tests/testthat/test-CST_EnsClustering.R b/tests/testthat/test-CST_EnsClustering.R new file mode 100644 index 0000000000000000000000000000000000000000..8c507db3ae0c86992fc51e4d476d5c6f583b2bfd --- /dev/null +++ b/tests/testthat/test-CST_EnsClustering.R @@ -0,0 +1,73 @@ +context("Generic tests") +test_that("Sanity and Functionality tests", { + data <- rnorm(2 * 15 * 4 * 5 * 6 * 7) + dim(data) <- c(dataset = 2, member = 15, + sdate = 4, ftime = 5, lat = 6, lon = 7) + lon <- seq(0, 12, 2) + lat <- seq(10, 15, 1) + exp <- list(data = data, lat = lat, lon = lon) + + # Check error messages + expect_error( + CST_EnsClustering(exp), + "Parameter 'exp' must be of the class 's2dv_cube'" + ) + attr(exp, "class") <- "s2dv_cube" + + expect_error( + CST_EnsClustering(exp, time_moment = "invalid"), + "Invalid time_moment" + ) + + exp$lat <- 1 + expect_error(CST_EnsClustering(exp), "Incorrect lat length") + exp$lon <- 1 + exp$lat <- lat + expect_error(CST_EnsClustering(exp), "Incorrect lon length") + exp$lon <- lon + + # Sanity checks on dimensions + res <- CST_EnsClustering(exp, numclus = 3) + expect_equivalent(dim(res$cluster), dim(exp$data)[c(2, 1, 3)]) + res <- CST_EnsClustering(exp, numclus = 3, cluster_dim = "sdate") + expect_equivalent(dim(res$cluster), dim(exp$data)[c(3, 1, 2)]) + res <- CST_EnsClustering(exp, numclus = 3, + cluster_dim = c("member", "dataset", "sdate")) + expect_equivalent(dim(res$cluster), dim(exp$data)[c(2, 1, 3)]) + res <- CST_EnsClustering(exp, numclus = 3, cluster_dim = 
c("member", "sdate")) + expect_equivalent(dim(res$cluster), dim(exp$data)[c(2, 3, 1)]) + expect_equivalent(dim(res$freq), c(cluster = 3, dim(exp$data)[1])) + expect_equivalent(dim(res$closest_member), c(cluster = 3, dim(exp$data)[1])) + expect_equivalent(dim(res$repr_field), c(cluster = 3, + dim(exp$data)[c(5, 6)], dim(exp$data)[1])) + expect_equivalent(dim(res$composites), c(cluster = 3, + dim(exp$data)[c(5, 6)], dim(exp$data)[1])) + + # Functionality tests + res <- CST_EnsClustering(exp, numclus = 3, variance_explained = 80, + cluster_dim = "member") + # The closest member of each cluster should be member of that cluster + for (i in 1:3) { + expect_equivalent(res$cluster[res$closest_member[i, 1, 1], 1, 1], i) + } + + res <- CST_EnsClustering(exp, numclus = 3, numpcs = 8, + cluster_dim = c("member")) + for (i in 1:3) { + expect_equivalent(res$cluster[res$closest_member[i, 1, 1], 1, 1], i) + } + + res <- CST_EnsClustering(exp, numclus = 3, variance_explained = 80, + cluster_dim = c("member")) + + # The weighted mean of all cluster mean fields should be 0 + comp1 <- sweep(res$composites, c(1, 4, 5), res$freq / 100, "*") + cmean <- apply(comp1, c(2, 3, 4, 5), mean)[ , , 1, 1] + expect_equal(mean(cmean), 0., tolerance = 1e-10) + + # This should give a different result + res2 <- CST_EnsClustering(exp, numclus = 3, variance_explained = 80, + cluster_dim = c("member"), + lat_lim = c(10, 12), lon_lim = c(0, 6)) + expect_gt(sum((res$cluster - res2$cluster)^2), 0) +})