From 18dcfb2efdd0129329a9e66e65e1c66921e18797 Mon Sep 17 00:00:00 2001 From: Federico Fabiano Date: Tue, 8 Oct 2019 14:18:24 +0200 Subject: [PATCH 01/33] Added CST_EnsClus --- R/CST_EnsClus.R | 400 +++++++++++++++++++++++++++++++++++++++++++++++ R/CST_MultiEOF.R | 12 +- 2 files changed, 407 insertions(+), 5 deletions(-) create mode 100644 R/CST_EnsClus.R diff --git a/R/CST_EnsClus.R b/R/CST_EnsClus.R new file mode 100644 index 00000000..dbfe5639 --- /dev/null +++ b/R/CST_EnsClus.R @@ -0,0 +1,400 @@ +#' @rdname CST_EnsClus +#' @title Ensemble clustering +#' +#' @author Federico Fabiano - ISAC-CNR, \email{f.fabiano@isac.cnr.it} +#' @author Ignazio Giuntoli - ISAC-CNR, \email{i.giuntoli@isac.cnr.it} +#' @author Danila Volpi - ISAC-CNR, \email{d.volpi@isac.cnr.it} +#' @author Paolo Davini - ISAC-CNR, \email{p.davini@isac.cnr.it} +#' +#' @description This function performs EOF analysis over multiple variables, +#' accepting in input a list of CSTools objects. Based on Singular Value Decomposition. +#' +#' @param datalist A list of objects of the class 's2dv_cube', containing the variables to be analysed. +#' Each data object in the list is expected to have an element named \code{$data} with at least two +#' spatial dimensions named "lon" and "lat", a dimension "ftime" and a dimension "sdate". +#' @param neof_composed Number of composed eofs to return in output +#' @param minvar Minimum variance fraction to be explained in first decomposition +#' @param neof_max Maximum number of single eofs considered in the first decomposition +#' @param xlim Vector with longitudinal range limits for the calculation +#' @param ylim Vector with latitudinal range limits for the calculation +#' @return A list with elements \code{$coeff} (an array of time-varying principal component coefficients), +#' \code{$variance} (a matrix of explained variances), +#' \code{eof_pattern} (a matrix of EOF patterns obtained by regression for each variable). +#' @import abind +#' @export +#' @examples +#' \donttest{ +#' exp <- 1 : (2 * 3 * 4 * 8 * 8) +#' } + +CST_EnsClus <- function(datalist, time_moment = 'mean', numclus = 4, lon_lim = NULL, lat_lim = NULL, + varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE, time_percentile = NULL) { + # print("Pasting data...") + # exp_time <- abind(lapply(datalist, '[[', 'exp'), along = 0) + # dim(exp_time) <- c(var = length(datalist), dim(datalist[[1]]$exp)) + # print("...done") + + # print(dim(exp_time)) + # exp <- apply(exp_time, c(2), mean) + # print(" qui c'è da fare la media temporale o altro (sd/perc...) sulla dimensione 1") + # print(dim(exp)) + #exp_time = drop(datalist$exp$data) + + result <- EnsClus(datalist$exp$data, datalist$exp$lat, datalist$exp$lon, time_moment = time_moment, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, numpcs = numpcs, flag_varfrac = flag_varfrac, time_percentile = time_percentile) + + return(result) +} + +# EnsClus <- function(exp_time, time_moment = 'mean', numclus = 4, lon_lim = NULL, lat_lim = NULL, +# varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE) { +# exp <- apply(exp_time, c(1, 3, 4), mean) +# exp <- +# print(" qui c'è da fare la media temporale o altro (sd/perc...) sulla dimensione 1") +# print(dim(exp)) +# +# #gigi = aperm(var_anom, c(2,3,1)) # reshaping to give the right input to regimes function +# print(" e anche un reshape per mettere la matrice in forma (n_ens, lon, lat)") +# +# result <- ensclus_Raw(exp, datalist$exp$lat, datalist$exp$lon, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, numpcs = numpcs, flag_varfrac = flag_varfrac) +# +# return(result) +# } + +EnsClus <- function(exp_time, lat, lon, time_moment = 'mean', numclus = 4, lon_lim = NULL, lat_lim = NULL, + varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE, time_percentile = NULL) { + + # Check/detect time_dim + print(dim(exp_time)) + if (time_moment == 'mean') { + print('Considering the time_moment: mean') + #exp <- apply(exp_time, c("dataset", "member", "sdate", "lat", "lon"), mean) + exp <- apply(exp_time, c(1, 2, 3, 5, 6), mean) + } + + if (time_moment == 'sd') { + print('Considering the time_moment: sd') + exp <- apply(exp_time, c(1, 2, 3, 5, 6), sd) + } + + if (time_moment == 'perc') { + print(paste0('Considering the time_moment: percentile ', sprintf("%5f", time_percentile))) + exp <- apply(exp_time, c(1, 2, 3, 5, 6), quantile, time_percentile/100.) + } + + print(dim(exp)) + + # dataset member sdate ftime lat lon + # Repeatedly apply .ensclus_Raw + result <- apply(exp, c(1, 2), ensclus_Raw, lat, lon, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, numpcs = numpcs, flag_varfrac = flag_varfrac) + + return(result) +} + +# EnsClus function +ensclus_Raw <- function(var_ens, lat, lon, numclus = 4, lon_lim = NULL, lat_lim = NULL, varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE) { + print('entro') + print(lat) + # dim(var_ens) = n_ens, lon, lat!!!!! + if (length(lat) != dim(var_ens)[2]) { + print('INCORRECT LAT length') + exit() + } + + if (length(lon) != dim(var_ens)[3]) { + print('INCORRECT LON length') + exit() + } + + n_ens = dim(var_ens)[1] + + print("Calculating ensemble anomalies...") + ens_mean = apply(var_ens, c(2,3), mean) + var_anom = array(dim=dim(var_ens)) + for (k in seq(1,n_ens)) { + var_anom[k,,] = var_ens[k,,] - ens_mean + } + + gigi = aperm(var_anom, c(2,3,1)) # reshaping to give the right input to regimes function + clusters = .regimes(lon, lat, gigi, ncluster = numclus, ntime = 1000, neof = numpcs, lon_lim, lat_lim, flag_varfrac = flag_varfrac, perc = varfrac_explained, max_eofs = n_ens-1) + + clus_labels = as.array(clusters$cluster) + frequencies = as.array(clusters$frequencies) + + clus_centers = clusters$clus_centers + + closest_member = array(dim = numclus) + dist_closest_member = array(dim = numclus) + for (iclu in seq(1,numclus)) { + this_clus_labels = which(clus_labels == iclu) + #print(paste0('aaaaaaaaaaaaaaaaaaaaaaaaaaarghhhhh ',iclu, ' ', length(this_clus_labels))) + if (length(this_clus_labels) > 1) { + dist_arr = apply(clusters$pcs[clus_labels == iclu,], 1, .dist_from_center, center = clus_centers[iclu,]) + # print(clusters$pcs[clus_labels == iclu,1]) + # print(clus_centers[iclu,1]) + print(paste0("distance from cluster ",iclu," center:")) + print(this_clus_labels) + print(dist_arr) + closest_member[iclu] = this_clus_labels[which.min(dist_arr)] + dist_closest_member[iclu] = min(dist_arr) + print(paste0("closest member to cluster ", iclu, " center is: ", closest_member[iclu])) + } else { + # print(this_clus_labels) + # print(paste0('oeeeeeeeeeeeeeeeeeeeeeeeee ',iclu)) + # print(clus_labels) + print(paste0("distance from cluster ",iclu," center:")) + print(this_clus_labels) + print(dist_arr) + closest_member[iclu] = this_clus_labels + dist_closest_member[iclu] = .dist_from_center(clusters$pcs[clus_labels == iclu,], center = clus_centers[iclu,]) + } + + } + + print("EnsClus completed...") + out = list(labels = clus_labels, freq = frequencies, closest_member = closest_member, clus_repr_field = var_ens[closest_member,,], clus_composites = clusters$regimes) + # out = list(labels = clus_labels, freq = frequencies, closest_member = closest_member, repr_field = var_ens[,,closest_member], clus_centers = clus_centers, pcs = clusters$pcs) + + return(out) +} + + +.dist_from_center <- function(y, center = NULL) { + dist = sqrt(sum((y-center)^2)) + return(dist) +} + + +.eofs <- function(lon, lat, field, neof = 4, xlim = NULL, ylim = NULL, + method = "SVD", do_standardize = F, do_regression = F, verbose = T) { + # R tool for computing EOFs based on Singular Value Decomposition ("SVD", default) + # or with the eigenvectors of the covariance matrix ("covariance", slower) + # If requested, computes linear regressions and standardizes the PCs + # If you want to use the regressions, remember to standardize the PCs + # Take as input a 3D anomaly field. + # Requires "personal" functions area.weight, whicher and standardize + + # area weighting, based on the root of cosine + .printv("Area Weighting...", verbose) + ww <- .area.weight(lon, lat, root = T) + wwfield <- sweep(field, c(1, 2), ww, "*") + + # selection of the xbox and ybox if defined + if (!is.null(xlim)) { + lonselect <- .whicher(lon, xlim[1]):.whicher(lon, xlim[2]) + } else { + lonselect <- 1:length(lon) + } + + if (!is.null(ylim)) { + latselect <- .whicher(lat, ylim[1]):.whicher(lat, ylim[2]) + } else { + latselect <- 1:length(lat) + } + + # box + box <- wwfield[lonselect, latselect, ] + slon <- lon[lonselect] + slat <- lat[latselect] + + # transform 3D field in a matrix + new_box <- array(box, dim = c(dim(box)[1] * dim(box)[2], dim(box)[3])) + + # calling SVD + if (method == "SVD") { + .printv("Calling SVD...", verbose) + SVD <- svd(new_box, nu = neof, nv = neof) + + # extracting EOFs (loading pattern), expansions coefficient and variance explained + pattern <- array(SVD$u, dim = c(dim(box)[1], dim(box)[2], neof)) + coefficient <- SVD$v + variance <- (SVD$d[1:neof])^2 / sum((SVD$d)^2) + if (do_standardize) { + coefficient <- apply(coefficient, c(2), standardize) + } else { + coefficient <- sweep(coefficient, c(2), sqrt(variance), "*") + } + } + + # calling covariance matrix + if (method == "covariance") { + .printv("Calling eigenvectors of the covariance matrix...", verbose) + covma <- cov(t(new_box)) + eig <- eigen(covma) + coef <- (t(new_box) %*% eig$vector)[, 1:neof] + pattern <- array(eig$vectors, dim = c(dim(box)[1], dim(box)[2], dim(box)[3]))[, , 1:neof] + variance <- eig$values[1:neof] / sum(eig$values) + if (do_standardize) { + coefficient <- apply(coef, c(2), standardize) + } else { + coefficient <- coef + } + } + + # linear regressions on anomalies + regression <- NULL + if (do_regression) { + .printv("Linear Regressions (it can takes a while)... ", verbose) + regression <- array(NA, dim = c(length(lon), length(lat), neof)) + # for (i in 1:neof) {regression[,,i]=apply(field,c(1,2),function(x) coef(lm(x ~ coefficient[,i]))[2])} + for (i in 1:neof) { + regression[, , i] <- apply(field, c(1, 2), function(x) lin.fit(as.matrix(coefficient[, i], ncol = 1), x)$coefficients) + } + } + + # preparing output + .printv("Finalize...", verbose) + pattern <- list(x = slon, y = slat, z = pattern) + out <- list(pattern = pattern, coeff = coefficient, variance = variance, regression = regression) + return(out) +} + + +.regimes <- function(lon, lat, field, ncluster = 4, ntime = 1000, neof = 10, xlim, ylim, alg = "Hartigan-Wong", flag_varfrac = FALSE, perc = NULL, max_eofs = 50) { + # R tool to compute cluster analysis based on k-means. + # Requires "personal" function eofs + # Take as input a 3D anomaly field + + # Reduce the phase space with EOFs: use SVD and do not standardize PCs + print("Launching EOFs...") + t0 <- proc.time() + if (flag_varfrac) { + if (is.null(perc)) { + print('flag_varfrac is TRUE but no perc as been specified') + } + reducedspace <- .eofs(lon, lat, field, neof = max_eofs, xlim = xlim, ylim = ylim, method = "SVD", do_regression = F, do_standardize = F) + #print(dim(reducedspace$coeff)) + #print(cumsum(reducedspace$variance)) + neof = which(cumsum(reducedspace$variance) > perc/100.)[1] + print("Number of EOFs needed for var:") + print(neof) + PC <- reducedspace$coeff[,1:neof] + # print(dim(PC)) + } else { + reducedspace <- .eofs(lon, lat, field, neof = neof, xlim = xlim, ylim = ylim, method = "SVD", do_regression = F, do_standardize = F) + # extract the principal components + PC <- reducedspace$coeff + # print(dim(PC)) + } + t1 <- proc.time() - t0 + # print(t1) + + # k-means computation repeat for ntime to find best solution. + print("Computing k-means...") + t0 <- proc.time() + #print(str(ncluster)) + regimes <- kmeans(PC, as.numeric(ncluster), nstart = ntime, iter.max = 1000, algorithm = alg) + t1 <- proc.time() - t0 + # print(t1) + + # Extract regimes frequencyr and timeseries of occupation + cluster <- regimes$cluster + frequencies <- regimes$size / dim(field)[3] * 100 + print("Cluster frequencies:") + print(frequencies[order(frequencies, decreasing = T)]) + # print(regimes$tot.withinss) + + print("Creating Composites...") + #compose <- aperm(apply(field, c(1, 2), by, cluster, mean), c(2, 3, 1)) + compose <- apply(field, c(1, 2), by, cluster, mean) + + # print(frequencies) + # print(regimes$centers) + # sorting from the more frequent to the less frequent + kk <- order(frequencies, decreasing = T) + # print(kk) + cluster <- cluster + 100 + for (ss in 1:ncluster) { + cluster[cluster == (ss + 100)] <- which(kk == ss) + } + + # prepare output + print("Finalize...") + out <- list(cluster = cluster, frequencies = frequencies[kk], regimes = compose[kk,,], pcs = PC, clus_centers = regimes$centers[kk,], tot.withinss = regimes$tot.withinss) + return(out) +} + + +# new function to create simple list with date values - Oct-18 +# it needs a date or PCICt object, and returns also the season subdivision +.power.date <- function(datas, verbose = FALSE) { + + # create a "season" for continuous time, used by persistance tracking + startpoints <- c(0, which(diff(datas) > 1)) + deltapoints <- diff(c(startpoints, length(datas))) + seas <- inverse.rle(list(lengths = deltapoints, + values = seq(1, length(startpoints)))) + + etime <- list( + day = as.numeric(format(datas, "%d")), + month = as.numeric(format(datas, "%m")), + year = as.numeric(format(datas, "%Y")), data = datas, season = seas + ) + + .printv("Time Array Built", verbose) + .printv(paste("Length:", length(seas)), verbose) + .printv(paste("From", datas[1], "to", datas[length(seas)]), verbose) + return(etime) +} + + +# function for daily anomalies, use array predeclaration and rowMeans (40 times faster!) +.daily.anom.mean <- function(ics, ipsilon, field, etime) { + condition <- paste(etime$day, etime$month) + daily <- array(NA, dim = c(length(ics), length(ipsilon), + length(unique(condition)))) + anom <- field * NA + + for (t in unique(condition)) { + if (sum(t == condition) == 1) { + print(paste0("Cannot compute a mean with only one value: ", + "using climatological mean")) + anom[, , which(t == condition)] <- rowMeans(field, dims = 2) + } else { + daily[, , which(t == unique(condition))] <- rowMeans(field[, , t == condition], dims = 2) + anom[, , which(t == condition)] <- sweep(field[, , which(t == condition)], + c(1, 2), + daily[, , which(t == unique(condition))], "-") + } + } + return(anom) +} + + +# produce a 2d matrix of area weights +.area.weight <- function(ics, ipsilon, root = T) { + field <- array(NA, dim = c(length(ics), length(ipsilon))) + if (root == T) { + for (j in 1:length(ipsilon)) { + field[, j] <- sqrt(cos(pi / 180 * ipsilon[j])) + } + } + + if (root == F) { + for (j in 1:length(ipsilon)) { + field[, j] <- cos(pi / 180 * ipsilon[j]) + } + } + + return(field) +} + +# verbose-only printing function +.printv <- function(value, verbosity = TRUE) { + if (verbosity) { + print(value) + } +} + + +# detect ics ipsilon lat-lon +.whicher <- function(axis, number) { + out <- which.min(abs(axis - number)) + return(out) +} + + +# normalize a time series +.standardize <- function(timeseries) { + out <- (timeseries - mean(timeseries, na.rm = T)) / sd(timeseries, na.rm = T) + return(out) +} diff --git a/R/CST_MultiEOF.R b/R/CST_MultiEOF.R index fbb016c4..d3cfcf67 100644 --- a/R/CST_MultiEOF.R +++ b/R/CST_MultiEOF.R @@ -9,7 +9,7 @@ #' #' @param datalist A list of objects of the class 's2dv_cube', containing the variables to be analysed. #' Each data object in the list is expected to have an element named \code{$data} with at least two -#' spatial dimensions named "lon" and "lat", a dimension "ftime" and a dimension "sdate". +#' spatial dimensions named "lon" and "lat", a dimension "ftime" and a dimension "sdate". #' @param neof_composed Number of composed eofs to return in output #' @param minvar Minimum variance fraction to be explained in first decomposition #' @param neof_max Maximum number of single eofs considered in the first decomposition @@ -29,7 +29,7 @@ CST_MultiEOF <- function(datalist, neof_max = 40, neof_composed = 5, minvar = 0.6, xlim = NULL, ylim = NULL) { print("Pasting data...") - exp <- abind(lapply(datalist, '[[', 'exp'), along = 0) + exp <- abind(lapply(datalist, '[[', 'exp'), along = 0) dim(exp) <- c(var = length(datalist), dim(datalist[[1]]$exp)) print("...done") result <- MultiEOF(exp, datalist[[1]]$lon, datalist[[1]]$lat, @@ -47,7 +47,7 @@ CST_MultiEOF <- function(datalist, #' @author Jost von Hardenberg - ISAC-CNR, \email{j.vonhardenberg@isac.cnr.it} #' @author Paolo Davini - ISAC-CNR, \email{p.davini@isac.cnr.it} #' -#' @description This function performs EOF analysis over multiple variables, accepting in +#' @description This function performs EOF analysis over multiple variables, accepting in #' input an arry with a dimension \code{"var"} for each variable to analyse. #' Based on Singular Value Decomposition. #' @@ -57,7 +57,7 @@ CST_MultiEOF <- function(datalist, #' @param lon Vector of longitudes. #' @param lat Vector of latitudes. #' @param time Vector or matrix of dates in POSIXct format. -#' @param time_dim Character vector of dimension names to be considered as times +#' @param time_dim Character vector of dimension names to be considered as times #' @param lon_dim String with dimension name of longitudinal coordinate (default: "lon") #' @param lat_dim String with dimension name of latitudinal coordinate (default: "lat") #' @param neof_composed Number of composed eofs to return in output @@ -109,7 +109,9 @@ MultiEOF <- function(data, lon, lat, time, time_dim = NULL, dim(data) <- c(cdim[ind], samples = prod(cdim[-ind])) # Repeatedly apply .multi.eofs - result <- Apply(data, c("var", lon_dim, lat_dim, "samples"), + #result <- Apply(data, c("var", lon_dim, lat_dim, "samples"), + #result <- apply(data, c("dataset", "member"), + result <- apply(data, c(2, 3), .multi.eofs, lon, lat, time, neof_max = neof_max, neof_composed = neof_composed, minvar = minvar, xlim = xlim, ylim = ylim) -- GitLab From b9a2181b11e58ce0863f0a0fac638d3bc4d3111e Mon Sep 17 00:00:00 2001 From: Federico Fabiano Date: Tue, 8 Oct 2019 14:52:20 +0200 Subject: [PATCH 02/33] Added readme, clustering_on member/sdate --- R/CST_EnsClus.R | 110 ++++++++++++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 46 deletions(-) diff --git a/R/CST_EnsClus.R b/R/CST_EnsClus.R index dbfe5639..d0acd42c 100644 --- a/R/CST_EnsClus.R +++ b/R/CST_EnsClus.R @@ -5,66 +5,78 @@ #' @author Ignazio Giuntoli - ISAC-CNR, \email{i.giuntoli@isac.cnr.it} #' @author Danila Volpi - ISAC-CNR, \email{d.volpi@isac.cnr.it} #' @author Paolo Davini - ISAC-CNR, \email{p.davini@isac.cnr.it} +#' @author Jost von Hardenberg - ISAC-CNR, \email{j.vonhardenberg@isac.cnr.it} #' -#' @description This function performs EOF analysis over multiple variables, -#' accepting in input a list of CSTools objects. Based on Singular Value Decomposition. +#' @description This function performs a clustering on members/starting dates and returns a number of scenarios, with representative members for each of them. The clustering is performed in a reduced EOF space. #' -#' @param datalist A list of objects of the class 's2dv_cube', containing the variables to be analysed. +#' @param data An object of the class 's2dv_cube', containing the variables to be analysed. #' Each data object in the list is expected to have an element named \code{$data} with at least two #' spatial dimensions named "lon" and "lat", a dimension "ftime" and a dimension "sdate". -#' @param neof_composed Number of composed eofs to return in output -#' @param minvar Minimum variance fraction to be explained in first decomposition -#' @param neof_max Maximum number of single eofs considered in the first decomposition -#' @param xlim Vector with longitudinal range limits for the calculation -#' @param ylim Vector with latitudinal range limits for the calculation -#' @return A list with elements \code{$coeff} (an array of time-varying principal component coefficients), -#' \code{$variance} (a matrix of explained variances), -#' \code{eof_pattern} (a matrix of EOF patterns obtained by regression for each variable). -#' @import abind +#' @param time_moment Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). If 'perc' the keyword 'time_percentile' is also needed. +#' @param time_percentile Set the percentile in time you want to analyse. Active only if time_moment is 'perc'. +#' @param numclus Number of clusters (scenarios) to be calculated. +#' @param lon_lim List with the two longitude margins. (-180,180) format. +#' @param lat_lim List with the two latitude margins. +#' @param varfrac_explained Variance to be explained by the set of eofs. Used only if flag_varfrac is True. +#' @param numpcs Number of eofs retained in the analysis. +#' @param flag_varfrac Selects the minimum number of eofs needed to reach the varfrac_explained. +#' @param clustering_on Decides whether to calculate the scenarios among members with same sdate ('member') or among different sdates ('sdate') but fixed member. +#' @return A list with elements \code{$labels} (cluster assigned for each member), +#' \code{$freq} (relative frequency of each cluster), \code{$closest_member} +#' (representative member for each cluster), \code{$clus_repr_field} (list of fields +#' for each representative member), \code{clus_composites} (list of mean fields for each cluster). #' @export #' @examples #' \donttest{ #' exp <- 1 : (2 * 3 * 4 * 8 * 8) #' } -CST_EnsClus <- function(datalist, time_moment = 'mean', numclus = 4, lon_lim = NULL, lat_lim = NULL, - varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE, time_percentile = NULL) { - # print("Pasting data...") - # exp_time <- abind(lapply(datalist, '[[', 'exp'), along = 0) - # dim(exp_time) <- c(var = length(datalist), dim(datalist[[1]]$exp)) - # print("...done") +CST_EnsClus <- function(data, time_moment = 'mean', numclus = 4, lon_lim = NULL, lat_lim = NULL, + varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE, time_percentile = NULL, clustering_on = 'member') { - # print(dim(exp_time)) - # exp <- apply(exp_time, c(2), mean) - # print(" qui c'è da fare la media temporale o altro (sd/perc...) sulla dimensione 1") - # print(dim(exp)) - #exp_time = drop(datalist$exp$data) - - result <- EnsClus(datalist$exp$data, datalist$exp$lat, datalist$exp$lon, time_moment = time_moment, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, numpcs = numpcs, flag_varfrac = flag_varfrac, time_percentile = time_percentile) + result <- EnsClus(datalist$exp$data, datalist$exp$lat, datalist$exp$lon, time_moment = time_moment, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, numpcs = numpcs, flag_varfrac = flag_varfrac, time_percentile = time_percentile, clustering_on = clustering_on) return(result) } -# EnsClus <- function(exp_time, time_moment = 'mean', numclus = 4, lon_lim = NULL, lat_lim = NULL, -# varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE) { -# exp <- apply(exp_time, c(1, 3, 4), mean) -# exp <- -# print(" qui c'è da fare la media temporale o altro (sd/perc...) sulla dimensione 1") -# print(dim(exp)) -# -# #gigi = aperm(var_anom, c(2,3,1)) # reshaping to give the right input to regimes function -# print(" e anche un reshape per mettere la matrice in forma (n_ens, lon, lat)") -# -# result <- ensclus_Raw(exp, datalist$exp$lat, datalist$exp$lon, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, numpcs = numpcs, flag_varfrac = flag_varfrac) -# -# return(result) -# } +#' @rdname EnsClus +#' @title Ensemble clustering +#' +#' @author Federico Fabiano - ISAC-CNR, \email{f.fabiano@isac.cnr.it} +#' @author Ignazio Giuntoli - ISAC-CNR, \email{i.giuntoli@isac.cnr.it} +#' @author Danila Volpi - ISAC-CNR, \email{d.volpi@isac.cnr.it} +#' @author Paolo Davini - ISAC-CNR, \email{p.davini@isac.cnr.it} +#' @author Jost von Hardenberg - ISAC-CNR, \email{j.vonhardenberg@isac.cnr.it} +#' +#' @description This function performs a clustering on members/starting dates and returns a number of scenarios, with representative members for each of them. The clustering is performed in a reduced EOF space. +#' +#' @param exp_time A matrix of dimensions 'dataset member sdate ftime lat lon' containing the variables to be analysed. If dataset is a s2dv_cube, exp_time = dataset$exp$data +#' @param time_moment Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). If 'perc' the keyword 'time_percentile' is also needed. +#' @param time_percentile Set the percentile in time you want to analyse. Active only if time_moment is 'perc'. +#' @param numclus Number of clusters (scenarios) to be calculated. +#' @param lon_lim List with the two longitude margins. (-180,180) format. +#' @param lat_lim List with the two latitude margins. +#' @param varfrac_explained Variance to be explained by the set of eofs. Used only if flag_varfrac is True. +#' @param numpcs Number of eofs retained in the analysis. +#' @param flag_varfrac Selects the minimum number of eofs needed to reach the varfrac_explained. +#' @param clustering_on Decides whether to calculate the scenarios among members with same sdate ('member') or among different sdates ('sdate') but fixed member. +#' @return A list with elements \code{$labels} (cluster assigned for each member), +#' \code{$freq} (relative frequency of each cluster), \code{$closest_member} +#' (representative member for each cluster), \code{$clus_repr_field} (list of fields +#' for each representative member), \code{clus_composites} (list of mean fields for each cluster). +#' @export +#' @examples +#' \donttest{ +#' exp <- 1 : (2 * 3 * 4 * 8 * 8) +#' } + EnsClus <- function(exp_time, lat, lon, time_moment = 'mean', numclus = 4, lon_lim = NULL, lat_lim = NULL, - varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE, time_percentile = NULL) { + varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE, time_percentile = NULL, clustering_on = 'member') { - # Check/detect time_dim print(dim(exp_time)) + + # Apply time_moment if (time_moment == 'mean') { print('Considering the time_moment: mean') #exp <- apply(exp_time, c("dataset", "member", "sdate", "lat", "lon"), mean) @@ -85,15 +97,20 @@ EnsClus <- function(exp_time, lat, lon, time_moment = 'mean', numclus = 4, lon_l # dataset member sdate ftime lat lon # Repeatedly apply .ensclus_Raw - result <- apply(exp, c(1, 2), ensclus_Raw, lat, lon, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, numpcs = numpcs, flag_varfrac = flag_varfrac) + # It would be better to apply on the named dimensions here.. + if (clustering_on == 'sdate') { + result <- apply(exp, c(1, 2), ensclus_Raw, lat, lon, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, numpcs = numpcs, flag_varfrac = flag_varfrac) + } else if (clustering_on == 'member') { + result <- apply(exp, c(1, 3), ensclus_Raw, lat, lon, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, numpcs = numpcs, flag_varfrac = flag_varfrac) + } return(result) } # EnsClus function ensclus_Raw <- function(var_ens, lat, lon, numclus = 4, lon_lim = NULL, lat_lim = NULL, varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE) { - print('entro') - print(lat) + #print('entro') + #print(lat) # dim(var_ens) = n_ens, lon, lat!!!!! if (length(lat) != dim(var_ens)[2]) { print('INCORRECT LAT length') @@ -142,10 +159,11 @@ ensclus_Raw <- function(var_ens, lat, lon, numclus = 4, lon_lim = NULL, lat_lim # print(paste0('oeeeeeeeeeeeeeeeeeeeeeeeee ',iclu)) # print(clus_labels) print(paste0("distance from cluster ",iclu," center:")) + dista = .dist_from_center(clusters$pcs[clus_labels == iclu,], center = clus_centers[iclu,]) print(this_clus_labels) - print(dist_arr) + print(dista) closest_member[iclu] = this_clus_labels - dist_closest_member[iclu] = .dist_from_center(clusters$pcs[clus_labels == iclu,], center = clus_centers[iclu,]) + dist_closest_member[iclu] = dista } } -- GitLab From 676a984ee499b1f4dbe1ea7f0813aebfeffb8223 Mon Sep 17 00:00:00 2001 From: Federico Fabiano Date: Wed, 9 Oct 2019 16:06:52 +0200 Subject: [PATCH 03/33] Removed changes to multieof --- R/CST_EnsClus.R | 9 +++++---- R/CST_MultiEOF.R | 4 +--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/R/CST_EnsClus.R b/R/CST_EnsClus.R index d0acd42c..7ff1ada3 100644 --- a/R/CST_EnsClus.R +++ b/R/CST_EnsClus.R @@ -99,16 +99,17 @@ EnsClus <- function(exp_time, lat, lon, time_moment = 'mean', numclus = 4, lon_l # Repeatedly apply .ensclus_Raw # It would be better to apply on the named dimensions here.. if (clustering_on == 'sdate') { - result <- apply(exp, c(1, 2), ensclus_Raw, lat, lon, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, numpcs = numpcs, flag_varfrac = flag_varfrac) + result <- apply(exp, c(1, 2), .ensclus, lat, lon, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, numpcs = numpcs, flag_varfrac = flag_varfrac) } else if (clustering_on == 'member') { - result <- apply(exp, c(1, 3), ensclus_Raw, lat, lon, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, numpcs = numpcs, flag_varfrac = flag_varfrac) + result <- apply(exp, c(1, 3), .ensclus, lat, lon, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, numpcs = numpcs, flag_varfrac = flag_varfrac) } return(result) } -# EnsClus function -ensclus_Raw <- function(var_ens, lat, lon, numclus = 4, lon_lim = NULL, lat_lim = NULL, varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE) { + +# Atomic ensclus function +.ensclus <- function(var_ens, lat, lon, numclus = 4, lon_lim = NULL, lat_lim = NULL, varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE) { #print('entro') #print(lat) # dim(var_ens) = n_ens, lon, lat!!!!! diff --git a/R/CST_MultiEOF.R b/R/CST_MultiEOF.R index d3cfcf67..66bc5c1f 100644 --- a/R/CST_MultiEOF.R +++ b/R/CST_MultiEOF.R @@ -109,9 +109,7 @@ MultiEOF <- function(data, lon, lat, time, time_dim = NULL, dim(data) <- c(cdim[ind], samples = prod(cdim[-ind])) # Repeatedly apply .multi.eofs - #result <- Apply(data, c("var", lon_dim, lat_dim, "samples"), - #result <- apply(data, c("dataset", "member"), - result <- apply(data, c(2, 3), + result <- Apply(data, c("var", lon_dim, lat_dim, "samples"), .multi.eofs, lon, lat, time, neof_max = neof_max, neof_composed = neof_composed, minvar = minvar, xlim = xlim, ylim = ylim) -- GitLab From 456dbed9967abad5e2982b67c47386f95c7766f3 Mon Sep 17 00:00:00 2001 From: Federico Fabiano Date: Fri, 8 Nov 2019 11:35:52 +0100 Subject: [PATCH 04/33] Removed CST_multieof from this branch --- R/CST_MultiEOF.R | 338 ----------------------------------------------- 1 file changed, 338 deletions(-) delete mode 100644 R/CST_MultiEOF.R diff --git a/R/CST_MultiEOF.R b/R/CST_MultiEOF.R deleted file mode 100644 index 66bc5c1f..00000000 --- a/R/CST_MultiEOF.R +++ /dev/null @@ -1,338 +0,0 @@ -#' @rdname CST_MultiEOF -#' @title EOF analysis of multiple variables -#' -#' @author Jost von Hardenberg - ISAC-CNR, \email{j.vonhardenberg@isac.cnr.it} -#' @author Paolo Davini - ISAC-CNR, \email{p.davini@isac.cnr.it} -#' -#' @description This function performs EOF analysis over multiple variables, -#' accepting in input a list of CSTools objects. Based on Singular Value Decomposition. -#' -#' @param datalist A list of objects of the class 's2dv_cube', containing the variables to be analysed. -#' Each data object in the list is expected to have an element named \code{$data} with at least two -#' spatial dimensions named "lon" and "lat", a dimension "ftime" and a dimension "sdate". -#' @param neof_composed Number of composed eofs to return in output -#' @param minvar Minimum variance fraction to be explained in first decomposition -#' @param neof_max Maximum number of single eofs considered in the first decomposition -#' @param xlim Vector with longitudinal range limits for the calculation -#' @param ylim Vector with latitudinal range limits for the calculation -#' @return A list with elements \code{$coeff} (an array of time-varying principal component coefficients), -#' \code{$variance} (a matrix of explained variances), -#' \code{eof_pattern} (a matrix of EOF patterns obtained by regression for each variable). -#' @import abind -#' @export -#' @examples -#' \donttest{ -#' exp <- 1 : (2 * 3 * 4 * 8 * 8) -#' } - -CST_MultiEOF <- function(datalist, - neof_max = 40, neof_composed = 5, minvar = 0.6, - xlim = NULL, ylim = NULL) { - print("Pasting data...") - exp <- abind(lapply(datalist, '[[', 'exp'), along = 0) - dim(exp) <- c(var = length(datalist), dim(datalist[[1]]$exp)) - print("...done") - result <- MultiEOF(exp, datalist[[1]]$lon, datalist[[1]]$lat, - datalist[[1]]$Dates$start, - time_dim = c("ftime", "sdate"), - neof = neof, neof_composed = neof_composed, - xlim = xlim, ylim = ylim) - - return(result) -} - -#' @rdname MultiEOF -#' @title EOF analysis of multiple variables starting from an array (reduced version) -#' -#' @author Jost von Hardenberg - ISAC-CNR, \email{j.vonhardenberg@isac.cnr.it} -#' @author Paolo Davini - ISAC-CNR, \email{p.davini@isac.cnr.it} -#' -#' @description This function performs EOF analysis over multiple variables, accepting in -#' input an arry with a dimension \code{"var"} for each variable to analyse. -#' Based on Singular Value Decomposition. -#' -#' @param datalist A multidimensional array with dimension \code{"var"}, -#' containing the variables to be analysed. The other diemnsions follow the same structure as the -#' \code{"exp"} element of a 's2dv_cube' object. -#' @param lon Vector of longitudes. -#' @param lat Vector of latitudes. -#' @param time Vector or matrix of dates in POSIXct format. -#' @param time_dim Character vector of dimension names to be considered as times -#' @param lon_dim String with dimension name of longitudinal coordinate (default: "lon") -#' @param lat_dim String with dimension name of latitudinal coordinate (default: "lat") -#' @param neof_composed Number of composed eofs to return in output -#' @param minvar Minimum variance fraction to be explained in first decomposition -#' @param neof_max Maximum number of single eofs considered in the first decomposition -#' @param xlim Vector with longitudinal range limits for the calculation -#' @param ylim Vector with latitudinal range limits for the calculation -#' @return A list with elements \code{$coeff} (an array of time-varying principal component coefficients), -#' \code{$variance} (a matrix of explained variances), -#' \code{eof_pattern} (a matrix of EOF patterns obtained by regression for each variable). -#' @import multiApply -#' @export -#' @examples -#' # Example for the 'reduced' MultiEOF function -#' -#' \donttest{ -#' } - -MultiEOF <- function(data, lon, lat, time, time_dim = NULL, - lon_dim = "lon", lat_dim = "lat", - neof_max = 40, neof_composed = 5, minvar = 0.6, - xlim = NULL, ylim = NULL) { - # Check/detect time_dim - if (is.null(time_dim)) { - time_dim_names <- c("ftime", "sdate", "time") - time_dim_num <- which(time_dim_names %in% names(dim(data))) - if (length(time_dim_num) > 0) { - # Find time dimension with length > 1 - ilong <- which(dim(data)[time_dim_names[time_dim_num]] > 1) - if (length(ilong) > 0) { - time_dim <- time_dim_names[time_dim_num[ilong[1]]] - } else { - stop("No time dimension longer than one found.") - } - } else { - stop("Could not automatically detect a target time dimension ", - "in the provided data in 'data'.") - } - print(paste("Selected time dim: ", time_dim)) - } - - # reorder and group time_dim together at the end - cdim0 <- dim(data) - imask <- names(cdim0) %in% time_dim - data <- .aperm2(data, c(which(!imask), which(imask))) - cdim <- dim(data) - ind <- 1:length(which(!imask)) - # compact (multiply) time_dim dimensions - dim(data) <- c(cdim[ind], samples = prod(cdim[-ind])) - - # Repeatedly apply .multi.eofs - result <- Apply(data, c("var", lon_dim, lat_dim, "samples"), - .multi.eofs, lon, lat, time, neof_max = neof_max, - neof_composed = neof_composed, minvar = minvar, - xlim = xlim, ylim = ylim) - - return(result) -} - - -#' Atomic MultiEOF -#' @param field_arr_raw an array of dimension: (n_field, lon, lat, time). -#' where n_field is the number of variables over which to calculate -#' the multi_eofs. -#' @param neof_composed Number of composed eofs to return in output -#' @param minvar Minimum variance fraction to be explained in first decomposition -#' @param neof_max Maximum number of single eofs considered in the first decomposition -#' @param xlim Vector with longitudinal range limits for the calculation -#' @param ylim Vector with latitudinal range limits for the calculation -#' @return A list with elements \code{$coeff} (an array of time-varying principal component coefficients), -#' \code{$variance} (a matrix of explained variances), -#' \code{eof_pattern} (a matrix of EOF patterns obtained by regression for each variable). -#' @noRd - -.multi.eofs <- function(field_arr_raw, lon, lat, time, - neof_max = 40, neof_composed = 5, minvar = 0.6, - xlim = NULL, ylim = NULL) { - - if (exists(".lm.fit")) { - lin.fit <- .lm.fit - } else { - lin.fit <- lm.fit - } - - n_field <- dim(field_arr_raw)[1] - - etime <- .power.date(time) - print("Calculating anomalies...") - field_arr <- array(dim = dim(field_arr_raw)) - for (k in seq(1, n_field, 1)) { - field_arr[k, , , ] <- .daily.anom.mean( - lon, lat, field_arr_raw[k, , , ], etime - ) - } - - # area weighting, based on the root of cosine - print("Area Weighting...") - ww <- .area.weight(lon, lat, root = T) - - for (k in seq(1, n_field, 1)) { - field <- field_arr[k, , , ] - # calculate the area weight - field <- sweep(field, c(1, 2), ww, "*") - - if(is.null(xlim) | is.null(xlim)) { - slon <- lon - slat <- lat - } else { - # selection of the box - field <- field[.whicher(lon, xlim[1]):.whicher(lon, xlim[2]), - .whicher(lat, ylim[1]):.whicher(lat, ylim[2]), ] - slon <- lon[.whicher(lon, xlim[1]):.whicher(lon, xlim[2])] - slat <- lat[.whicher(lat, ylim[1]):.whicher(lat, ylim[2])] - } - - # transform 3D field in a matrix - field <- array(field, dim = c(dim(field)[1] * dim(field)[2], dim(field)[3])) - - # calling SVD - print("Calling SVD...") - SVD <- svd(field, nu = neof_max, nv = neof_max) - - # extracting EOFs (loading pattern), expansions coefficient - # and variance explained - pattern <- array(SVD$u, dim = c(dim(field)[1], dim(field)[2], neof_max)) - coefficient <- SVD$v - variance <- (SVD$d[1:neof_max]) ^ 2 / sum((SVD$d) ^ 2) - print("Accumulated variance:") - print(cumsum(variance)) - reqPC <- which(cumsum(variance) > minvar)[1] - print("Number of EOFs needed for var:") - print(reqPC) - variance <- variance[1:reqPC] - coefficient <- coefficient[, 1:reqPC] - if (reqPC == 1) { - coefficient <- replicate(1, coefficient) - } - coefficient <- apply(coefficient, c(2), .standardize) - regression <- array(NA, dim = c(length(lon), length(lat), neof_max)) - for (i in 1:reqPC) { - regression[, , i] <- apply(get(paste0("field", k)), c(1, 2), - function(x) lin.fit(as.matrix(coefficient[, i], - ncol = 1), x)$coefficients - ) - } - - assign( - paste0("pc", k), list(coeff = coefficient, variance = variance, - wcoeff = sweep(coefficient, c(2), variance, "*"), - regression = regression) - ) - } - - newpc <- NULL - for (k in seq(1, n_field, 1)) { - newpc <- cbind(newpc, get(paste0("pc", k))$wcoeff) - } - newpc <- t(newpc) - - print("Calculating composed EOFs") - SVD <- svd(newpc, nu = neof_composed, nv = neof_composed) - # extracting EOFs, expansions coefficient and variance explained - coefficient <- SVD$v - variance <- (SVD$d[1:(neof_composed)]) ^ 2 / sum( (SVD$d) ^ 2) - coefficient <- apply(coefficient, c(2), .standardize) - - # linear regressions on anomalies - regression <- array(dim = c(n_field, length(lon), - length(lat), neof_composed)) - for (k in seq(1, n_field, 1)) { - print("Linear Regressions (it can take a while)... ") - for (i in 1:neof_composed) { - regression[k, , , i] <- apply( - field_arr[k, , , ], c(1, 2), - function(x) lin.fit( - as.matrix(coefficient[, i], - ncol = 1), - x)$coefficients - ) - } - } - - print("Finalize...") - names(dim(coefficient)) <- c("time", "eof") - variance <- array(variance) - names(dim(variance)) <- "eof" - names(dim(regression)) <- c("var", "lon", "lat", "eof") - - out <- list(coeff = coefficient, variance = variance, eof_pattern = regression) - - return(out) -} - - -# new function to create simple list with date values - Oct-18 -# it needs a date or PCICt object, and returns also the season subdivision -.power.date <- function(datas, verbose = FALSE) { - - # create a "season" for continuous time, used by persistance tracking - startpoints <- c(0, which(diff(datas) > 1)) - deltapoints <- diff(c(startpoints, length(datas))) - seas <- inverse.rle(list(lengths = deltapoints, - values = seq(1, length(startpoints)))) - - etime <- list( - day = as.numeric(format(datas, "%d")), - month = as.numeric(format(datas, "%m")), - year = as.numeric(format(datas, "%Y")), data = datas, season = seas - ) - - .printv("Time Array Built", verbose) - .printv(paste("Length:", length(seas)), verbose) - .printv(paste("From", datas[1], "to", datas[length(seas)]), verbose) - return(etime) -} - - -# function for daily anomalies, use array predeclaration and rowMeans (40 times faster!) -.daily.anom.mean <- function(ics, ipsilon, field, etime) { - condition <- paste(etime$day, etime$month) - daily <- array(NA, dim = c(length(ics), length(ipsilon), - length(unique(condition)))) - anom <- field * NA - - for (t in unique(condition)) { - if (sum(t == condition) == 1) { - print(paste0("Cannot compute a mean with only one value: ", - "using climatological mean")) - anom[, , which(t == condition)] <- rowMeans(field, dims = 2) - } else { - daily[, , which(t == unique(condition))] <- rowMeans(field[, , t == condition], dims = 2) - anom[, , which(t == condition)] <- sweep(field[, , which(t == condition)], - c(1, 2), - daily[, , which(t == unique(condition))], "-") - } - } - return(anom) -} - - -# produce a 2d matrix of area weights -.area.weight <- function(ics, ipsilon, root = T) { - field <- array(NA, dim = c(length(ics), length(ipsilon))) - if (root == T) { - for (j in 1:length(ipsilon)) { - field[, j] <- sqrt(cos(pi / 180 * ipsilon[j])) - } - } - - if (root == F) { - for (j in 1:length(ipsilon)) { - field[, j] <- cos(pi / 180 * ipsilon[j]) - } - } - - return(field) -} - -# verbose-only printing function -.printv <- function(value, verbosity = TRUE) { - if (verbosity) { - print(value) - } -} - - -# detect ics ipsilon lat-lon -.whicher <- function(axis, number) { - out <- which.min(abs(axis - number)) - return(out) -} - - -# normalize a time series -.standardize <- function(timeseries) { - out <- (timeseries - mean(timeseries, na.rm = T)) / sd(timeseries, na.rm = T) - return(out) -} -- GitLab From b4a85d02050e35e3091c44e61df5dbef3a2a3e2b Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Thu, 14 Nov 2019 23:59:25 +0100 Subject: [PATCH 05/33] basic delinting --- R/CST_EnsClus.R | 272 ++++++++++++++++++++++++++---------------------- 1 file changed, 145 insertions(+), 127 deletions(-) diff --git a/R/CST_EnsClus.R b/R/CST_EnsClus.R index 7ff1ada3..8ab80e98 100644 --- a/R/CST_EnsClus.R +++ b/R/CST_EnsClus.R @@ -31,10 +31,23 @@ #' exp <- 1 : (2 * 3 * 4 * 8 * 8) #' } -CST_EnsClus <- function(data, time_moment = 'mean', numclus = 4, lon_lim = NULL, lat_lim = NULL, - varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE, time_percentile = NULL, clustering_on = 'member') { - - result <- EnsClus(datalist$exp$data, datalist$exp$lat, datalist$exp$lon, time_moment = time_moment, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, numpcs = numpcs, flag_varfrac = flag_varfrac, time_percentile = time_percentile, clustering_on = clustering_on) +CST_EnsClus <- function(data, time_moment = "mean", numclus = 4, + lon_lim = NULL, lat_lim = NULL, + varfrac_explained = 80, numpcs = 4, + flag_varfrac = FALSE, time_percentile = NULL, + clustering_on = "member") { +# if (!inherits(data, "s2dv_cube")) { +# stop("Parameter 'data' must be of the class 's2dv_cube', ", +# "as output by CSTools::CST_Load.") +# } + + result <- EnsClus(datalist$exp$data, datalist$exp$lat, datalist$exp$lon, + time_moment = time_moment, numclus = numclus, + lon_lim = lon_lim, lat_lim = lat_lim, + varfrac_explained = varfrac_explained, numpcs = numpcs, + flag_varfrac = flag_varfrac, + time_percentile = time_percentile, + clustering_on = clustering_on) return(result) } @@ -70,122 +83,125 @@ CST_EnsClus <- function(data, time_moment = 'mean', numclus = 4, lon_lim = NULL, #' exp <- 1 : (2 * 3 * 4 * 8 * 8) #' } - -EnsClus <- function(exp_time, lat, lon, time_moment = 'mean', numclus = 4, lon_lim = NULL, lat_lim = NULL, - varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE, time_percentile = NULL, clustering_on = 'member') { +EnsClus <- function(exp_time, lat, lon, time_moment = "mean", numclus = 4, + lon_lim = NULL, lat_lim = NULL, varfrac_explained = 80, + numpcs = 4, flag_varfrac = FALSE, time_percentile = NULL, + clustering_on = "member") { print(dim(exp_time)) # Apply time_moment - if (time_moment == 'mean') { - print('Considering the time_moment: mean') - #exp <- apply(exp_time, c("dataset", "member", "sdate", "lat", "lon"), mean) + if (time_moment == "mean") { + print("Considering the time_moment: mean") exp <- apply(exp_time, c(1, 2, 3, 5, 6), mean) } - if (time_moment == 'sd') { - print('Considering the time_moment: sd') + if (time_moment == "sd") { + print("Considering the time_moment: sd") exp <- apply(exp_time, c(1, 2, 3, 5, 6), sd) } - if (time_moment == 'perc') { - print(paste0('Considering the time_moment: percentile ', sprintf("%5f", time_percentile))) - exp <- apply(exp_time, c(1, 2, 3, 5, 6), quantile, time_percentile/100.) + if (time_moment == "perc") { + print(paste0("Considering the time_moment: percentile ", + sprintf("%5f", time_percentile))) + exp <- apply(exp_time, c(1, 2, 3, 5, 6), quantile, time_percentile / 100.) } print(dim(exp)) - # dataset member sdate ftime lat lon - # Repeatedly apply .ensclus_Raw + # Repeatedly apply .ensclus # It would be better to apply on the named dimensions here.. - if (clustering_on == 'sdate') { - result <- apply(exp, c(1, 2), .ensclus, lat, lon, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, numpcs = numpcs, flag_varfrac = flag_varfrac) - } else if (clustering_on == 'member') { - result <- apply(exp, c(1, 3), .ensclus, lat, lon, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, numpcs = numpcs, flag_varfrac = flag_varfrac) - } - + if (clustering_on == "sdate") { + result <- apply(exp, c(1, 2), .ensclus, lat, lon, numclus = numclus, + lon_lim = lon_lim, lat_lim = lat_lim, + varfrac_explained = varfrac_explained, numpcs = numpcs, + flag_varfrac = flag_varfrac) +} else if (clustering_on == "member") { + result <- apply(exp, c(1, 3), .ensclus, lat, lon, numclus = numclus, + lon_lim = lon_lim, lat_lim = lat_lim, + varfrac_explained = varfrac_explained, + numpcs = numpcs, flag_varfrac = flag_varfrac) + } return(result) } - # Atomic ensclus function -.ensclus <- function(var_ens, lat, lon, numclus = 4, lon_lim = NULL, lat_lim = NULL, varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE) { - #print('entro') - #print(lat) - # dim(var_ens) = n_ens, lon, lat!!!!! - if (length(lat) != dim(var_ens)[2]) { - print('INCORRECT LAT length') - exit() - } - - if (length(lon) != dim(var_ens)[3]) { - print('INCORRECT LON length') - exit() - } - - n_ens = dim(var_ens)[1] - - print("Calculating ensemble anomalies...") - ens_mean = apply(var_ens, c(2,3), mean) - var_anom = array(dim=dim(var_ens)) - for (k in seq(1,n_ens)) { - var_anom[k,,] = var_ens[k,,] - ens_mean - } - - gigi = aperm(var_anom, c(2,3,1)) # reshaping to give the right input to regimes function - clusters = .regimes(lon, lat, gigi, ncluster = numclus, ntime = 1000, neof = numpcs, lon_lim, lat_lim, flag_varfrac = flag_varfrac, perc = varfrac_explained, max_eofs = n_ens-1) - - clus_labels = as.array(clusters$cluster) - frequencies = as.array(clusters$frequencies) - - clus_centers = clusters$clus_centers - - closest_member = array(dim = numclus) - dist_closest_member = array(dim = numclus) - for (iclu in seq(1,numclus)) { - this_clus_labels = which(clus_labels == iclu) - #print(paste0('aaaaaaaaaaaaaaaaaaaaaaaaaaarghhhhh ',iclu, ' ', length(this_clus_labels))) - if (length(this_clus_labels) > 1) { - dist_arr = apply(clusters$pcs[clus_labels == iclu,], 1, .dist_from_center, center = clus_centers[iclu,]) - # print(clusters$pcs[clus_labels == iclu,1]) - # print(clus_centers[iclu,1]) - print(paste0("distance from cluster ",iclu," center:")) - print(this_clus_labels) - print(dist_arr) - closest_member[iclu] = this_clus_labels[which.min(dist_arr)] - dist_closest_member[iclu] = min(dist_arr) - print(paste0("closest member to cluster ", iclu, " center is: ", closest_member[iclu])) - } else { - # print(this_clus_labels) - # print(paste0('oeeeeeeeeeeeeeeeeeeeeeeeee ',iclu)) - # print(clus_labels) - print(paste0("distance from cluster ",iclu," center:")) - dista = .dist_from_center(clusters$pcs[clus_labels == iclu,], center = clus_centers[iclu,]) - print(this_clus_labels) - print(dista) - closest_member[iclu] = this_clus_labels - dist_closest_member[iclu] = dista - } +.ensclus <- function(var_ens, lat, lon, numclus = 4, lon_lim = NULL, + lat_lim = NULL, varfrac_explained = 80, numpcs = 4, + flag_varfrac = FALSE) { + if (length(lat) != dim(var_ens)[2]) { + print("Incorrect lat length") + exit() + } - } + if (length(lon) != dim(var_ens)[3]) { + print("Incorrect lon length") + exit() + } - print("EnsClus completed...") - out = list(labels = clus_labels, freq = frequencies, closest_member = closest_member, clus_repr_field = var_ens[closest_member,,], clus_composites = clusters$regimes) - # out = list(labels = clus_labels, freq = frequencies, closest_member = closest_member, repr_field = var_ens[,,closest_member], clus_centers = clus_centers, pcs = clusters$pcs) + n_ens <- dim(var_ens)[1] - return(out) -} + print("Calculating ensemble anomalies...") + ens_mean <- apply(var_ens, c(2, 3), mean) + var_anom <- array(dim = dim(var_ens)) + for (k in seq(1, n_ens)) { + var_anom[k, , ] <- var_ens[k, , ] - ens_mean + } + + # reshaping to give the right input to regimes function + varr <- aperm(var_anom, c(2, 3, 1)) + clusters <- .regimes(lon, lat, varr, ncluster = numclus, ntime = 1000, + neof = numpcs, lon_lim, lat_lim, + flag_varfrac = flag_varfrac, perc = varfrac_explained, + max_eofs = n_ens - 1) + clus_labels <- as.array(clusters$cluster) + frequencies <- as.array(clusters$frequencies) + + clus_centers <- clusters$clus_centers + + closest_member <- array(dim = numclus) + dist_closest_member <- array(dim = numclus) + for (iclu in seq(1, numclus)) { + this_clus_labels <- which(clus_labels == iclu) + if (length(this_clus_labels) > 1) { + dist_arr <- apply(clusters$pcs[clus_labels == iclu, ], 1, + .dist_from_center, center = clus_centers[iclu, ]) + print(paste0("distance from cluster ", iclu, " center:")) + print(this_clus_labels) + print(dist_arr) + closest_member[iclu] <- this_clus_labels[which.min(dist_arr)] + dist_closest_member[iclu] <- min(dist_arr) + print(paste0("closest member to cluster ", iclu, " center is: ", + closest_member[iclu])) + } else { + print(paste0("distance from cluster ", iclu, " center:")) + dista <- .dist_from_center(clusters$pcs[clus_labels == iclu, ], + center = clus_centers[iclu, ]) + print(this_clus_labels) + print(dista) + closest_member[iclu] <- this_clus_labels + dist_closest_member[iclu] <- dista + } + } + print("EnsClus completed...") + out <- list(labels = clus_labels, freq = frequencies, + closest_member = closest_member, + clus_repr_field = var_ens[closest_member, , ], + clus_composites = clusters$regimes) + return(out) +} .dist_from_center <- function(y, center = NULL) { - dist = sqrt(sum((y-center)^2)) + dist <- sqrt(sum((y - center)^2)) return(dist) } - .eofs <- function(lon, lat, field, neof = 4, xlim = NULL, ylim = NULL, - method = "SVD", do_standardize = F, do_regression = F, verbose = T) { - # R tool for computing EOFs based on Singular Value Decomposition ("SVD", default) + method = "SVD", do_standardize = F, do_regression = F, + verbose = T) { + # R tool for computing EOFs based on + # Singular Value Decomposition ("SVD", default) # or with the eigenvectors of the covariance matrix ("covariance", slower) # If requested, computes linear regressions and standardizes the PCs # If you want to use the regressions, remember to standardize the PCs @@ -196,6 +212,8 @@ EnsClus <- function(exp_time, lat, lon, time_moment = 'mean', numclus = 4, lon_l .printv("Area Weighting...", verbose) ww <- .area.weight(lon, lat, root = T) wwfield <- sweep(field, c(1, 2), ww, "*") + print(dim(field)) + print(dim(ww)) # selection of the xbox and ybox if defined if (!is.null(xlim)) { @@ -223,7 +241,8 @@ EnsClus <- function(exp_time, lat, lon, time_moment = 'mean', numclus = 4, lon_l .printv("Calling SVD...", verbose) SVD <- svd(new_box, nu = neof, nv = neof) - # extracting EOFs (loading pattern), expansions coefficient and variance explained + # extracting EOFs (loading pattern), expansions coefficient + # and variance explained pattern <- array(SVD$u, dim = c(dim(box)[1], dim(box)[2], neof)) coefficient <- SVD$v variance <- (SVD$d[1:neof])^2 / sum((SVD$d)^2) @@ -240,7 +259,8 @@ EnsClus <- function(exp_time, lat, lon, time_moment = 'mean', numclus = 4, lon_l covma <- cov(t(new_box)) eig <- eigen(covma) coef <- (t(new_box) %*% eig$vector)[, 1:neof] - pattern <- array(eig$vectors, dim = c(dim(box)[1], dim(box)[2], dim(box)[3]))[, , 1:neof] + pattern <- array(eig$vectors, dim = c(dim(box)[1], dim(box)[2], + dim(box)[3]))[, , 1:neof] variance <- eig$values[1:neof] / sum(eig$values) if (do_standardize) { coefficient <- apply(coef, c(2), standardize) @@ -254,21 +274,25 @@ EnsClus <- function(exp_time, lat, lon, time_moment = 'mean', numclus = 4, lon_l if (do_regression) { .printv("Linear Regressions (it can takes a while)... ", verbose) regression <- array(NA, dim = c(length(lon), length(lat), neof)) - # for (i in 1:neof) {regression[,,i]=apply(field,c(1,2),function(x) coef(lm(x ~ coefficient[,i]))[2])} for (i in 1:neof) { - regression[, , i] <- apply(field, c(1, 2), function(x) lin.fit(as.matrix(coefficient[, i], ncol = 1), x)$coefficients) + regression[, , i] <- apply(field, c(1, 2), + function(x) lin.fit(as.matrix(coefficient[, i], + ncol = 1), x)$coefficients) } } # preparing output .printv("Finalize...", verbose) pattern <- list(x = slon, y = slat, z = pattern) - out <- list(pattern = pattern, coeff = coefficient, variance = variance, regression = regression) + out <- list(pattern = pattern, coeff = coefficient, variance = variance, + regression = regression) return(out) } -.regimes <- function(lon, lat, field, ncluster = 4, ntime = 1000, neof = 10, xlim, ylim, alg = "Hartigan-Wong", flag_varfrac = FALSE, perc = NULL, max_eofs = 50) { +.regimes <- function(lon, lat, field, ncluster = 4, ntime = 1000, neof = 10, + xlim, ylim, alg = "Hartigan-Wong", flag_varfrac = FALSE, + perc = NULL, max_eofs = 50) { # R tool to compute cluster analysis based on k-means. # Requires "personal" function eofs # Take as input a 3D anomaly field @@ -278,49 +302,41 @@ EnsClus <- function(exp_time, lat, lon, time_moment = 'mean', numclus = 4, lon_l t0 <- proc.time() if (flag_varfrac) { if (is.null(perc)) { - print('flag_varfrac is TRUE but no perc as been specified') + print("flag_varfrac is TRUE but no perc as been specified") } - reducedspace <- .eofs(lon, lat, field, neof = max_eofs, xlim = xlim, ylim = ylim, method = "SVD", do_regression = F, do_standardize = F) - #print(dim(reducedspace$coeff)) - #print(cumsum(reducedspace$variance)) - neof = which(cumsum(reducedspace$variance) > perc/100.)[1] - print("Number of EOFs needed for var:") - print(neof) + reducedspace <- .eofs(lon, lat, field, neof = max_eofs, xlim = xlim, + ylim = ylim, method = "SVD", do_regression = F, + do_standardize = F) + neof <- which(cumsum(reducedspace$variance) > perc / 100.)[1] + print(paste("Number of EOFs needed for var:", neof)) PC <- reducedspace$coeff[,1:neof] - # print(dim(PC)) } else { - reducedspace <- .eofs(lon, lat, field, neof = neof, xlim = xlim, ylim = ylim, method = "SVD", do_regression = F, do_standardize = F) + reducedspace <- .eofs(lon, lat, field, neof = neof, xlim = xlim, + ylim = ylim, method = "SVD", do_regression = F, + do_standardize = F) # extract the principal components PC <- reducedspace$coeff - # print(dim(PC)) } t1 <- proc.time() - t0 - # print(t1) # k-means computation repeat for ntime to find best solution. print("Computing k-means...") t0 <- proc.time() - #print(str(ncluster)) - regimes <- kmeans(PC, as.numeric(ncluster), nstart = ntime, iter.max = 1000, algorithm = alg) + regimes <- kmeans(PC, as.numeric(ncluster), nstart = ntime, + iter.max = 1000, algorithm = alg) t1 <- proc.time() - t0 - # print(t1) # Extract regimes frequencyr and timeseries of occupation cluster <- regimes$cluster frequencies <- regimes$size / dim(field)[3] * 100 print("Cluster frequencies:") print(frequencies[order(frequencies, decreasing = T)]) - # print(regimes$tot.withinss) print("Creating Composites...") - #compose <- aperm(apply(field, c(1, 2), by, cluster, mean), c(2, 3, 1)) compose <- apply(field, c(1, 2), by, cluster, mean) - # print(frequencies) - # print(regimes$centers) # sorting from the more frequent to the less frequent kk <- order(frequencies, decreasing = T) - # print(kk) cluster <- cluster + 100 for (ss in 1:ncluster) { cluster[cluster == (ss + 100)] <- which(kk == ss) @@ -328,7 +344,10 @@ EnsClus <- function(exp_time, lat, lon, time_moment = 'mean', numclus = 4, lon_l # prepare output print("Finalize...") - out <- list(cluster = cluster, frequencies = frequencies[kk], regimes = compose[kk,,], pcs = PC, clus_centers = regimes$centers[kk,], tot.withinss = regimes$tot.withinss) + out <- list(cluster = cluster, frequencies = frequencies[kk], + regimes = compose[kk, , ], pcs = PC, + clus_centers = regimes$centers[kk, ], + tot.withinss = regimes$tot.withinss) return(out) } @@ -355,8 +374,8 @@ EnsClus <- function(exp_time, lat, lon, time_moment = 'mean', numclus = 4, lon_l return(etime) } - -# function for daily anomalies, use array predeclaration and rowMeans (40 times faster!) +# function for daily anomalies, use array predeclaration +# and rowMeans (40 times faster!) .daily.anom.mean <- function(ics, ipsilon, field, etime) { condition <- paste(etime$day, etime$month) daily <- array(NA, dim = c(length(ics), length(ipsilon), @@ -369,16 +388,16 @@ EnsClus <- function(exp_time, lat, lon, time_moment = 'mean', numclus = 4, lon_l "using climatological mean")) anom[, , which(t == condition)] <- rowMeans(field, dims = 2) } else { - daily[, , which(t == unique(condition))] <- rowMeans(field[, , t == condition], dims = 2) - anom[, , which(t == condition)] <- sweep(field[, , which(t == condition)], - c(1, 2), - daily[, , which(t == unique(condition))], "-") + daily[, , which(t == unique(condition))] <- + rowMeans(field[, , t == condition], dims = 2) + anom[, , which(t == condition)] <- + sweep(field[, , which(t == condition)], c(1, 2), + daily[, , which(t == unique(condition))], "-") } } return(anom) } - # produce a 2d matrix of area weights .area.weight <- function(ics, ipsilon, root = T) { field <- array(NA, dim = c(length(ics), length(ipsilon))) @@ -404,16 +423,15 @@ EnsClus <- function(exp_time, lat, lon, time_moment = 'mean', numclus = 4, lon_l } } - # detect ics ipsilon lat-lon .whicher <- function(axis, number) { out <- which.min(abs(axis - number)) return(out) } - # normalize a time series .standardize <- function(timeseries) { - out <- (timeseries - mean(timeseries, na.rm = T)) / sd(timeseries, na.rm = T) + out <- (timeseries - mean(timeseries, na.rm = T)) / + sd(timeseries, na.rm = T) return(out) } -- GitLab From ccc3a8252722b37323b56fb0473cc9ae03844062 Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Fri, 15 Nov 2019 00:11:06 +0100 Subject: [PATCH 06/33] dim order in calling regimes --- R/CST_EnsClus.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/R/CST_EnsClus.R b/R/CST_EnsClus.R index 8ab80e98..c7fc148e 100644 --- a/R/CST_EnsClus.R +++ b/R/CST_EnsClus.R @@ -149,8 +149,8 @@ EnsClus <- function(exp_time, lat, lon, time_moment = "mean", numclus = 4, } # reshaping to give the right input to regimes function - varr <- aperm(var_anom, c(2, 3, 1)) - clusters <- .regimes(lon, lat, varr, ncluster = numclus, ntime = 1000, + var_anom <- aperm(var_anom, c(3, 2, 1)) + clusters <- .regimes(lon, lat, var_anom, ncluster = numclus, ntime = 1000, neof = numpcs, lon_lim, lat_lim, flag_varfrac = flag_varfrac, perc = varfrac_explained, max_eofs = n_ens - 1) @@ -212,8 +212,6 @@ EnsClus <- function(exp_time, lat, lon, time_moment = "mean", numclus = 4, .printv("Area Weighting...", verbose) ww <- .area.weight(lon, lat, root = T) wwfield <- sweep(field, c(1, 2), ww, "*") - print(dim(field)) - print(dim(ww)) # selection of the xbox and ybox if defined if (!is.null(xlim)) { -- GitLab From 9ac6687a4cac0564094aa9346272c6c1fe5704ad Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Fri, 15 Nov 2019 00:38:02 +0100 Subject: [PATCH 07/33] accept s2dv object --- R/CST_EnsClus.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/CST_EnsClus.R b/R/CST_EnsClus.R index c7fc148e..c440f091 100644 --- a/R/CST_EnsClus.R +++ b/R/CST_EnsClus.R @@ -36,12 +36,12 @@ CST_EnsClus <- function(data, time_moment = "mean", numclus = 4, varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE, time_percentile = NULL, clustering_on = "member") { -# if (!inherits(data, "s2dv_cube")) { -# stop("Parameter 'data' must be of the class 's2dv_cube', ", -# "as output by CSTools::CST_Load.") -# } + if (!inherits(data, "s2dv_cube")) { + stop("Parameter 'data' must be of the class 's2dv_cube', ", + "as output by CSTools::CST_Load.") + } - result <- EnsClus(datalist$exp$data, datalist$exp$lat, datalist$exp$lon, + result <- EnsClus(data$data, data$lat, data$lon, time_moment = time_moment, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, numpcs = numpcs, @@ -88,7 +88,7 @@ EnsClus <- function(exp_time, lat, lon, time_moment = "mean", numclus = 4, numpcs = 4, flag_varfrac = FALSE, time_percentile = NULL, clustering_on = "member") { - print(dim(exp_time)) + print(str(exp_time)) # Apply time_moment if (time_moment == "mean") { -- GitLab From 3a5a8d338e1734bb8356bb13424c6c9ada135698 Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Sat, 16 Nov 2019 12:52:55 +0100 Subject: [PATCH 08/33] using Apply to get decently structured output --- R/CST_EnsClus.R | 83 +++++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/R/CST_EnsClus.R b/R/CST_EnsClus.R index c440f091..798fcb7e 100644 --- a/R/CST_EnsClus.R +++ b/R/CST_EnsClus.R @@ -9,7 +9,7 @@ #' #' @description This function performs a clustering on members/starting dates and returns a number of scenarios, with representative members for each of them. The clustering is performed in a reduced EOF space. #' -#' @param data An object of the class 's2dv_cube', containing the variables to be analysed. +#' @param exp An object of the class 's2dv_cube', containing the variables to be analysed. #' Each data object in the list is expected to have an element named \code{$data} with at least two #' spatial dimensions named "lon" and "lat", a dimension "ftime" and a dimension "sdate". #' @param time_moment Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). If 'perc' the keyword 'time_percentile' is also needed. @@ -20,34 +20,34 @@ #' @param varfrac_explained Variance to be explained by the set of eofs. Used only if flag_varfrac is True. #' @param numpcs Number of eofs retained in the analysis. #' @param flag_varfrac Selects the minimum number of eofs needed to reach the varfrac_explained. -#' @param clustering_on Decides whether to calculate the scenarios among members with same sdate ('member') or among different sdates ('sdate') but fixed member. +#' @param cluster_dim Decides whether to calculate the scenarios among members with same sdate ('member') or among different sdates ('sdate') but fixed member. #' @return A list with elements \code{$labels} (cluster assigned for each member), #' \code{$freq} (relative frequency of each cluster), \code{$closest_member} -#' (representative member for each cluster), \code{$clus_repr_field} (list of fields -#' for each representative member), \code{clus_composites} (list of mean fields for each cluster). +#' (representative member for each cluster), \code{$repr_field} (list of fields +#' for each representative member), \code{composites} (list of mean fields for each cluster). #' @export #' @examples #' \donttest{ #' exp <- 1 : (2 * 3 * 4 * 8 * 8) #' } -CST_EnsClus <- function(data, time_moment = "mean", numclus = 4, +CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, lon_lim = NULL, lat_lim = NULL, varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE, time_percentile = NULL, - clustering_on = "member") { - if (!inherits(data, "s2dv_cube")) { - stop("Parameter 'data' must be of the class 's2dv_cube', ", + cluster_dim = "member") { + if (!inherits(exp, "s2dv_cube")) { + stop("Parameter 'exp' must be of the class 's2dv_cube', ", "as output by CSTools::CST_Load.") } - result <- EnsClus(data$data, data$lat, data$lon, + result <- EnsClus(exp$data, exp$lat, exp$lon, time_moment = time_moment, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, numpcs = numpcs, flag_varfrac = flag_varfrac, time_percentile = time_percentile, - clustering_on = clustering_on) + cluster_dim = cluster_dim) return(result) } @@ -63,7 +63,7 @@ CST_EnsClus <- function(data, time_moment = "mean", numclus = 4, #' #' @description This function performs a clustering on members/starting dates and returns a number of scenarios, with representative members for each of them. The clustering is performed in a reduced EOF space. #' -#' @param exp_time A matrix of dimensions 'dataset member sdate ftime lat lon' containing the variables to be analysed. If dataset is a s2dv_cube, exp_time = dataset$exp$data +#' @param data A matrix of dimensions 'dataset member sdate ftime lat lon' containing the variables to be analysed. #' @param time_moment Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). If 'perc' the keyword 'time_percentile' is also needed. #' @param time_percentile Set the percentile in time you want to analyse. Active only if time_moment is 'perc'. #' @param numclus Number of clusters (scenarios) to be calculated. @@ -72,57 +72,50 @@ CST_EnsClus <- function(data, time_moment = "mean", numclus = 4, #' @param varfrac_explained Variance to be explained by the set of eofs. Used only if flag_varfrac is True. #' @param numpcs Number of eofs retained in the analysis. #' @param flag_varfrac Selects the minimum number of eofs needed to reach the varfrac_explained. -#' @param clustering_on Decides whether to calculate the scenarios among members with same sdate ('member') or among different sdates ('sdate') but fixed member. +#' @param cluster_dim Decides whether to calculate the scenarios among members with same sdate ('member') or among different sdates ('sdate') but fixed member. #' @return A list with elements \code{$labels} (cluster assigned for each member), #' \code{$freq} (relative frequency of each cluster), \code{$closest_member} -#' (representative member for each cluster), \code{$clus_repr_field} (list of fields -#' for each representative member), \code{clus_composites} (list of mean fields for each cluster). +#' (representative member for each cluster), \code{$repr_field} (list of fields +#' for each representative member), \code{composites} (list of mean fields for each cluster). #' @export #' @examples #' \donttest{ #' exp <- 1 : (2 * 3 * 4 * 8 * 8) #' } -EnsClus <- function(exp_time, lat, lon, time_moment = "mean", numclus = 4, +EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, lon_lim = NULL, lat_lim = NULL, varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE, time_percentile = NULL, - clustering_on = "member") { + cluster_dim = "member") { - print(str(exp_time)) + print(str(data)) # Apply time_moment if (time_moment == "mean") { print("Considering the time_moment: mean") - exp <- apply(exp_time, c(1, 2, 3, 5, 6), mean) + exp <- apply(data, c(1, 2, 3, 5, 6), mean) } if (time_moment == "sd") { print("Considering the time_moment: sd") - exp <- apply(exp_time, c(1, 2, 3, 5, 6), sd) + exp <- apply(data, c(1, 2, 3, 5, 6), sd) } if (time_moment == "perc") { print(paste0("Considering the time_moment: percentile ", sprintf("%5f", time_percentile))) - exp <- apply(exp_time, c(1, 2, 3, 5, 6), quantile, time_percentile / 100.) + exp <- apply(data, c(1, 2, 3, 5, 6), quantile, time_percentile / 100.) } print(dim(exp)) - # dataset member sdate ftime lat lon + # dataset member sdate lat lon # Repeatedly apply .ensclus - # It would be better to apply on the named dimensions here.. - if (clustering_on == "sdate") { - result <- apply(exp, c(1, 2), .ensclus, lat, lon, numclus = numclus, - lon_lim = lon_lim, lat_lim = lat_lim, - varfrac_explained = varfrac_explained, numpcs = numpcs, - flag_varfrac = flag_varfrac) -} else if (clustering_on == "member") { - result <- apply(exp, c(1, 3), .ensclus, lat, lon, numclus = numclus, - lon_lim = lon_lim, lat_lim = lat_lim, - varfrac_explained = varfrac_explained, - numpcs = numpcs, flag_varfrac = flag_varfrac) - } - return(result) + result <- Apply(exp, target_dims = c(cluster_dim, "lat", "lon"), .ensclus, + lat, lon, numclus = numclus, + lon_lim = lon_lim, lat_lim = lat_lim, + varfrac_explained = varfrac_explained, + numpcs = numpcs, flag_varfrac = flag_varfrac) + return(append(result, list(lat = lat, lon = lon))) } # Atomic ensclus function @@ -133,13 +126,13 @@ EnsClus <- function(exp_time, lat, lon, time_moment = "mean", numclus = 4, print("Incorrect lat length") exit() } - + if (length(lon) != dim(var_ens)[3]) { print("Incorrect lon length") exit() } - n_ens <- dim(var_ens)[1] + print(dim(var_ens)) print("Calculating ensemble anomalies...") ens_mean <- apply(var_ens, c(2, 3), mean) @@ -156,7 +149,9 @@ EnsClus <- function(exp_time, lat, lon, time_moment = "mean", numclus = 4, max_eofs = n_ens - 1) clus_labels <- as.array(clusters$cluster) + names(dim(clus_labels))[1] <- names(dim(var_ens))[1] frequencies <- as.array(clusters$frequencies) + names(dim(frequencies))[1] <- "cluster" clus_centers <- clusters$clus_centers @@ -185,10 +180,18 @@ EnsClus <- function(exp_time, lat, lon, time_moment = "mean", numclus = 4, } } print("EnsClus completed...") + + names(dim(closest_member))[1] <- "cluster" + repr_field <- var_ens[closest_member, , ] + names(dim(repr_field))[2:3] <- names(dim(var_ens))[2:3] + names(dim(repr_field))[1] <- "cluster" + # Bring back to original order lat lon + composites <- aperm(clusters$regimes, c(1, 3, 2)) + names(dim(composites))[2:3] <- names(dim(var_ens))[2:3] + names(dim(composites))[1] <- "cluster" out <- list(labels = clus_labels, freq = frequencies, - closest_member = closest_member, - clus_repr_field = var_ens[closest_member, , ], - clus_composites = clusters$regimes) + closest_member = closest_member, repr_field = repr_field, + composites = composites) return(out) } @@ -324,7 +327,7 @@ EnsClus <- function(exp_time, lat, lon, time_moment = "mean", numclus = 4, iter.max = 1000, algorithm = alg) t1 <- proc.time() - t0 - # Extract regimes frequencyr and timeseries of occupation + # Extract regimes frequency and timeseries of occupation cluster <- regimes$cluster frequencies <- regimes$size / dim(field)[3] * 100 print("Cluster frequencies:") -- GitLab From 93aa6dfb7f18db2b2bf4af85a121022765f5cdf8 Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Sat, 16 Nov 2019 13:10:35 +0100 Subject: [PATCH 09/33] option for verbose output --- R/CST_EnsClus.R | 122 +++++++++++++++--------------------------------- 1 file changed, 37 insertions(+), 85 deletions(-) diff --git a/R/CST_EnsClus.R b/R/CST_EnsClus.R index 798fcb7e..83584d0d 100644 --- a/R/CST_EnsClus.R +++ b/R/CST_EnsClus.R @@ -21,6 +21,7 @@ #' @param numpcs Number of eofs retained in the analysis. #' @param flag_varfrac Selects the minimum number of eofs needed to reach the varfrac_explained. #' @param cluster_dim Decides whether to calculate the scenarios among members with same sdate ('member') or among different sdates ('sdate') but fixed member. +#' @param Logical for verbose output #' @return A list with elements \code{$labels} (cluster assigned for each member), #' \code{$freq} (relative frequency of each cluster), \code{$closest_member} #' (representative member for each cluster), \code{$repr_field} (list of fields @@ -35,7 +36,7 @@ CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, lon_lim = NULL, lat_lim = NULL, varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE, time_percentile = NULL, - cluster_dim = "member") { + cluster_dim = "member", verbose = F) { if (!inherits(exp, "s2dv_cube")) { stop("Parameter 'exp' must be of the class 's2dv_cube', ", "as output by CSTools::CST_Load.") @@ -47,7 +48,7 @@ CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, varfrac_explained = varfrac_explained, numpcs = numpcs, flag_varfrac = flag_varfrac, time_percentile = time_percentile, - cluster_dim = cluster_dim) + cluster_dim = cluster_dim, verbose = verbose) return(result) } @@ -73,6 +74,7 @@ CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, #' @param numpcs Number of eofs retained in the analysis. #' @param flag_varfrac Selects the minimum number of eofs needed to reach the varfrac_explained. #' @param cluster_dim Decides whether to calculate the scenarios among members with same sdate ('member') or among different sdates ('sdate') but fixed member. +#' @param verbose Logical for verbose output #' @return A list with elements \code{$labels} (cluster assigned for each member), #' \code{$freq} (relative frequency of each cluster), \code{$closest_member} #' (representative member for each cluster), \code{$repr_field} (list of fields @@ -86,42 +88,39 @@ CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, lon_lim = NULL, lat_lim = NULL, varfrac_explained = 80, numpcs = 4, flag_varfrac = FALSE, time_percentile = NULL, - cluster_dim = "member") { - - print(str(data)) + cluster_dim = "member", verbose = T) { # Apply time_moment if (time_moment == "mean") { - print("Considering the time_moment: mean") + .printv("Considering the time_moment: mean", verbose) exp <- apply(data, c(1, 2, 3, 5, 6), mean) } if (time_moment == "sd") { - print("Considering the time_moment: sd") + .printv("Considering the time_moment: sd", verbose) exp <- apply(data, c(1, 2, 3, 5, 6), sd) } if (time_moment == "perc") { - print(paste0("Considering the time_moment: percentile ", - sprintf("%5f", time_percentile))) + .printv(paste0("Considering the time_moment: percentile ", + sprintf("%5f", time_percentile)), verbose) exp <- apply(data, c(1, 2, 3, 5, 6), quantile, time_percentile / 100.) } - - print(dim(exp)) - # dataset member sdate lat lon + # Repeatedly apply .ensclus result <- Apply(exp, target_dims = c(cluster_dim, "lat", "lon"), .ensclus, lat, lon, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, - numpcs = numpcs, flag_varfrac = flag_varfrac) + numpcs = numpcs, flag_varfrac = flag_varfrac, + verbose = verbose) return(append(result, list(lat = lat, lon = lon))) } # Atomic ensclus function .ensclus <- function(var_ens, lat, lon, numclus = 4, lon_lim = NULL, lat_lim = NULL, varfrac_explained = 80, numpcs = 4, - flag_varfrac = FALSE) { + flag_varfrac = FALSE, verbose = T) { if (length(lat) != dim(var_ens)[2]) { print("Incorrect lat length") exit() @@ -132,9 +131,8 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, exit() } n_ens <- dim(var_ens)[1] - print(dim(var_ens)) - print("Calculating ensemble anomalies...") + .printv("Calculating ensemble anomalies...", verbose) ens_mean <- apply(var_ens, c(2, 3), mean) var_anom <- array(dim = dim(var_ens)) for (k in seq(1, n_ens)) { @@ -146,7 +144,7 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, clusters <- .regimes(lon, lat, var_anom, ncluster = numclus, ntime = 1000, neof = numpcs, lon_lim, lat_lim, flag_varfrac = flag_varfrac, perc = varfrac_explained, - max_eofs = n_ens - 1) + max_eofs = n_ens - 1, verbose = verbose) clus_labels <- as.array(clusters$cluster) names(dim(clus_labels))[1] <- names(dim(var_ens))[1] @@ -162,24 +160,24 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, if (length(this_clus_labels) > 1) { dist_arr <- apply(clusters$pcs[clus_labels == iclu, ], 1, .dist_from_center, center = clus_centers[iclu, ]) - print(paste0("distance from cluster ", iclu, " center:")) - print(this_clus_labels) - print(dist_arr) + .printv(paste0("distance from cluster ", iclu, " center:"), verbose) + .printv(this_clus_labels, verbose) + .printv(dist_arr, verbose) closest_member[iclu] <- this_clus_labels[which.min(dist_arr)] dist_closest_member[iclu] <- min(dist_arr) - print(paste0("closest member to cluster ", iclu, " center is: ", - closest_member[iclu])) + .printv(paste0("closest member to cluster ", iclu, " center is: ", + closest_member[iclu]), verbose) } else { - print(paste0("distance from cluster ", iclu, " center:")) + .printv(paste0("distance from cluster ", iclu, " center:"), verbose) dista <- .dist_from_center(clusters$pcs[clus_labels == iclu, ], center = clus_centers[iclu, ]) - print(this_clus_labels) - print(dista) + .printv(this_clus_labels, verbose) + .printv(dist_arr, verbose) closest_member[iclu] <- this_clus_labels dist_closest_member[iclu] <- dista } } - print("EnsClus completed...") + .printv("EnsClus completed...", verbose) names(dim(closest_member))[1] <- "cluster" repr_field <- var_ens[closest_member, , ] @@ -283,7 +281,7 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, } # preparing output - .printv("Finalize...", verbose) + .printv("Finalize eofs...", verbose) pattern <- list(x = slon, y = slat, z = pattern) out <- list(pattern = pattern, coeff = coefficient, variance = variance, regression = regression) @@ -293,35 +291,36 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, .regimes <- function(lon, lat, field, ncluster = 4, ntime = 1000, neof = 10, xlim, ylim, alg = "Hartigan-Wong", flag_varfrac = FALSE, - perc = NULL, max_eofs = 50) { + perc = NULL, max_eofs = 50, verbose = T) { # R tool to compute cluster analysis based on k-means. # Requires "personal" function eofs # Take as input a 3D anomaly field # Reduce the phase space with EOFs: use SVD and do not standardize PCs - print("Launching EOFs...") + .printv("Launching EOFs...", verbose) t0 <- proc.time() if (flag_varfrac) { if (is.null(perc)) { - print("flag_varfrac is TRUE but no perc as been specified") + print("Warning: flag_varfrac is TRUE but no perc as been specified") + stop() } reducedspace <- .eofs(lon, lat, field, neof = max_eofs, xlim = xlim, ylim = ylim, method = "SVD", do_regression = F, - do_standardize = F) + do_standardize = F, verbose = verbose) neof <- which(cumsum(reducedspace$variance) > perc / 100.)[1] - print(paste("Number of EOFs needed for var:", neof)) + .printv(paste("Number of EOFs needed for var:", neof), verbose) PC <- reducedspace$coeff[,1:neof] } else { reducedspace <- .eofs(lon, lat, field, neof = neof, xlim = xlim, ylim = ylim, method = "SVD", do_regression = F, - do_standardize = F) + do_standardize = F, verbose = verbose) # extract the principal components PC <- reducedspace$coeff } t1 <- proc.time() - t0 # k-means computation repeat for ntime to find best solution. - print("Computing k-means...") + .printv("Computing k-means...", verbose) t0 <- proc.time() regimes <- kmeans(PC, as.numeric(ncluster), nstart = ntime, iter.max = 1000, algorithm = alg) @@ -330,10 +329,10 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, # Extract regimes frequency and timeseries of occupation cluster <- regimes$cluster frequencies <- regimes$size / dim(field)[3] * 100 - print("Cluster frequencies:") - print(frequencies[order(frequencies, decreasing = T)]) + .printv("Cluster frequencies:", verbose) + .printv(frequencies[order(frequencies, decreasing = T)], verbose) - print("Creating Composites...") + .printv("Creating Composites...", verbose) compose <- apply(field, c(1, 2), by, cluster, mean) # sorting from the more frequent to the less frequent @@ -344,7 +343,7 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, } # prepare output - print("Finalize...") + .printv("Finalize regimes...", verbose) out <- list(cluster = cluster, frequencies = frequencies[kk], regimes = compose[kk, , ], pcs = PC, clus_centers = regimes$centers[kk, ], @@ -352,53 +351,6 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, return(out) } - -# new function to create simple list with date values - Oct-18 -# it needs a date or PCICt object, and returns also the season subdivision -.power.date <- function(datas, verbose = FALSE) { - - # create a "season" for continuous time, used by persistance tracking - startpoints <- c(0, which(diff(datas) > 1)) - deltapoints <- diff(c(startpoints, length(datas))) - seas <- inverse.rle(list(lengths = deltapoints, - values = seq(1, length(startpoints)))) - - etime <- list( - day = as.numeric(format(datas, "%d")), - month = as.numeric(format(datas, "%m")), - year = as.numeric(format(datas, "%Y")), data = datas, season = seas - ) - - .printv("Time Array Built", verbose) - .printv(paste("Length:", length(seas)), verbose) - .printv(paste("From", datas[1], "to", datas[length(seas)]), verbose) - return(etime) -} - -# function for daily anomalies, use array predeclaration -# and rowMeans (40 times faster!) -.daily.anom.mean <- function(ics, ipsilon, field, etime) { - condition <- paste(etime$day, etime$month) - daily <- array(NA, dim = c(length(ics), length(ipsilon), - length(unique(condition)))) - anom <- field * NA - - for (t in unique(condition)) { - if (sum(t == condition) == 1) { - print(paste0("Cannot compute a mean with only one value: ", - "using climatological mean")) - anom[, , which(t == condition)] <- rowMeans(field, dims = 2) - } else { - daily[, , which(t == unique(condition))] <- - rowMeans(field[, , t == condition], dims = 2) - anom[, , which(t == condition)] <- - sweep(field[, , which(t == condition)], c(1, 2), - daily[, , which(t == unique(condition))], "-") - } - } - return(anom) -} - # produce a 2d matrix of area weights .area.weight <- function(ics, ipsilon, root = T) { field <- array(NA, dim = c(length(ics), length(ipsilon))) -- GitLab From 05a432297a5202e2513bf4c7205030e1ca73d50b Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Sat, 16 Nov 2019 13:25:18 +0100 Subject: [PATCH 10/33] rename cluster labels to cluster --- R/CST_EnsClus.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/CST_EnsClus.R b/R/CST_EnsClus.R index 83584d0d..76edf23d 100644 --- a/R/CST_EnsClus.R +++ b/R/CST_EnsClus.R @@ -22,7 +22,7 @@ #' @param flag_varfrac Selects the minimum number of eofs needed to reach the varfrac_explained. #' @param cluster_dim Decides whether to calculate the scenarios among members with same sdate ('member') or among different sdates ('sdate') but fixed member. #' @param Logical for verbose output -#' @return A list with elements \code{$labels} (cluster assigned for each member), +#' @return A list with elements \code{$cluster} (cluster assigned for each member), #' \code{$freq} (relative frequency of each cluster), \code{$closest_member} #' (representative member for each cluster), \code{$repr_field} (list of fields #' for each representative member), \code{composites} (list of mean fields for each cluster). @@ -75,7 +75,7 @@ CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, #' @param flag_varfrac Selects the minimum number of eofs needed to reach the varfrac_explained. #' @param cluster_dim Decides whether to calculate the scenarios among members with same sdate ('member') or among different sdates ('sdate') but fixed member. #' @param verbose Logical for verbose output -#' @return A list with elements \code{$labels} (cluster assigned for each member), +#' @return A list with elements \code{$cluster} (cluster assigned for each member), #' \code{$freq} (relative frequency of each cluster), \code{$closest_member} #' (representative member for each cluster), \code{$repr_field} (list of fields #' for each representative member), \code{composites} (list of mean fields for each cluster). @@ -187,7 +187,7 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, composites <- aperm(clusters$regimes, c(1, 3, 2)) names(dim(composites))[2:3] <- names(dim(var_ens))[2:3] names(dim(composites))[1] <- "cluster" - out <- list(labels = clus_labels, freq = frequencies, + out <- list(cluster = clus_labels, freq = frequencies, closest_member = closest_member, repr_field = repr_field, composites = composites) return(out) -- GitLab From ed50890796807b1290e47b5a22dd5ba0e7d0f8b1 Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Sat, 16 Nov 2019 14:19:03 +0100 Subject: [PATCH 11/33] simplified options and updated docs --- R/CST_EnsClus.R | 54 ++++++++++++++++++++----------------------------- 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/R/CST_EnsClus.R b/R/CST_EnsClus.R index 76edf23d..ab0fcbc2 100644 --- a/R/CST_EnsClus.R +++ b/R/CST_EnsClus.R @@ -11,16 +11,15 @@ #' #' @param exp An object of the class 's2dv_cube', containing the variables to be analysed. #' Each data object in the list is expected to have an element named \code{$data} with at least two -#' spatial dimensions named "lon" and "lat", a dimension "ftime" and a dimension "sdate". +#' spatial dimensions named "lon" and "lat", and dimensions "dataset", "member", "ftime", "sdate". #' @param time_moment Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). If 'perc' the keyword 'time_percentile' is also needed. -#' @param time_percentile Set the percentile in time you want to analyse. Active only if time_moment is 'perc'. +#' @param time_percentile Set the percentile in time you want to analyse (used for `time_moment = "perc"). #' @param numclus Number of clusters (scenarios) to be calculated. -#' @param lon_lim List with the two longitude margins. (-180,180) format. +#' @param lon_lim List with the two longitude margins in `c(-180,180)` format. #' @param lat_lim List with the two latitude margins. -#' @param varfrac_explained Variance to be explained by the set of eofs. Used only if flag_varfrac is True. -#' @param numpcs Number of eofs retained in the analysis. -#' @param flag_varfrac Selects the minimum number of eofs needed to reach the varfrac_explained. -#' @param cluster_dim Decides whether to calculate the scenarios among members with same sdate ('member') or among different sdates ('sdate') but fixed member. +#' @param varfrac_explained variance to be explained by the set of eofs. If left at NULL `numpcs` EOFs are kept. +#' @param numpcs Number of eofs retained in the analysis (not used if `varfrac_explained` is used) +#' @param cluster_dim Dimension along which to cluster. Typically "member" or "sdate". #' @param Logical for verbose output #' @return A list with elements \code{$cluster} (cluster assigned for each member), #' \code{$freq} (relative frequency of each cluster), \code{$closest_member} @@ -34,9 +33,9 @@ CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, lon_lim = NULL, lat_lim = NULL, - varfrac_explained = 80, numpcs = 4, - flag_varfrac = FALSE, time_percentile = NULL, - cluster_dim = "member", verbose = F) { + varfrac_explained = NULL, numpcs = 4, + time_percentile = 90, cluster_dim = "member", + verbose = F) { if (!inherits(exp, "s2dv_cube")) { stop("Parameter 'exp' must be of the class 's2dv_cube', ", "as output by CSTools::CST_Load.") @@ -46,7 +45,6 @@ CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, time_moment = time_moment, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, numpcs = numpcs, - flag_varfrac = flag_varfrac, time_percentile = time_percentile, cluster_dim = cluster_dim, verbose = verbose) @@ -66,15 +64,14 @@ CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, #' #' @param data A matrix of dimensions 'dataset member sdate ftime lat lon' containing the variables to be analysed. #' @param time_moment Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). If 'perc' the keyword 'time_percentile' is also needed. -#' @param time_percentile Set the percentile in time you want to analyse. Active only if time_moment is 'perc'. +#' @param time_percentile Set the percentile in time you want to analyse (used for `time_moment = "perc"). #' @param numclus Number of clusters (scenarios) to be calculated. -#' @param lon_lim List with the two longitude margins. (-180,180) format. +#' @param lon_lim List with the two longitude margins in `c(-180,180)` format. #' @param lat_lim List with the two latitude margins. -#' @param varfrac_explained Variance to be explained by the set of eofs. Used only if flag_varfrac is True. -#' @param numpcs Number of eofs retained in the analysis. -#' @param flag_varfrac Selects the minimum number of eofs needed to reach the varfrac_explained. -#' @param cluster_dim Decides whether to calculate the scenarios among members with same sdate ('member') or among different sdates ('sdate') but fixed member. -#' @param verbose Logical for verbose output +#' @param varfrac_explained variance to be explained by the set of eofs. If left at NULL `numpcs` EOFs are kept. +#' @param numpcs Number of eofs retained in the analysis (not used if `varfrac_explained` is used) +#' @param cluster_dim Dimension along which to cluster. Typically "member" or "sdate". +#' @param Logical for verbose output #' @return A list with elements \code{$cluster} (cluster assigned for each member), #' \code{$freq} (relative frequency of each cluster), \code{$closest_member} #' (representative member for each cluster), \code{$repr_field} (list of fields @@ -86,8 +83,8 @@ CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, #' } EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, - lon_lim = NULL, lat_lim = NULL, varfrac_explained = 80, - numpcs = 4, flag_varfrac = FALSE, time_percentile = NULL, + lon_lim = NULL, lat_lim = NULL, varfrac_explained = NULL, + numpcs = 4, time_percentile = 90, cluster_dim = "member", verbose = T) { # Apply time_moment @@ -112,15 +109,14 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, lat, lon, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, varfrac_explained = varfrac_explained, - numpcs = numpcs, flag_varfrac = flag_varfrac, - verbose = verbose) + numpcs = numpcs, verbose = verbose) return(append(result, list(lat = lat, lon = lon))) } # Atomic ensclus function .ensclus <- function(var_ens, lat, lon, numclus = 4, lon_lim = NULL, lat_lim = NULL, varfrac_explained = 80, numpcs = 4, - flag_varfrac = FALSE, verbose = T) { + verbose = T) { if (length(lat) != dim(var_ens)[2]) { print("Incorrect lat length") exit() @@ -142,8 +138,7 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, # reshaping to give the right input to regimes function var_anom <- aperm(var_anom, c(3, 2, 1)) clusters <- .regimes(lon, lat, var_anom, ncluster = numclus, ntime = 1000, - neof = numpcs, lon_lim, lat_lim, - flag_varfrac = flag_varfrac, perc = varfrac_explained, + neof = numpcs, lon_lim, lat_lim, perc = varfrac_explained, max_eofs = n_ens - 1, verbose = verbose) clus_labels <- as.array(clusters$cluster) @@ -290,7 +285,7 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, .regimes <- function(lon, lat, field, ncluster = 4, ntime = 1000, neof = 10, - xlim, ylim, alg = "Hartigan-Wong", flag_varfrac = FALSE, + xlim, ylim, alg = "Hartigan-Wong", perc = NULL, max_eofs = 50, verbose = T) { # R tool to compute cluster analysis based on k-means. # Requires "personal" function eofs @@ -299,11 +294,7 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, # Reduce the phase space with EOFs: use SVD and do not standardize PCs .printv("Launching EOFs...", verbose) t0 <- proc.time() - if (flag_varfrac) { - if (is.null(perc)) { - print("Warning: flag_varfrac is TRUE but no perc as been specified") - stop() - } + if (!is.null(perc)) { reducedspace <- .eofs(lon, lat, field, neof = max_eofs, xlim = xlim, ylim = ylim, method = "SVD", do_regression = F, do_standardize = F, verbose = verbose) @@ -314,7 +305,6 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, reducedspace <- .eofs(lon, lat, field, neof = neof, xlim = xlim, ylim = ylim, method = "SVD", do_regression = F, do_standardize = F, verbose = verbose) - # extract the principal components PC <- reducedspace$coeff } t1 <- proc.time() - t0 -- GitLab From 5ab2a39d8ce7a6ad39e4e750a277e97a268f280a Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Sat, 16 Nov 2019 14:28:21 +0100 Subject: [PATCH 12/33] better error messages --- R/CST_EnsClus.R | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/R/CST_EnsClus.R b/R/CST_EnsClus.R index ab0fcbc2..262488f6 100644 --- a/R/CST_EnsClus.R +++ b/R/CST_EnsClus.R @@ -91,19 +91,17 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, if (time_moment == "mean") { .printv("Considering the time_moment: mean", verbose) exp <- apply(data, c(1, 2, 3, 5, 6), mean) - } - - if (time_moment == "sd") { + } else if (time_moment == "sd") { .printv("Considering the time_moment: sd", verbose) exp <- apply(data, c(1, 2, 3, 5, 6), sd) - } - - if (time_moment == "perc") { + } else if (time_moment == "perc") { .printv(paste0("Considering the time_moment: percentile ", sprintf("%5f", time_percentile)), verbose) exp <- apply(data, c(1, 2, 3, 5, 6), quantile, time_percentile / 100.) + } else { + stop(paste0("Invalid time_moment '", time_moment, "' specified!")) } - + # Repeatedly apply .ensclus result <- Apply(exp, target_dims = c(cluster_dim, "lat", "lon"), .ensclus, lat, lon, numclus = numclus, @@ -118,13 +116,11 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, lat_lim = NULL, varfrac_explained = 80, numpcs = 4, verbose = T) { if (length(lat) != dim(var_ens)[2]) { - print("Incorrect lat length") - exit() + stop("Incorrect lat length") } - + if (length(lon) != dim(var_ens)[3]) { - print("Incorrect lon length") - exit() + stop("Incorrect lon length") } n_ens <- dim(var_ens)[1] @@ -138,7 +134,8 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, # reshaping to give the right input to regimes function var_anom <- aperm(var_anom, c(3, 2, 1)) clusters <- .regimes(lon, lat, var_anom, ncluster = numclus, ntime = 1000, - neof = numpcs, lon_lim, lat_lim, perc = varfrac_explained, + neof = numpcs, lon_lim, lat_lim, + perc = varfrac_explained, max_eofs = n_ens - 1, verbose = verbose) clus_labels <- as.array(clusters$cluster) -- GitLab From 97a6f86c5c5e52cd84d4d4845a9bbde330a9ae5c Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Sat, 16 Nov 2019 14:57:43 +0100 Subject: [PATCH 13/33] allow providing a list to cluster_dim --- R/CST_EnsClus.R | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/R/CST_EnsClus.R b/R/CST_EnsClus.R index 262488f6..7f9bc92d 100644 --- a/R/CST_EnsClus.R +++ b/R/CST_EnsClus.R @@ -19,7 +19,7 @@ #' @param lat_lim List with the two latitude margins. #' @param varfrac_explained variance to be explained by the set of eofs. If left at NULL `numpcs` EOFs are kept. #' @param numpcs Number of eofs retained in the analysis (not used if `varfrac_explained` is used) -#' @param cluster_dim Dimension along which to cluster. Typically "member" or "sdate". +#' @param cluster_dim Dimension along which to cluster. Typically "member" or "sdate". This can also be a list like c("member", "sdate"). #' @param Logical for verbose output #' @return A list with elements \code{$cluster} (cluster assigned for each member), #' \code{$freq} (relative frequency of each cluster), \code{$closest_member} @@ -70,7 +70,7 @@ CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, #' @param lat_lim List with the two latitude margins. #' @param varfrac_explained variance to be explained by the set of eofs. If left at NULL `numpcs` EOFs are kept. #' @param numpcs Number of eofs retained in the analysis (not used if `varfrac_explained` is used) -#' @param cluster_dim Dimension along which to cluster. Typically "member" or "sdate". +#' @param cluster_dim Dimension along which to cluster. Typically "member" or "sdate". This can also be a list like c("member", "sdate"). #' @param Logical for verbose output #' @return A list with elements \code{$cluster} (cluster assigned for each member), #' \code{$freq} (relative frequency of each cluster), \code{$closest_member} @@ -115,10 +115,16 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, .ensclus <- function(var_ens, lat, lon, numclus = 4, lon_lim = NULL, lat_lim = NULL, varfrac_explained = 80, numpcs = 4, verbose = T) { + # Check if more than one dimension has been passed for clustering + sampledims <- NULL + if(length(dim(var_ens)) > 3) { + sampledims <- head(dim(var_ens), -2) + dim(var_ens) <- c(samples = prod(sampledims), + tail(dim(var_ens), 2)) + } if (length(lat) != dim(var_ens)[2]) { stop("Incorrect lat length") } - if (length(lon) != dim(var_ens)[3]) { stop("Incorrect lon length") } @@ -140,6 +146,9 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, clus_labels <- as.array(clusters$cluster) names(dim(clus_labels))[1] <- names(dim(var_ens))[1] + if (!is.null(sampledims)) { + dim(clus_labels) <- c(sampledims, dim(clus_labels)[-1]) + } frequencies <- as.array(clusters$frequencies) names(dim(frequencies))[1] <- "cluster" -- GitLab From 5b0da9870427a80aec2a3711091684f2291756be Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Sun, 17 Nov 2019 01:57:06 +0100 Subject: [PATCH 14/33] added unit test --- tests/testthat/test-CST_EnsClus.R | 73 +++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 tests/testthat/test-CST_EnsClus.R diff --git a/tests/testthat/test-CST_EnsClus.R b/tests/testthat/test-CST_EnsClus.R new file mode 100644 index 00000000..ac6cdb19 --- /dev/null +++ b/tests/testthat/test-CST_EnsClus.R @@ -0,0 +1,73 @@ +context("Generic tests") +test_that("Sanity and Functionality tests", { + data <- rnorm(2 * 15 * 4 * 5 * 6 * 7) + dim(data) <- c(dataset = 2, member = 15, + sdate = 4, ftime = 5, lat = 6, lon = 7) + lon <- seq(0, 12, 2) + lat <- seq(10, 15, 1) + exp <- list(data = data, lat = lat, lon = lon) + + # Check error messages + expect_error( + CST_EnsClus(exp), + "Parameter 'exp' must be of the class 's2dv_cube'" + ) + attr(exp, "class") <- "s2dv_cube" + + expect_error( + CST_EnsClus(exp, time_moment = "invalid"), + "Invalid time_moment" + ) + + exp$lat <- 1 + expect_error(CST_EnsClus(exp), "Incorrect lat length") + exp$lon <- 1 + exp$lat <- lat + expect_error(CST_EnsClus(exp), "Incorrect lon length") + exp$lon <- lon + + # Sanity checks on dimensions + res <- CST_EnsClus(exp, numclus = 3) + expect_equivalent(dim(res$cluster), dim(exp$data)[c(2, 1, 3)]) + res <- CST_EnsClus(exp, numclus = 3, cluster_dim = "sdate") + expect_equivalent(dim(res$cluster), dim(exp$data)[c(3, 1, 2)]) + res <- CST_EnsClus(exp, numclus = 3, + cluster_dim = c("member", "dataset", "sdate")) + expect_equivalent(dim(res$cluster), dim(exp$data)[c(2, 1, 3)]) + res <- CST_EnsClus(exp, numclus = 3, cluster_dim = c("member", "sdate")) + expect_equivalent(dim(res$cluster), dim(exp$data)[c(2, 3, 1)]) + expect_equivalent(dim(res$freq), c(cluster = 3, dim(exp$data)[1])) + expect_equivalent(dim(res$closest_member), c(cluster = 3, dim(exp$data)[1])) + expect_equivalent(dim(res$repr_field), c(cluster = 3, + dim(exp$data)[c(5, 6)], dim(exp$data)[1])) + expect_equivalent(dim(res$composites), c(cluster = 3, + dim(exp$data)[c(5, 6)], dim(exp$data)[1])) + + # Functionality tests + res <- CST_EnsClus(exp, numclus = 3, varfrac_explained = 80, + cluster_dim = "member") + # The closest member of each cluster should be member of that cluster + for (i in 1:3) { + expect_equivalent(res$cluster[res$closest_member[i, 1, 1], 1, 1], i) + } + + res <- CST_EnsClus(exp, numclus = 3, numpcs = 8, + cluster_dim = c("member")) + for (i in 1:3) { + expect_equivalent(res$cluster[res$closest_member[i, 1, 1], 1, 1], i) + } + + res <- CST_EnsClus(exp, numclus = 3, varfrac_explained = 80, + cluster_dim = c("member")) + + # The weighted mean of all cluster mean fields should be 0 + comp1 <- sweep(res$composites, c(1, 4, 5), res$freq / 100, "*") + cmean <- apply(comp1, c(2, 3, 4, 5), mean)[ , , 1, 1] + expect_equivalent(mean(cmean), 0., tolerance = 1e-10) + + # This should give a different result + res2 <- CST_EnsClus(exp, numclus = 3, varfrac_explained = 80, + cluster_dim = c("member"), + lat_lim = c(10, 12), lon_lim = c(0, 6)) + expect_gt(sum((res$cluster - res2$cluster)^2), 0) +}) -- GitLab From f258956a1ecfdef785fb7ac4d82449ceedd1012b Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Sun, 17 Nov 2019 02:25:39 +0100 Subject: [PATCH 15/33] added examples --- R/CST_EnsClus.R | 43 +++++++++++++++++++++---------- tests/testthat/test-CST_EnsClus.R | 6 ++--- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/R/CST_EnsClus.R b/R/CST_EnsClus.R index 7f9bc92d..f4eef4e5 100644 --- a/R/CST_EnsClus.R +++ b/R/CST_EnsClus.R @@ -17,10 +17,10 @@ #' @param numclus Number of clusters (scenarios) to be calculated. #' @param lon_lim List with the two longitude margins in `c(-180,180)` format. #' @param lat_lim List with the two latitude margins. -#' @param varfrac_explained variance to be explained by the set of eofs. If left at NULL `numpcs` EOFs are kept. -#' @param numpcs Number of eofs retained in the analysis (not used if `varfrac_explained` is used) +#' @param variance_explained variance (percentage) to be explained by the set of eofs. If left at NULL `numpcs` EOFs are kept. +#' @param numpcs Number of eofs retained in the analysis (not used if `variance_explained` is used) #' @param cluster_dim Dimension along which to cluster. Typically "member" or "sdate". This can also be a list like c("member", "sdate"). -#' @param Logical for verbose output +#' @param verbose Logical for verbose output #' @return A list with elements \code{$cluster} (cluster assigned for each member), #' \code{$freq} (relative frequency of each cluster), \code{$closest_member} #' (representative member for each cluster), \code{$repr_field} (list of fields @@ -28,12 +28,26 @@ #' @export #' @examples #' \donttest{ -#' exp <- 1 : (2 * 3 * 4 * 8 * 8) +#' exp <- lonlat_data$exp +#' # Example 1: Cluster on all start dates, members and models +#' res <- CST_EnsClus(exp, numclus = 3, cluster_dim = c("member", "dataset", "sdate")) +#' iclus = res$cluster[2, 1, 3] +#' print(paste("Cluster of 2. member, 1. dataset, 3. sdate:", iclus)) +#' print(paste("Frequency (numerosity) of cluster (", iclus, ") :", res$freq[iclus])) +#' PlotEquiMap(res$repr_field[iclus, , ], exp$lon, exp$lat, +#' filled.continents = F, +#' toptitle=paste("Representative field of cluster", iclus)) +#' # Example 2: Cluster on members retaining 4 EOFs during preliminary dimensional reduction +#' res <- CST_EnsClus(exp, numclus = 3, numpcs = 4, cluster_dim = "member") +#' # Example 3: Cluster on members, retain 80% of variance during preliminary dimensional reduction +#' res <- CST_EnsClus(exp, numclus = 3, variance_explained = 80, cluster_dim = "member") +#' # Example 4: Compute percentile in time +#' res <- CST_EnsClus(exp, numclus = 3, time_percentile = 90, time_moment = "perc", cluster_dim = "member") #' } CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, lon_lim = NULL, lat_lim = NULL, - varfrac_explained = NULL, numpcs = 4, + variance_explained = NULL, numpcs = 4, time_percentile = 90, cluster_dim = "member", verbose = F) { if (!inherits(exp, "s2dv_cube")) { @@ -44,7 +58,7 @@ CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, result <- EnsClus(exp$data, exp$lat, exp$lon, time_moment = time_moment, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, - varfrac_explained = varfrac_explained, numpcs = numpcs, + variance_explained = variance_explained, numpcs = numpcs, time_percentile = time_percentile, cluster_dim = cluster_dim, verbose = verbose) @@ -68,10 +82,10 @@ CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, #' @param numclus Number of clusters (scenarios) to be calculated. #' @param lon_lim List with the two longitude margins in `c(-180,180)` format. #' @param lat_lim List with the two latitude margins. -#' @param varfrac_explained variance to be explained by the set of eofs. If left at NULL `numpcs` EOFs are kept. -#' @param numpcs Number of eofs retained in the analysis (not used if `varfrac_explained` is used) +#' @param variance_explained variance to be explained (in percentage) by the set of eofs. If left at NULL `numpcs` EOFs are kept. +#' @param numpcs Number of eofs retained in the analysis (not used if `variance_explained` is used) #' @param cluster_dim Dimension along which to cluster. Typically "member" or "sdate". This can also be a list like c("member", "sdate"). -#' @param Logical for verbose output +#' @param verbose Logical for verbose output #' @return A list with elements \code{$cluster} (cluster assigned for each member), #' \code{$freq} (relative frequency of each cluster), \code{$closest_member} #' (representative member for each cluster), \code{$repr_field} (list of fields @@ -79,11 +93,12 @@ CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, #' @export #' @examples #' \donttest{ -#' exp <- 1 : (2 * 3 * 4 * 8 * 8) +#' exp <- lonlat_data$exp +#' res <- EnsClus(exp$data, exp$lat, exp$lon, numclus = 3, cluster_dim = c("member", "dataset", "sdate")) #' } EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, - lon_lim = NULL, lat_lim = NULL, varfrac_explained = NULL, + lon_lim = NULL, lat_lim = NULL, variance_explained = NULL, numpcs = 4, time_percentile = 90, cluster_dim = "member", verbose = T) { @@ -106,14 +121,14 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, result <- Apply(exp, target_dims = c(cluster_dim, "lat", "lon"), .ensclus, lat, lon, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, - varfrac_explained = varfrac_explained, + variance_explained = variance_explained, numpcs = numpcs, verbose = verbose) return(append(result, list(lat = lat, lon = lon))) } # Atomic ensclus function .ensclus <- function(var_ens, lat, lon, numclus = 4, lon_lim = NULL, - lat_lim = NULL, varfrac_explained = 80, numpcs = 4, + lat_lim = NULL, variance_explained = 80, numpcs = 4, verbose = T) { # Check if more than one dimension has been passed for clustering sampledims <- NULL @@ -141,7 +156,7 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, var_anom <- aperm(var_anom, c(3, 2, 1)) clusters <- .regimes(lon, lat, var_anom, ncluster = numclus, ntime = 1000, neof = numpcs, lon_lim, lat_lim, - perc = varfrac_explained, + perc = variance_explained, max_eofs = n_ens - 1, verbose = verbose) clus_labels <- as.array(clusters$cluster) diff --git a/tests/testthat/test-CST_EnsClus.R b/tests/testthat/test-CST_EnsClus.R index ac6cdb19..a6438a86 100644 --- a/tests/testthat/test-CST_EnsClus.R +++ b/tests/testthat/test-CST_EnsClus.R @@ -44,7 +44,7 @@ test_that("Sanity and Functionality tests", { dim(exp$data)[c(5, 6)], dim(exp$data)[1])) # Functionality tests - res <- CST_EnsClus(exp, numclus = 3, varfrac_explained = 80, + res <- CST_EnsClus(exp, numclus = 3, variance_explained = 80, cluster_dim = "member") # The closest member of each cluster should be member of that cluster for (i in 1:3) { @@ -57,7 +57,7 @@ test_that("Sanity and Functionality tests", { expect_equivalent(res$cluster[res$closest_member[i, 1, 1], 1, 1], i) } - res <- CST_EnsClus(exp, numclus = 3, varfrac_explained = 80, + res <- CST_EnsClus(exp, numclus = 3, variance_explained = 80, cluster_dim = c("member")) # The weighted mean of all cluster mean fields should be 0 @@ -66,7 +66,7 @@ test_that("Sanity and Functionality tests", { expect_equivalent(mean(cmean), 0., tolerance = 1e-10) # This should give a different result - res2 <- CST_EnsClus(exp, numclus = 3, varfrac_explained = 80, + res2 <- CST_EnsClus(exp, numclus = 3, variance_explained = 80, cluster_dim = c("member"), lat_lim = c(10, 12), lon_lim = c(0, 6)) expect_gt(sum((res$cluster - res2$cluster)^2), 0) -- GitLab From 1fc4c48366068ea5309a29c0ac419721a4f3c209 Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Sun, 17 Nov 2019 02:34:13 +0100 Subject: [PATCH 16/33] tolerance in test --- tests/testthat/test-CST_EnsClus.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-CST_EnsClus.R b/tests/testthat/test-CST_EnsClus.R index a6438a86..f92f31d7 100644 --- a/tests/testthat/test-CST_EnsClus.R +++ b/tests/testthat/test-CST_EnsClus.R @@ -63,7 +63,7 @@ test_that("Sanity and Functionality tests", { # The weighted mean of all cluster mean fields should be 0 comp1 <- sweep(res$composites, c(1, 4, 5), res$freq / 100, "*") cmean <- apply(comp1, c(2, 3, 4, 5), mean)[ , , 1, 1] - expect_equivalent(mean(cmean), 0., tolerance = 1e-10) + expect_equal(mean(cmean), 0., tolerance = 1e-10) # This should give a different result res2 <- CST_EnsClus(exp, numclus = 3, variance_explained = 80, -- GitLab From 69e51c5f6587cf493bdb157647b01d6c1c56acba Mon Sep 17 00:00:00 2001 From: Federico Fabiano Date: Mon, 18 Nov 2019 14:55:53 +0100 Subject: [PATCH 17/33] Corrected bug in .eofs: longitude selection with first meridian crossing --- R/CST_EnsClus.R | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/R/CST_EnsClus.R b/R/CST_EnsClus.R index f4eef4e5..72c6128e 100644 --- a/R/CST_EnsClus.R +++ b/R/CST_EnsClus.R @@ -13,7 +13,7 @@ #' Each data object in the list is expected to have an element named \code{$data} with at least two #' spatial dimensions named "lon" and "lat", and dimensions "dataset", "member", "ftime", "sdate". #' @param time_moment Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). If 'perc' the keyword 'time_percentile' is also needed. -#' @param time_percentile Set the percentile in time you want to analyse (used for `time_moment = "perc"). +#' @param time_percentile Set the percentile in time you want to analyse (used for `time_moment = "perc"). #' @param numclus Number of clusters (scenarios) to be calculated. #' @param lon_lim List with the two longitude margins in `c(-180,180)` format. #' @param lat_lim List with the two latitude margins. @@ -78,7 +78,7 @@ CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, #' #' @param data A matrix of dimensions 'dataset member sdate ftime lat lon' containing the variables to be analysed. #' @param time_moment Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). If 'perc' the keyword 'time_percentile' is also needed. -#' @param time_percentile Set the percentile in time you want to analyse (used for `time_moment = "perc"). +#' @param time_percentile Set the percentile in time you want to analyse (used for `time_moment = "perc"). #' @param numclus Number of clusters (scenarios) to be calculated. #' @param lon_lim List with the two longitude margins in `c(-180,180)` format. #' @param lat_lim List with the two latitude margins. @@ -135,7 +135,7 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, if(length(dim(var_ens)) > 3) { sampledims <- head(dim(var_ens), -2) dim(var_ens) <- c(samples = prod(sampledims), - tail(dim(var_ens), 2)) + tail(dim(var_ens), 2)) } if (length(lat) != dim(var_ens)[2]) { stop("Incorrect lat length") @@ -244,9 +244,20 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, } # box - box <- wwfield[lonselect, latselect, ] - slon <- lon[lonselect] - slat <- lat[latselect] + ### IF THE BOX CROSSES THE ZERO LINE THIS IS NEEDED: + if (lonselect[2] < lonselect[1]) { + gigi = lon > xlim[1] | lon < xlim[2] + box <- wwfield[gigi, latselect, ] + slon <- lon[gigi] + slat <- lat[latselect] + } else { + box <- wwfield[lonselect, latselect, ] + slon <- lon[lonselect] + slat <- lat[latselect] + } + + print('Selected longitudes: ', slon) + print('Selected latitudes: ', slat) # transform 3D field in a matrix new_box <- array(box, dim = c(dim(box)[1] * dim(box)[2], dim(box)[3])) -- GitLab From 5901443e5fd0023a734f3d72c2b18bf93ebd287e Mon Sep 17 00:00:00 2001 From: Federico Fabiano Date: Mon, 18 Nov 2019 15:02:37 +0100 Subject: [PATCH 18/33] Corrected print slon, slat --- R/CST_EnsClus.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/CST_EnsClus.R b/R/CST_EnsClus.R index 72c6128e..1e177513 100644 --- a/R/CST_EnsClus.R +++ b/R/CST_EnsClus.R @@ -256,8 +256,10 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, slat <- lat[latselect] } - print('Selected longitudes: ', slon) - print('Selected latitudes: ', slat) + print('Selected longitudes: ') + print(slon) + print('Selected latitudes: ') + print(slat) # transform 3D field in a matrix new_box <- array(box, dim = c(dim(box)[1] * dim(box)[2], dim(box)[3])) -- GitLab From 69c166f7d3af1c12aa514e761c894d17e44b8029 Mon Sep 17 00:00:00 2001 From: Federico Fabiano Date: Mon, 18 Nov 2019 15:30:59 +0100 Subject: [PATCH 19/33] changed print to printv --- R/CST_EnsClus.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/CST_EnsClus.R b/R/CST_EnsClus.R index 1e177513..b1c3923f 100644 --- a/R/CST_EnsClus.R +++ b/R/CST_EnsClus.R @@ -256,10 +256,10 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, slat <- lat[latselect] } - print('Selected longitudes: ') - print(slon) - print('Selected latitudes: ') - print(slat) + .printv('Selected longitudes: ', verbose) + .printv(slon, verbose) + .printv('Selected latitudes: ', verbose) + .printv(slat, verbose) # transform 3D field in a matrix new_box <- array(box, dim = c(dim(box)[1] * dim(box)[2], dim(box)[3])) -- GitLab From 5dd737031b0210cde2ed9d5cf2ac6b032f9dc913 Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Mon, 18 Nov 2019 16:19:59 +0100 Subject: [PATCH 20/33] some delinting --- R/CST_EnsClus.R | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/R/CST_EnsClus.R b/R/CST_EnsClus.R index b1c3923f..9127ca42 100644 --- a/R/CST_EnsClus.R +++ b/R/CST_EnsClus.R @@ -243,12 +243,10 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, latselect <- 1:length(lat) } - # box - ### IF THE BOX CROSSES THE ZERO LINE THIS IS NEEDED: if (lonselect[2] < lonselect[1]) { - gigi = lon > xlim[1] | lon < xlim[2] - box <- wwfield[gigi, latselect, ] - slon <- lon[gigi] + isel <- lon > xlim[1] | lon < xlim[2] + box <- wwfield[isel, latselect, ] + slon <- lon[isel] slat <- lat[latselect] } else { box <- wwfield[lonselect, latselect, ] @@ -256,9 +254,9 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, slat <- lat[latselect] } - .printv('Selected longitudes: ', verbose) + .printv("Selected longitudes: ", verbose) .printv(slon, verbose) - .printv('Selected latitudes: ', verbose) + .printv("Selected latitudes: ", verbose) .printv(slat, verbose) # transform 3D field in a matrix -- GitLab From 5ff831697dd35b656f577c74b25b4bac8c2f8fe0 Mon Sep 17 00:00:00 2001 From: Jost von Hardenberg Date: Mon, 18 Nov 2019 17:12:05 +0100 Subject: [PATCH 21/33] fix limits in box selection --- R/CST_EnsClus.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/CST_EnsClus.R b/R/CST_EnsClus.R index 9127ca42..ffcc3760 100644 --- a/R/CST_EnsClus.R +++ b/R/CST_EnsClus.R @@ -244,7 +244,7 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, } if (lonselect[2] < lonselect[1]) { - isel <- lon > xlim[1] | lon < xlim[2] + isel <- lon >= xlim[1] | lon <= xlim[2] box <- wwfield[isel, latselect, ] slon <- lon[isel] slat <- lat[latselect] -- GitLab From cb5e5f62d8ddce5d19dacdb640f05c78ca2e1e5a Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Tue, 19 Nov 2019 14:59:48 +0100 Subject: [PATCH 22/33] renamed to EnsClustering (1) --- R/{CST_EnsClus.R => CST_EnsClustering.R} | 0 tests/testthat/{test-CST_EnsClus.R => test-CST_EnsClustering.R} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename R/{CST_EnsClus.R => CST_EnsClustering.R} (100%) rename tests/testthat/{test-CST_EnsClus.R => test-CST_EnsClustering.R} (100%) diff --git a/R/CST_EnsClus.R b/R/CST_EnsClustering.R similarity index 100% rename from R/CST_EnsClus.R rename to R/CST_EnsClustering.R diff --git a/tests/testthat/test-CST_EnsClus.R b/tests/testthat/test-CST_EnsClustering.R similarity index 100% rename from tests/testthat/test-CST_EnsClus.R rename to tests/testthat/test-CST_EnsClustering.R -- GitLab From 430ae9c709744c9197b60a9cb4c85754e27cd298 Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Tue, 19 Nov 2019 15:03:22 +0100 Subject: [PATCH 23/33] renamed to EnsClustering (2) --- R/CST_EnsClustering.R | 24 ++++++++++++------------ tests/testthat/test-CST_EnsClustering.R | 24 ++++++++++++------------ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/R/CST_EnsClustering.R b/R/CST_EnsClustering.R index ffcc3760..327bcd39 100644 --- a/R/CST_EnsClustering.R +++ b/R/CST_EnsClustering.R @@ -1,4 +1,4 @@ -#' @rdname CST_EnsClus +#' @rdname CST_EnsClustering #' @title Ensemble clustering #' #' @author Federico Fabiano - ISAC-CNR, \email{f.fabiano@isac.cnr.it} @@ -30,7 +30,7 @@ #' \donttest{ #' exp <- lonlat_data$exp #' # Example 1: Cluster on all start dates, members and models -#' res <- CST_EnsClus(exp, numclus = 3, cluster_dim = c("member", "dataset", "sdate")) +#' res <- CST_EnsClustering(exp, numclus = 3, cluster_dim = c("member", "dataset", "sdate")) #' iclus = res$cluster[2, 1, 3] #' print(paste("Cluster of 2. member, 1. dataset, 3. sdate:", iclus)) #' print(paste("Frequency (numerosity) of cluster (", iclus, ") :", res$freq[iclus])) @@ -38,14 +38,14 @@ #' filled.continents = F, #' toptitle=paste("Representative field of cluster", iclus)) #' # Example 2: Cluster on members retaining 4 EOFs during preliminary dimensional reduction -#' res <- CST_EnsClus(exp, numclus = 3, numpcs = 4, cluster_dim = "member") +#' res <- CST_EnsClustering(exp, numclus = 3, numpcs = 4, cluster_dim = "member") #' # Example 3: Cluster on members, retain 80% of variance during preliminary dimensional reduction -#' res <- CST_EnsClus(exp, numclus = 3, variance_explained = 80, cluster_dim = "member") +#' res <- CST_EnsClustering(exp, numclus = 3, variance_explained = 80, cluster_dim = "member") #' # Example 4: Compute percentile in time -#' res <- CST_EnsClus(exp, numclus = 3, time_percentile = 90, time_moment = "perc", cluster_dim = "member") +#' res <- CST_EnsClustering(exp, numclus = 3, time_percentile = 90, time_moment = "perc", cluster_dim = "member") #' } -CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, +CST_EnsClustering <- function(exp, time_moment = "mean", numclus = 4, lon_lim = NULL, lat_lim = NULL, variance_explained = NULL, numpcs = 4, time_percentile = 90, cluster_dim = "member", @@ -55,7 +55,7 @@ CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, "as output by CSTools::CST_Load.") } - result <- EnsClus(exp$data, exp$lat, exp$lon, + result <- EnsClustering(exp$data, exp$lat, exp$lon, time_moment = time_moment, numclus = numclus, lon_lim = lon_lim, lat_lim = lat_lim, variance_explained = variance_explained, numpcs = numpcs, @@ -65,7 +65,7 @@ CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, return(result) } -#' @rdname EnsClus +#' @rdname EnsClustering #' @title Ensemble clustering #' #' @author Federico Fabiano - ISAC-CNR, \email{f.fabiano@isac.cnr.it} @@ -94,10 +94,10 @@ CST_EnsClus <- function(exp, time_moment = "mean", numclus = 4, #' @examples #' \donttest{ #' exp <- lonlat_data$exp -#' res <- EnsClus(exp$data, exp$lat, exp$lon, numclus = 3, cluster_dim = c("member", "dataset", "sdate")) +#' res <- EnsClustering(exp$data, exp$lat, exp$lon, numclus = 3, cluster_dim = c("member", "dataset", "sdate")) #' } -EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, +EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = 4, lon_lim = NULL, lat_lim = NULL, variance_explained = NULL, numpcs = 4, time_percentile = 90, cluster_dim = "member", verbose = T) { @@ -193,7 +193,7 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, dist_closest_member[iclu] <- dista } } - .printv("EnsClus completed...", verbose) + .printv("EnsClustering completed...", verbose) names(dim(closest_member))[1] <- "cluster" repr_field <- var_ens[closest_member, , ] @@ -244,7 +244,7 @@ EnsClus <- function(data, lat, lon, time_moment = "mean", numclus = 4, } if (lonselect[2] < lonselect[1]) { - isel <- lon >= xlim[1] | lon <= xlim[2] + isel <- lon > xlim[1] | lon < xlim[2] box <- wwfield[isel, latselect, ] slon <- lon[isel] slat <- lat[latselect] diff --git a/tests/testthat/test-CST_EnsClustering.R b/tests/testthat/test-CST_EnsClustering.R index f92f31d7..8c507db3 100644 --- a/tests/testthat/test-CST_EnsClustering.R +++ b/tests/testthat/test-CST_EnsClustering.R @@ -9,32 +9,32 @@ test_that("Sanity and Functionality tests", { # Check error messages expect_error( - CST_EnsClus(exp), + CST_EnsClustering(exp), "Parameter 'exp' must be of the class 's2dv_cube'" ) attr(exp, "class") <- "s2dv_cube" expect_error( - CST_EnsClus(exp, time_moment = "invalid"), + CST_EnsClustering(exp, time_moment = "invalid"), "Invalid time_moment" ) exp$lat <- 1 - expect_error(CST_EnsClus(exp), "Incorrect lat length") + expect_error(CST_EnsClustering(exp), "Incorrect lat length") exp$lon <- 1 exp$lat <- lat - expect_error(CST_EnsClus(exp), "Incorrect lon length") + expect_error(CST_EnsClustering(exp), "Incorrect lon length") exp$lon <- lon # Sanity checks on dimensions - res <- CST_EnsClus(exp, numclus = 3) + res <- CST_EnsClustering(exp, numclus = 3) expect_equivalent(dim(res$cluster), dim(exp$data)[c(2, 1, 3)]) - res <- CST_EnsClus(exp, numclus = 3, cluster_dim = "sdate") + res <- CST_EnsClustering(exp, numclus = 3, cluster_dim = "sdate") expect_equivalent(dim(res$cluster), dim(exp$data)[c(3, 1, 2)]) - res <- CST_EnsClus(exp, numclus = 3, + res <- CST_EnsClustering(exp, numclus = 3, cluster_dim = c("member", "dataset", "sdate")) expect_equivalent(dim(res$cluster), dim(exp$data)[c(2, 1, 3)]) - res <- CST_EnsClus(exp, numclus = 3, cluster_dim = c("member", "sdate")) + res <- CST_EnsClustering(exp, numclus = 3, cluster_dim = c("member", "sdate")) expect_equivalent(dim(res$cluster), dim(exp$data)[c(2, 3, 1)]) expect_equivalent(dim(res$freq), c(cluster = 3, dim(exp$data)[1])) expect_equivalent(dim(res$closest_member), c(cluster = 3, dim(exp$data)[1])) @@ -44,20 +44,20 @@ test_that("Sanity and Functionality tests", { dim(exp$data)[c(5, 6)], dim(exp$data)[1])) # Functionality tests - res <- CST_EnsClus(exp, numclus = 3, variance_explained = 80, + res <- CST_EnsClustering(exp, numclus = 3, variance_explained = 80, cluster_dim = "member") # The closest member of each cluster should be member of that cluster for (i in 1:3) { expect_equivalent(res$cluster[res$closest_member[i, 1, 1], 1, 1], i) } - res <- CST_EnsClus(exp, numclus = 3, numpcs = 8, + res <- CST_EnsClustering(exp, numclus = 3, numpcs = 8, cluster_dim = c("member")) for (i in 1:3) { expect_equivalent(res$cluster[res$closest_member[i, 1, 1], 1, 1], i) } - res <- CST_EnsClus(exp, numclus = 3, variance_explained = 80, + res <- CST_EnsClustering(exp, numclus = 3, variance_explained = 80, cluster_dim = c("member")) # The weighted mean of all cluster mean fields should be 0 @@ -66,7 +66,7 @@ test_that("Sanity and Functionality tests", { expect_equal(mean(cmean), 0., tolerance = 1e-10) # This should give a different result - res2 <- CST_EnsClus(exp, numclus = 3, variance_explained = 80, + res2 <- CST_EnsClustering(exp, numclus = 3, variance_explained = 80, cluster_dim = c("member"), lat_lim = c(10, 12), lon_lim = c(0, 6)) expect_gt(sum((res$cluster - res2$cluster)^2), 0) -- GitLab From edbfcb1642a3fbbe489c29bf9709385a5683a808 Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Tue, 19 Nov 2019 16:25:26 +0100 Subject: [PATCH 24/33] new box selection --- R/CST_EnsClustering.R | 53 +++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/R/CST_EnsClustering.R b/R/CST_EnsClustering.R index 327bcd39..a00f9fb2 100644 --- a/R/CST_EnsClustering.R +++ b/R/CST_EnsClustering.R @@ -232,40 +232,43 @@ EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = 4, # selection of the xbox and ybox if defined if (!is.null(xlim)) { - lonselect <- .whicher(lon, xlim[1]):.whicher(lon, xlim[2]) + # This transforms c(-20, -10) to c(340, 350) but c(-20, 10) is unchanged + # Bring them all to the same units in the 0:360 range + xlim1 <- xlim[1] %% 360 + xlim2 <- xlim[2] %% 360 + if (xlim1 > xlim2) { + # If box crosses 0 first limit is negative + xlim1 <- xlim1 - 360 + } + lonm <- lon %% 360 + # If it crosses 0 leave unchanged + if (lonm[1] > tail(lonm, 1)) { + lonm <- lon + } + ilonsel <- (lonm >= xlim1) & (lonm <= xlim2) + if (!any(ilonsel)) { + stop("No intersection between longitude bounds and data domain.") + } } else { - lonselect <- 1:length(lon) + ilonsel <- 1:length(lon) } - if (!is.null(ylim)) { - latselect <- .whicher(lat, ylim[1]):.whicher(lat, ylim[2]) + ilatsel <- (lat >= ylim[1]) & (lat <= ylim[2]) } else { - latselect <- 1:length(lat) + ilatsel <- 1:length(lat) } - - if (lonselect[2] < lonselect[1]) { - isel <- lon > xlim[1] | lon < xlim[2] - box <- wwfield[isel, latselect, ] - slon <- lon[isel] - slat <- lat[latselect] - } else { - box <- wwfield[lonselect, latselect, ] - slon <- lon[lonselect] - slat <- lat[latselect] - } - - .printv("Selected longitudes: ", verbose) - .printv(slon, verbose) - .printv("Selected latitudes: ", verbose) - .printv(slat, verbose) + slon <- lon[ilonsel] + slat <- lat[ilatsel] + wwfield <- wwfield[ilonsel, ilatsel, ] # transform 3D field in a matrix - new_box <- array(box, dim = c(dim(box)[1] * dim(box)[2], dim(box)[3])) + wwfield <- array(wwfield, dim = c(dim(wwfield)[1] * dim(wwfield)[2], + dim(wwfield)[3])) # calling SVD if (method == "SVD") { .printv("Calling SVD...", verbose) - SVD <- svd(new_box, nu = neof, nv = neof) + SVD <- svd(wwfield, nu = neof, nv = neof) # extracting EOFs (loading pattern), expansions coefficient # and variance explained @@ -282,9 +285,9 @@ EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = 4, # calling covariance matrix if (method == "covariance") { .printv("Calling eigenvectors of the covariance matrix...", verbose) - covma <- cov(t(new_box)) + covma <- cov(t(wwfield)) eig <- eigen(covma) - coef <- (t(new_box) %*% eig$vector)[, 1:neof] + coef <- (t(wwfield) %*% eig$vector)[, 1:neof] pattern <- array(eig$vectors, dim = c(dim(box)[1], dim(box)[2], dim(box)[3]))[, , 1:neof] variance <- eig$values[1:neof] / sum(eig$values) -- GitLab From b81d98bed73988b7992423b5f6853fdaef4c92bb Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Tue, 19 Nov 2019 18:16:32 +0100 Subject: [PATCH 25/33] the final box fix --- R/CST_EnsClustering.R | 64 +++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/R/CST_EnsClustering.R b/R/CST_EnsClustering.R index a00f9fb2..a896f7f3 100644 --- a/R/CST_EnsClustering.R +++ b/R/CST_EnsClustering.R @@ -230,36 +230,10 @@ EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = 4, ww <- .area.weight(lon, lat, root = T) wwfield <- sweep(field, c(1, 2), ww, "*") - # selection of the xbox and ybox if defined - if (!is.null(xlim)) { - # This transforms c(-20, -10) to c(340, 350) but c(-20, 10) is unchanged - # Bring them all to the same units in the 0:360 range - xlim1 <- xlim[1] %% 360 - xlim2 <- xlim[2] %% 360 - if (xlim1 > xlim2) { - # If box crosses 0 first limit is negative - xlim1 <- xlim1 - 360 - } - lonm <- lon %% 360 - # If it crosses 0 leave unchanged - if (lonm[1] > tail(lonm, 1)) { - lonm <- lon - } - ilonsel <- (lonm >= xlim1) & (lonm <= xlim2) - if (!any(ilonsel)) { - stop("No intersection between longitude bounds and data domain.") - } - } else { - ilonsel <- 1:length(lon) - } - if (!is.null(ylim)) { - ilatsel <- (lat >= ylim[1]) & (lat <= ylim[2]) - } else { - ilatsel <- 1:length(lat) - } - slon <- lon[ilonsel] - slat <- lat[ilatsel] - wwfield <- wwfield[ilonsel, ilatsel, ] + idx <- .selbox(lon, lat, xlim, ylim) + slon <- lon[idx$ilon] + slat <- lat[idx$ilat] + wwfield <- wwfield[idx$ilon, idx$ilat, ] # transform 3D field in a matrix wwfield <- array(wwfield, dim = c(dim(wwfield)[1] * dim(wwfield)[2], @@ -413,3 +387,33 @@ EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = 4, sd(timeseries, na.rm = T) return(out) } + +.selbox <- function(lon, lat, xlim = NULL, ylim = NULL) { + if (!is.null(xlim)) { + # This transforms c(-20, -10) to c(340, 350) but c(-20, 10) is unchanged + # Bring them all to the same units in the 0:360 range + xlim1 <- xlim[1] %% 360 + xlim2 <- xlim[2] %% 360 + lonm <- lon %% 360 + if (lonm[1] > tail(lonm, 1)) { + lonm <- lon + } + if (xlim1 > xlim2) { + # If box crosses 0 + ilonsel <- (lonm >= xlim1) | (lonm <= xlim2) + } else { + ilonsel <- (lonm >= xlim1) & (lonm <= xlim2) + } + if (!any(ilonsel)) { + stop("No intersection between longitude bounds and data domain.") + } + } else { + ilonsel <- 1:length(lon) + } + if (!is.null(ylim)) { + ilatsel <- (lat >= ylim[1]) & (lat <= ylim[2]) + } else { + ilatsel <- 1:length(lat) + } + return(list(ilon = ilonsel, ilat = ilatsel)) +} -- GitLab From 85090db1e0daf791a03cd4d640403df1b3aecbf2 Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Tue, 19 Nov 2019 18:43:43 +0100 Subject: [PATCH 26/33] small cleanup --- R/CST_EnsClustering.R | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/R/CST_EnsClustering.R b/R/CST_EnsClustering.R index a896f7f3..3110bb2b 100644 --- a/R/CST_EnsClustering.R +++ b/R/CST_EnsClustering.R @@ -223,7 +223,7 @@ EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = 4, # If requested, computes linear regressions and standardizes the PCs # If you want to use the regressions, remember to standardize the PCs # Take as input a 3D anomaly field. - # Requires "personal" functions area.weight, whicher and standardize + # Requires "personal" functions area.weight, standardize # area weighting, based on the root of cosine .printv("Area Weighting...", verbose) @@ -250,7 +250,7 @@ EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = 4, coefficient <- SVD$v variance <- (SVD$d[1:neof])^2 / sum((SVD$d)^2) if (do_standardize) { - coefficient <- apply(coefficient, c(2), standardize) + coefficient <- apply(coefficient, c(2), .standardize) } else { coefficient <- sweep(coefficient, c(2), sqrt(variance), "*") } @@ -375,12 +375,6 @@ EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = 4, } } -# detect ics ipsilon lat-lon -.whicher <- function(axis, number) { - out <- which.min(abs(axis - number)) - return(out) -} - # normalize a time series .standardize <- function(timeseries) { out <- (timeseries - mean(timeseries, na.rm = T)) / -- GitLab From ddd52672a0ba1662c0df79ebf72b04874604aa4f Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Thu, 21 Nov 2019 13:18:01 +0100 Subject: [PATCH 27/33] extended documentation --- R/CST_EnsClustering.R | 52 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/R/CST_EnsClustering.R b/R/CST_EnsClustering.R index 3110bb2b..80821055 100644 --- a/R/CST_EnsClustering.R +++ b/R/CST_EnsClustering.R @@ -7,7 +7,45 @@ #' @author Paolo Davini - ISAC-CNR, \email{p.davini@isac.cnr.it} #' @author Jost von Hardenberg - ISAC-CNR, \email{j.vonhardenberg@isac.cnr.it} #' -#' @description This function performs a clustering on members/starting dates and returns a number of scenarios, with representative members for each of them. The clustering is performed in a reduced EOF space. +#' @description This function performs a clustering on members/starting dates +#' and returns a number of scenarios, with representative members for each of them. +#' The clustering is performed in a reduced EOF space. +#' +#' Motivation: +#' Ensemble forecasts give a probabilistic insight of average weather conditions +#' on extended timescales, i.e. from sub-seasonal to seasonal and beyond. +#' With large ensembles, it is often an advantage to be able to group members +#' according to similar characteristics and to select the most representative member for each cluster. +#' This can be useful to characterize the most probable forecast scenarios in a multi-model +#' (or single model) ensemble prediction. This approach, applied at a regional level, +#' can also be used to identify the subset of ensemble members that best represent the +#' full range of possible solutions for downscaling applications. +#' The choice of the ensemble members is made flexible in order to meet the requirements +#' of specific (regional) climate information products, to be tailored for different regions and user needs. +#' +#' Description of the tool: +#' EnsClustering is a cluster analysis tool, based on the k-means algorithm, for ensemble predictions. +#' The aim is to group ensemble members according to similar characteristics and +#' to select the most representative member for each cluster. +#' The user chooses which feature of the data is used to group the ensemble members by clustering: +#' time mean, maximum, a certain percentile (e.g., 75% as in the examples below), +#' standard deviation and trend over the time period. For each ensemble member this value +#' is computed at each grid point, obtaining N lat-lon maps, where N is the number of ensemble members. +#' The anomaly is computed subtracting the ensemble mean of these maps to each of the single maps. +#' The anomaly is therefore computed with respect to the ensemble members (and not with respect to the time) +#' and the Empirical Orthogonal Function (EOF) analysis is applied to these anomaly maps. +#' Regarding the EOF analysis, the user can choose either how many Principal Components (PCs) +#' to retain or the percentage of explained variance to keep. After reducing dimensionality via +#' EOF analysis, k-means analysis is applied using the desired subset of PCs. +#' +#' The major final outputs are the classification in clusters, i.e. which member belongs +#' to which cluster (in k-means analysis the number k of clusters needs to be defined +#' prior to the analysis) and the most representative member for each cluster, +#' which is the closest member to the cluster centroid. +#' Other outputs refer to the statistics of clustering: in the PC space, the minimum and +#' the maximum distance between a member in a cluster and the cluster centroid +#' (i.e. the closest and the furthest member), the intra-cluster standard +#' deviation for each cluster (i.e. how much the cluster is compact). #' #' @param exp An object of the class 's2dv_cube', containing the variables to be analysed. #' Each data object in the list is expected to have an element named \code{$data} with at least two @@ -24,7 +62,9 @@ #' @return A list with elements \code{$cluster} (cluster assigned for each member), #' \code{$freq} (relative frequency of each cluster), \code{$closest_member} #' (representative member for each cluster), \code{$repr_field} (list of fields -#' for each representative member), \code{composites} (list of mean fields for each cluster). +#' for each representative member), \code{composites} (list of mean fields for each cluster), +#' \code{$lon} (selected longitudes of output fields), +#' \code{$lat} (selected longitudes of output fields). #' @export #' @examples #' \donttest{ @@ -74,7 +114,9 @@ CST_EnsClustering <- function(exp, time_moment = "mean", numclus = 4, #' @author Paolo Davini - ISAC-CNR, \email{p.davini@isac.cnr.it} #' @author Jost von Hardenberg - ISAC-CNR, \email{j.vonhardenberg@isac.cnr.it} #' -#' @description This function performs a clustering on members/starting dates and returns a number of scenarios, with representative members for each of them. The clustering is performed in a reduced EOF space. +#' @description This function performs a clustering on members/starting dates +#' and returns a number of scenarios, with representative members for each of them. +#' The clustering is performed in a reduced EOF space. #' #' @param data A matrix of dimensions 'dataset member sdate ftime lat lon' containing the variables to be analysed. #' @param time_moment Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). If 'perc' the keyword 'time_percentile' is also needed. @@ -89,7 +131,9 @@ CST_EnsClustering <- function(exp, time_moment = "mean", numclus = 4, #' @return A list with elements \code{$cluster} (cluster assigned for each member), #' \code{$freq} (relative frequency of each cluster), \code{$closest_member} #' (representative member for each cluster), \code{$repr_field} (list of fields -#' for each representative member), \code{composites} (list of mean fields for each cluster). +#' for each representative member), \code{composites} (list of mean fields for each cluster), +#' \code($lon) (selected longitudes of output fields), +#' \code($lat) (selected longitudes of output fields). #' @export #' @examples #' \donttest{ -- GitLab From 6ec546074095d6a5af9e4ed378ad8ed6e9bf6af5 Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Thu, 21 Nov 2019 23:12:04 +0100 Subject: [PATCH 28/33] better defaults for variance_explained and numpcs --- R/CST_EnsClustering.R | 46 +++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/R/CST_EnsClustering.R b/R/CST_EnsClustering.R index 80821055..c6b70477 100644 --- a/R/CST_EnsClustering.R +++ b/R/CST_EnsClustering.R @@ -50,13 +50,17 @@ #' @param exp An object of the class 's2dv_cube', containing the variables to be analysed. #' Each data object in the list is expected to have an element named \code{$data} with at least two #' spatial dimensions named "lon" and "lat", and dimensions "dataset", "member", "ftime", "sdate". -#' @param time_moment Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). If 'perc' the keyword 'time_percentile' is also needed. +#' @param time_moment Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), +#' 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). +#' If 'perc' the keyword 'time_percentile' is also needed. #' @param time_percentile Set the percentile in time you want to analyse (used for `time_moment = "perc"). #' @param numclus Number of clusters (scenarios) to be calculated. +#' If set to NULL the number of ensemble members divided by 10 is used, with a minimum of 2 and a maximum of 8. #' @param lon_lim List with the two longitude margins in `c(-180,180)` format. #' @param lat_lim List with the two latitude margins. -#' @param variance_explained variance (percentage) to be explained by the set of eofs. If left at NULL `numpcs` EOFs are kept. -#' @param numpcs Number of eofs retained in the analysis (not used if `variance_explained` is used) +#' @param variance_explained variance (percentage) to be explained by the set of EOFs. +#' Defaults to 80. Not used if numpcs is specified. +#' @param numpcs Number of EOFs retained in the analysis (optional). #' @param cluster_dim Dimension along which to cluster. Typically "member" or "sdate". This can also be a list like c("member", "sdate"). #' @param verbose Logical for verbose output #' @return A list with elements \code{$cluster} (cluster assigned for each member), @@ -64,7 +68,7 @@ #' (representative member for each cluster), \code{$repr_field} (list of fields #' for each representative member), \code{composites} (list of mean fields for each cluster), #' \code{$lon} (selected longitudes of output fields), -#' \code{$lat} (selected longitudes of output fields). +#' \code{$lat} (selected longitudes of output fields). #' @export #' @examples #' \donttest{ @@ -85,9 +89,9 @@ #' res <- CST_EnsClustering(exp, numclus = 3, time_percentile = 90, time_moment = "perc", cluster_dim = "member") #' } -CST_EnsClustering <- function(exp, time_moment = "mean", numclus = 4, +CST_EnsClustering <- function(exp, time_moment = "mean", numclus = NULL, lon_lim = NULL, lat_lim = NULL, - variance_explained = NULL, numpcs = 4, + variance_explained = 80, numpcs = NULL, time_percentile = 90, cluster_dim = "member", verbose = F) { if (!inherits(exp, "s2dv_cube")) { @@ -119,13 +123,17 @@ CST_EnsClustering <- function(exp, time_moment = "mean", numclus = 4, #' The clustering is performed in a reduced EOF space. #' #' @param data A matrix of dimensions 'dataset member sdate ftime lat lon' containing the variables to be analysed. -#' @param time_moment Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). If 'perc' the keyword 'time_percentile' is also needed. +#' @param time_moment Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), +#' 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). +#' If 'perc' the keyword 'time_percentile' is also needed. #' @param time_percentile Set the percentile in time you want to analyse (used for `time_moment = "perc"). #' @param numclus Number of clusters (scenarios) to be calculated. +#' If set to NULL the number of ensemble members divided by 10 is used, with a minimum of 2 and a maximum of 8. #' @param lon_lim List with the two longitude margins in `c(-180,180)` format. #' @param lat_lim List with the two latitude margins. -#' @param variance_explained variance to be explained (in percentage) by the set of eofs. If left at NULL `numpcs` EOFs are kept. -#' @param numpcs Number of eofs retained in the analysis (not used if `variance_explained` is used) +#' @param variance_explained variance (percentage) to be explained by the set of EOFs. +#' Defaults to 80. Not used if numpcs is specified. +#' @param numpcs Number of EOFs retained in the analysis (optional). #' @param cluster_dim Dimension along which to cluster. Typically "member" or "sdate". This can also be a list like c("member", "sdate"). #' @param verbose Logical for verbose output #' @return A list with elements \code{$cluster} (cluster assigned for each member), @@ -133,7 +141,7 @@ CST_EnsClustering <- function(exp, time_moment = "mean", numclus = 4, #' (representative member for each cluster), \code{$repr_field} (list of fields #' for each representative member), \code{composites} (list of mean fields for each cluster), #' \code($lon) (selected longitudes of output fields), -#' \code($lat) (selected longitudes of output fields). +#' \code($lat) (selected longitudes of output fields). #' @export #' @examples #' \donttest{ @@ -141,9 +149,9 @@ CST_EnsClustering <- function(exp, time_moment = "mean", numclus = 4, #' res <- EnsClustering(exp$data, exp$lat, exp$lon, numclus = 3, cluster_dim = c("member", "dataset", "sdate")) #' } -EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = 4, - lon_lim = NULL, lat_lim = NULL, variance_explained = NULL, - numpcs = 4, time_percentile = 90, +EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = NULL, + lon_lim = NULL, lat_lim = NULL, variance_explained = 80, + numpcs = NULL, time_percentile = 90, cluster_dim = "member", verbose = T) { # Apply time_moment @@ -171,12 +179,12 @@ EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = 4, } # Atomic ensclus function -.ensclus <- function(var_ens, lat, lon, numclus = 4, lon_lim = NULL, - lat_lim = NULL, variance_explained = 80, numpcs = 4, +.ensclus <- function(var_ens, lat, lon, numclus = NULL, lon_lim = NULL, + lat_lim = NULL, variance_explained = 80, numpcs = NULL, verbose = T) { # Check if more than one dimension has been passed for clustering sampledims <- NULL - if(length(dim(var_ens)) > 3) { + if (length(dim(var_ens)) > 3) { sampledims <- head(dim(var_ens), -2) dim(var_ens) <- c(samples = prod(sampledims), tail(dim(var_ens), 2)) @@ -188,6 +196,10 @@ EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = 4, stop("Incorrect lon length") } n_ens <- dim(var_ens)[1] + if (is.null(numclus)) { + numclus <- min(max(floor(n_ens / 10), 2), 8) + } + .printv(paste("Number of clusters:", numclus), verbose) .printv("Calculating ensemble anomalies...", verbose) ens_mean <- apply(var_ens, c(2, 3), mean) @@ -347,7 +359,7 @@ EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = 4, # Reduce the phase space with EOFs: use SVD and do not standardize PCs .printv("Launching EOFs...", verbose) t0 <- proc.time() - if (!is.null(perc)) { + if (is.null(neof)) { reducedspace <- .eofs(lon, lat, field, neof = max_eofs, xlim = xlim, ylim = ylim, method = "SVD", do_regression = F, do_standardize = F, verbose = verbose) -- GitLab From 6a6d731196d2c34313caf16ac5d10d12f18cfc0e Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Thu, 21 Nov 2019 23:17:08 +0100 Subject: [PATCH 29/33] small typo fix --- R/CST_EnsClustering.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/CST_EnsClustering.R b/R/CST_EnsClustering.R index c6b70477..6b0ad911 100644 --- a/R/CST_EnsClustering.R +++ b/R/CST_EnsClustering.R @@ -140,8 +140,8 @@ CST_EnsClustering <- function(exp, time_moment = "mean", numclus = NULL, #' \code{$freq} (relative frequency of each cluster), \code{$closest_member} #' (representative member for each cluster), \code{$repr_field} (list of fields #' for each representative member), \code{composites} (list of mean fields for each cluster), -#' \code($lon) (selected longitudes of output fields), -#' \code($lat) (selected longitudes of output fields). +#' \code{$lon} (selected longitudes of output fields), +#' \code{$lat} (selected longitudes of output fields). #' @export #' @examples #' \donttest{ -- GitLab From a69defd85a587e17dbf602a95298932e5875692a Mon Sep 17 00:00:00 2001 From: nperez Date: Fri, 22 Nov 2019 11:59:18 +0100 Subject: [PATCH 30/33] documentation updated with devtools --- NAMESPACE | 2 + man/CST_EnsClustering.Rd | 118 +++++++++++++++++++++++++++++++++++++++ man/EnsClustering.Rd | 67 ++++++++++++++++++++++ 3 files changed, 187 insertions(+) create mode 100644 man/CST_EnsClustering.Rd create mode 100644 man/EnsClustering.Rd diff --git a/NAMESPACE b/NAMESPACE index 4ec2c8fc..22fc489a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,7 @@ export(CST_BEI_Weighting) export(CST_BiasCorrection) export(CST_Calibration) export(CST_CategoricalEnsCombination) +export(CST_EnsClustering) export(CST_Load) export(CST_MultiEOF) export(CST_MultiMetric) @@ -17,6 +18,7 @@ export(CST_RFSlope) export(CST_RFWeights) export(CST_RainFARM) export(CST_SaveExp) +export(EnsClustering) export(MultiEOF) export(PlotCombinedMap) export(PlotForecastPDF) diff --git a/man/CST_EnsClustering.Rd b/man/CST_EnsClustering.Rd new file mode 100644 index 00000000..9a6d7809 --- /dev/null +++ b/man/CST_EnsClustering.Rd @@ -0,0 +1,118 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/CST_EnsClustering.R +\name{CST_EnsClustering} +\alias{CST_EnsClustering} +\title{Ensemble clustering} +\usage{ +CST_EnsClustering(exp, time_moment = "mean", numclus = NULL, + lon_lim = NULL, lat_lim = NULL, variance_explained = 80, + numpcs = NULL, time_percentile = 90, cluster_dim = "member", + verbose = F) +} +\arguments{ +\item{exp}{An object of the class 's2dv_cube', containing the variables to be analysed. +Each data object in the list is expected to have an element named \code{$data} with at least two +spatial dimensions named "lon" and "lat", and dimensions "dataset", "member", "ftime", "sdate".} + +\item{time_moment}{Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), +'sd' (standard deviation along time) or 'perc' (a selected percentile on time). +If 'perc' the keyword 'time_percentile' is also needed.} + +\item{numclus}{Number of clusters (scenarios) to be calculated. +If set to NULL the number of ensemble members divided by 10 is used, with a minimum of 2 and a maximum of 8.} + +\item{lon_lim}{List with the two longitude margins in `c(-180,180)` format.} + +\item{lat_lim}{List with the two latitude margins.} + +\item{variance_explained}{variance (percentage) to be explained by the set of EOFs. +Defaults to 80. Not used if numpcs is specified.} + +\item{numpcs}{Number of EOFs retained in the analysis (optional).} + +\item{time_percentile}{Set the percentile in time you want to analyse (used for `time_moment = "perc").} + +\item{cluster_dim}{Dimension along which to cluster. Typically "member" or "sdate". This can also be a list like c("member", "sdate").} + +\item{verbose}{Logical for verbose output} +} +\value{ +A list with elements \code{$cluster} (cluster assigned for each member), + \code{$freq} (relative frequency of each cluster), \code{$closest_member} + (representative member for each cluster), \code{$repr_field} (list of fields + for each representative member), \code{composites} (list of mean fields for each cluster), + \code{$lon} (selected longitudes of output fields), + \code{$lat} (selected longitudes of output fields). +} +\description{ +This function performs a clustering on members/starting dates +and returns a number of scenarios, with representative members for each of them. +The clustering is performed in a reduced EOF space. + +Motivation: +Ensemble forecasts give a probabilistic insight of average weather conditions +on extended timescales, i.e. from sub-seasonal to seasonal and beyond. +With large ensembles, it is often an advantage to be able to group members +according to similar characteristics and to select the most representative member for each cluster. +This can be useful to characterize the most probable forecast scenarios in a multi-model +(or single model) ensemble prediction. This approach, applied at a regional level, +can also be used to identify the subset of ensemble members that best represent the +full range of possible solutions for downscaling applications. +The choice of the ensemble members is made flexible in order to meet the requirements +of specific (regional) climate information products, to be tailored for different regions and user needs. + +Description of the tool: +EnsClustering is a cluster analysis tool, based on the k-means algorithm, for ensemble predictions. +The aim is to group ensemble members according to similar characteristics and +to select the most representative member for each cluster. +The user chooses which feature of the data is used to group the ensemble members by clustering: +time mean, maximum, a certain percentile (e.g., 75% as in the examples below), +standard deviation and trend over the time period. For each ensemble member this value +is computed at each grid point, obtaining N lat-lon maps, where N is the number of ensemble members. +The anomaly is computed subtracting the ensemble mean of these maps to each of the single maps. +The anomaly is therefore computed with respect to the ensemble members (and not with respect to the time) +and the Empirical Orthogonal Function (EOF) analysis is applied to these anomaly maps. +Regarding the EOF analysis, the user can choose either how many Principal Components (PCs) +to retain or the percentage of explained variance to keep. After reducing dimensionality via +EOF analysis, k-means analysis is applied using the desired subset of PCs. + +The major final outputs are the classification in clusters, i.e. which member belongs +to which cluster (in k-means analysis the number k of clusters needs to be defined +prior to the analysis) and the most representative member for each cluster, +which is the closest member to the cluster centroid. +Other outputs refer to the statistics of clustering: in the PC space, the minimum and +the maximum distance between a member in a cluster and the cluster centroid +(i.e. the closest and the furthest member), the intra-cluster standard +deviation for each cluster (i.e. how much the cluster is compact). +} +\examples{ +\donttest{ +exp <- lonlat_data$exp +# Example 1: Cluster on all start dates, members and models +res <- CST_EnsClustering(exp, numclus = 3, cluster_dim = c("member", "dataset", "sdate")) +iclus = res$cluster[2, 1, 3] +print(paste("Cluster of 2. member, 1. dataset, 3. sdate:", iclus)) +print(paste("Frequency (numerosity) of cluster (", iclus, ") :", res$freq[iclus])) +PlotEquiMap(res$repr_field[iclus, , ], exp$lon, exp$lat, + filled.continents = F, + toptitle=paste("Representative field of cluster", iclus)) +# Example 2: Cluster on members retaining 4 EOFs during preliminary dimensional reduction +res <- CST_EnsClustering(exp, numclus = 3, numpcs = 4, cluster_dim = "member") +# Example 3: Cluster on members, retain 80\% of variance during preliminary dimensional reduction +res <- CST_EnsClustering(exp, numclus = 3, variance_explained = 80, cluster_dim = "member") +# Example 4: Compute percentile in time +res <- CST_EnsClustering(exp, numclus = 3, time_percentile = 90, time_moment = "perc", cluster_dim = "member") +} +} +\author{ +Federico Fabiano - ISAC-CNR, \email{f.fabiano@isac.cnr.it} + +Ignazio Giuntoli - ISAC-CNR, \email{i.giuntoli@isac.cnr.it} + +Danila Volpi - ISAC-CNR, \email{d.volpi@isac.cnr.it} + +Paolo Davini - ISAC-CNR, \email{p.davini@isac.cnr.it} + +Jost von Hardenberg - ISAC-CNR, \email{j.vonhardenberg@isac.cnr.it} +} + diff --git a/man/EnsClustering.Rd b/man/EnsClustering.Rd new file mode 100644 index 00000000..ce3805fb --- /dev/null +++ b/man/EnsClustering.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/CST_EnsClustering.R +\name{EnsClustering} +\alias{EnsClustering} +\title{Ensemble clustering} +\usage{ +EnsClustering(data, lat, lon, time_moment = "mean", numclus = NULL, + lon_lim = NULL, lat_lim = NULL, variance_explained = 80, + numpcs = NULL, time_percentile = 90, cluster_dim = "member", + verbose = T) +} +\arguments{ +\item{data}{A matrix of dimensions 'dataset member sdate ftime lat lon' containing the variables to be analysed.} + +\item{time_moment}{Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), +'sd' (standard deviation along time) or 'perc' (a selected percentile on time). +If 'perc' the keyword 'time_percentile' is also needed.} + +\item{numclus}{Number of clusters (scenarios) to be calculated. +If set to NULL the number of ensemble members divided by 10 is used, with a minimum of 2 and a maximum of 8.} + +\item{lon_lim}{List with the two longitude margins in `c(-180,180)` format.} + +\item{lat_lim}{List with the two latitude margins.} + +\item{variance_explained}{variance (percentage) to be explained by the set of EOFs. +Defaults to 80. Not used if numpcs is specified.} + +\item{numpcs}{Number of EOFs retained in the analysis (optional).} + +\item{time_percentile}{Set the percentile in time you want to analyse (used for `time_moment = "perc").} + +\item{cluster_dim}{Dimension along which to cluster. Typically "member" or "sdate". This can also be a list like c("member", "sdate").} + +\item{verbose}{Logical for verbose output} +} +\value{ +A list with elements \code{$cluster} (cluster assigned for each member), + \code{$freq} (relative frequency of each cluster), \code{$closest_member} + (representative member for each cluster), \code{$repr_field} (list of fields + for each representative member), \code{composites} (list of mean fields for each cluster), + \code{$lon} (selected longitudes of output fields), + \code{$lat} (selected longitudes of output fields). +} +\description{ +This function performs a clustering on members/starting dates +and returns a number of scenarios, with representative members for each of them. +The clustering is performed in a reduced EOF space. +} +\examples{ +\donttest{ +exp <- lonlat_data$exp +res <- EnsClustering(exp$data, exp$lat, exp$lon, numclus = 3, cluster_dim = c("member", "dataset", "sdate")) +} +} +\author{ +Federico Fabiano - ISAC-CNR, \email{f.fabiano@isac.cnr.it} + +Ignazio Giuntoli - ISAC-CNR, \email{i.giuntoli@isac.cnr.it} + +Danila Volpi - ISAC-CNR, \email{d.volpi@isac.cnr.it} + +Paolo Davini - ISAC-CNR, \email{p.davini@isac.cnr.it} + +Jost von Hardenberg - ISAC-CNR, \email{j.vonhardenberg@isac.cnr.it} +} + -- GitLab From 0d32553c266b857259e633ee87f586eb98a2d7a9 Mon Sep 17 00:00:00 2001 From: nperez Date: Fri, 22 Nov 2019 12:25:37 +0100 Subject: [PATCH 31/33] adding s2dv library to test examples --- R/CST_EnsClustering.R | 3 ++- man/CST_EnsClustering.Rd | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/R/CST_EnsClustering.R b/R/CST_EnsClustering.R index 6b0ad911..ff9a5d4c 100644 --- a/R/CST_EnsClustering.R +++ b/R/CST_EnsClustering.R @@ -78,9 +78,10 @@ #' iclus = res$cluster[2, 1, 3] #' print(paste("Cluster of 2. member, 1. dataset, 3. sdate:", iclus)) #' print(paste("Frequency (numerosity) of cluster (", iclus, ") :", res$freq[iclus])) +#' library(s2dverification) #' PlotEquiMap(res$repr_field[iclus, , ], exp$lon, exp$lat, #' filled.continents = F, -#' toptitle=paste("Representative field of cluster", iclus)) +#' toptitle = paste("Representative field of cluster", iclus)) #' # Example 2: Cluster on members retaining 4 EOFs during preliminary dimensional reduction #' res <- CST_EnsClustering(exp, numclus = 3, numpcs = 4, cluster_dim = "member") #' # Example 3: Cluster on members, retain 80% of variance during preliminary dimensional reduction diff --git a/man/CST_EnsClustering.Rd b/man/CST_EnsClustering.Rd index 9a6d7809..2784327e 100644 --- a/man/CST_EnsClustering.Rd +++ b/man/CST_EnsClustering.Rd @@ -93,9 +93,10 @@ res <- CST_EnsClustering(exp, numclus = 3, cluster_dim = c("member", "dataset", iclus = res$cluster[2, 1, 3] print(paste("Cluster of 2. member, 1. dataset, 3. sdate:", iclus)) print(paste("Frequency (numerosity) of cluster (", iclus, ") :", res$freq[iclus])) +library(s2dverification) PlotEquiMap(res$repr_field[iclus, , ], exp$lon, exp$lat, filled.continents = F, - toptitle=paste("Representative field of cluster", iclus)) + toptitle = paste("Representative field of cluster", iclus)) # Example 2: Cluster on members retaining 4 EOFs during preliminary dimensional reduction res <- CST_EnsClustering(exp, numclus = 3, numpcs = 4, cluster_dim = "member") # Example 3: Cluster on members, retain 80\% of variance during preliminary dimensional reduction -- GitLab From 2358a507e20b70b5c82d9ea029b52f6356ab2047 Mon Sep 17 00:00:00 2001 From: nperez Date: Fri, 22 Nov 2019 12:54:57 +0100 Subject: [PATCH 32/33] fixing length lines, examples and missing . in .standardized fun --- R/CST_EnsClustering.R | 98 +++++++++------------------------------- man/CST_EnsClustering.Rd | 21 +++++---- man/EnsClustering.Rd | 9 ++-- 3 files changed, 40 insertions(+), 88 deletions(-) diff --git a/R/CST_EnsClustering.R b/R/CST_EnsClustering.R index ff9a5d4c..f99dad27 100644 --- a/R/CST_EnsClustering.R +++ b/R/CST_EnsClustering.R @@ -69,27 +69,31 @@ #' for each representative member), \code{composites} (list of mean fields for each cluster), #' \code{$lon} (selected longitudes of output fields), #' \code{$lat} (selected longitudes of output fields). -#' @export #' @examples -#' \donttest{ #' exp <- lonlat_data$exp #' # Example 1: Cluster on all start dates, members and models -#' res <- CST_EnsClustering(exp, numclus = 3, cluster_dim = c("member", "dataset", "sdate")) +#' res <- CST_EnsClustering(exp, numclus = 3, +#' cluster_dim = c("member", "dataset", "sdate")) #' iclus = res$cluster[2, 1, 3] +#'\donttest{ #' print(paste("Cluster of 2. member, 1. dataset, 3. sdate:", iclus)) #' print(paste("Frequency (numerosity) of cluster (", iclus, ") :", res$freq[iclus])) #' library(s2dverification) #' PlotEquiMap(res$repr_field[iclus, , ], exp$lon, exp$lat, -#' filled.continents = F, +#' filled.continents = FALSE, #' toptitle = paste("Representative field of cluster", iclus)) -#' # Example 2: Cluster on members retaining 4 EOFs during preliminary dimensional reduction +#'} +#' # Example 2: Cluster on members retaining 4 EOFs during +#' # preliminary dimensional reduction #' res <- CST_EnsClustering(exp, numclus = 3, numpcs = 4, cluster_dim = "member") -#' # Example 3: Cluster on members, retain 80% of variance during preliminary dimensional reduction -#' res <- CST_EnsClustering(exp, numclus = 3, variance_explained = 80, cluster_dim = "member") +#' # Example 3: Cluster on members, retain 80% of variance during +#' # preliminary dimensional reduction +#' res <- CST_EnsClustering(exp, numclus = 3, variance_explained = 80, +#' cluster_dim = "member") #' # Example 4: Compute percentile in time -#' res <- CST_EnsClustering(exp, numclus = 3, time_percentile = 90, time_moment = "perc", cluster_dim = "member") -#' } - +#' res <- CST_EnsClustering(exp, numclus = 3, time_percentile = 90, +#' time_moment = "perc", cluster_dim = "member") +#'@export CST_EnsClustering <- function(exp, time_moment = "mean", numclus = NULL, lon_lim = NULL, lat_lim = NULL, variance_explained = 80, numpcs = NULL, @@ -124,6 +128,8 @@ CST_EnsClustering <- function(exp, time_moment = "mean", numclus = NULL, #' The clustering is performed in a reduced EOF space. #' #' @param data A matrix of dimensions 'dataset member sdate ftime lat lon' containing the variables to be analysed. +#' @param lat Vector of latitudes. +#' @param lon Vector of longitudes. #' @param time_moment Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), #' 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). #' If 'perc' the keyword 'time_percentile' is also needed. @@ -143,12 +149,12 @@ CST_EnsClustering <- function(exp, time_moment = "mean", numclus = NULL, #' for each representative member), \code{composites} (list of mean fields for each cluster), #' \code{$lon} (selected longitudes of output fields), #' \code{$lat} (selected longitudes of output fields). -#' @export +#' #' @examples -#' \donttest{ #' exp <- lonlat_data$exp -#' res <- EnsClustering(exp$data, exp$lat, exp$lon, numclus = 3, cluster_dim = c("member", "dataset", "sdate")) -#' } +#' res <- EnsClustering(exp$data, exp$lat, exp$lon, numclus = 3, +#' cluster_dim = c("member", "dataset", "sdate")) +#'@export EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = NULL, lon_lim = NULL, lat_lim = NULL, variance_explained = 80, @@ -323,7 +329,7 @@ EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = NULL, dim(box)[3]))[, , 1:neof] variance <- eig$values[1:neof] / sum(eig$values) if (do_standardize) { - coefficient <- apply(coef, c(2), standardize) + coefficient <- apply(coef, c(2), .standardize) } else { coefficient <- coef } @@ -406,65 +412,3 @@ EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = NULL, tot.withinss = regimes$tot.withinss) return(out) } - -# produce a 2d matrix of area weights -.area.weight <- function(ics, ipsilon, root = T) { - field <- array(NA, dim = c(length(ics), length(ipsilon))) - if (root == T) { - for (j in 1:length(ipsilon)) { - field[, j] <- sqrt(cos(pi / 180 * ipsilon[j])) - } - } - - if (root == F) { - for (j in 1:length(ipsilon)) { - field[, j] <- cos(pi / 180 * ipsilon[j]) - } - } - - return(field) -} - -# verbose-only printing function -.printv <- function(value, verbosity = TRUE) { - if (verbosity) { - print(value) - } -} - -# normalize a time series -.standardize <- function(timeseries) { - out <- (timeseries - mean(timeseries, na.rm = T)) / - sd(timeseries, na.rm = T) - return(out) -} - -.selbox <- function(lon, lat, xlim = NULL, ylim = NULL) { - if (!is.null(xlim)) { - # This transforms c(-20, -10) to c(340, 350) but c(-20, 10) is unchanged - # Bring them all to the same units in the 0:360 range - xlim1 <- xlim[1] %% 360 - xlim2 <- xlim[2] %% 360 - lonm <- lon %% 360 - if (lonm[1] > tail(lonm, 1)) { - lonm <- lon - } - if (xlim1 > xlim2) { - # If box crosses 0 - ilonsel <- (lonm >= xlim1) | (lonm <= xlim2) - } else { - ilonsel <- (lonm >= xlim1) & (lonm <= xlim2) - } - if (!any(ilonsel)) { - stop("No intersection between longitude bounds and data domain.") - } - } else { - ilonsel <- 1:length(lon) - } - if (!is.null(ylim)) { - ilatsel <- (lat >= ylim[1]) & (lat <= ylim[2]) - } else { - ilatsel <- 1:length(lat) - } - return(list(ilon = ilonsel, ilat = ilatsel)) -} diff --git a/man/CST_EnsClustering.Rd b/man/CST_EnsClustering.Rd index 2784327e..fe8641b0 100644 --- a/man/CST_EnsClustering.Rd +++ b/man/CST_EnsClustering.Rd @@ -86,24 +86,29 @@ the maximum distance between a member in a cluster and the cluster centroid deviation for each cluster (i.e. how much the cluster is compact). } \examples{ -\donttest{ exp <- lonlat_data$exp # Example 1: Cluster on all start dates, members and models -res <- CST_EnsClustering(exp, numclus = 3, cluster_dim = c("member", "dataset", "sdate")) +res <- CST_EnsClustering(exp, numclus = 3, + cluster_dim = c("member", "dataset", "sdate")) iclus = res$cluster[2, 1, 3] +\donttest{ print(paste("Cluster of 2. member, 1. dataset, 3. sdate:", iclus)) print(paste("Frequency (numerosity) of cluster (", iclus, ") :", res$freq[iclus])) library(s2dverification) PlotEquiMap(res$repr_field[iclus, , ], exp$lon, exp$lat, - filled.continents = F, + filled.continents = FALSE, toptitle = paste("Representative field of cluster", iclus)) -# Example 2: Cluster on members retaining 4 EOFs during preliminary dimensional reduction +} +# Example 2: Cluster on members retaining 4 EOFs during +# preliminary dimensional reduction res <- CST_EnsClustering(exp, numclus = 3, numpcs = 4, cluster_dim = "member") -# Example 3: Cluster on members, retain 80\% of variance during preliminary dimensional reduction -res <- CST_EnsClustering(exp, numclus = 3, variance_explained = 80, cluster_dim = "member") +# Example 3: Cluster on members, retain 80\% of variance during +# preliminary dimensional reduction +res <- CST_EnsClustering(exp, numclus = 3, variance_explained = 80, + cluster_dim = "member") # Example 4: Compute percentile in time -res <- CST_EnsClustering(exp, numclus = 3, time_percentile = 90, time_moment = "perc", cluster_dim = "member") -} +res <- CST_EnsClustering(exp, numclus = 3, time_percentile = 90, + time_moment = "perc", cluster_dim = "member") } \author{ Federico Fabiano - ISAC-CNR, \email{f.fabiano@isac.cnr.it} diff --git a/man/EnsClustering.Rd b/man/EnsClustering.Rd index ce3805fb..27aca453 100644 --- a/man/EnsClustering.Rd +++ b/man/EnsClustering.Rd @@ -12,6 +12,10 @@ EnsClustering(data, lat, lon, time_moment = "mean", numclus = NULL, \arguments{ \item{data}{A matrix of dimensions 'dataset member sdate ftime lat lon' containing the variables to be analysed.} +\item{lat}{Vector of latitudes.} + +\item{lon}{Vector of longitudes.} + \item{time_moment}{Decides the moment to be applied to the time dimension. Can be either 'mean' (time mean), 'sd' (standard deviation along time) or 'perc' (a selected percentile on time). If 'perc' the keyword 'time_percentile' is also needed.} @@ -48,10 +52,9 @@ and returns a number of scenarios, with representative members for each of them. The clustering is performed in a reduced EOF space. } \examples{ -\donttest{ exp <- lonlat_data$exp -res <- EnsClustering(exp$data, exp$lat, exp$lon, numclus = 3, cluster_dim = c("member", "dataset", "sdate")) -} +res <- EnsClustering(exp$data, exp$lat, exp$lon, numclus = 3, + cluster_dim = c("member", "dataset", "sdate")) } \author{ Federico Fabiano - ISAC-CNR, \email{f.fabiano@isac.cnr.it} -- GitLab From 4fa15bc674f4372f9a791d51f20615577c62b7d5 Mon Sep 17 00:00:00 2001 From: jhardenberg Date: Fri, 22 Nov 2019 18:47:30 +0100 Subject: [PATCH 33/33] fix lin.fit --- R/CST_EnsClustering.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/CST_EnsClustering.R b/R/CST_EnsClustering.R index f99dad27..83b38ceb 100644 --- a/R/CST_EnsClustering.R +++ b/R/CST_EnsClustering.R @@ -288,6 +288,12 @@ EnsClustering <- function(data, lat, lon, time_moment = "mean", numclus = NULL, # Take as input a 3D anomaly field. # Requires "personal" functions area.weight, standardize + if (exists(".lm.fit")) { + lin.fit <- .lm.fit + } else { + lin.fit <- lm.fit + } + # area weighting, based on the root of cosine .printv("Area Weighting...", verbose) ww <- .area.weight(lon, lat, root = T) -- GitLab