Newer
Older
#'@rdname CST_WeatherRegimes
#'@title Function for Calculating the Cluster analysis
#'@author Verónica Torralba - BSC, \email{veronica.torralba@bsc.es}
#'@description This function computes the weather regimes from a cluster
#'analysis. It is applied on the array \code{data} in a 's2dv_cube' object. The
#'dimensionality of this object can be also reduced by using PCs obtained from
#'the application of the #'EOFs analysis to filter the dataset. The cluster
#'analysis can be performed with the traditional k-means or those methods
#'included in the hclust (stats package).
#'
#'@references Cortesi, N., V., Torralba, N., González-Reviriego, A., Soret, and
#'F.J., Doblas-Reyes (2019). Characterization of European wind speed variability
#'using weather regimes. Climate Dynamics,53, 4961–4976, doi:10.1007/s00382-019-04839-5.
#'@references Torralba, V. (2019) Seasonal climate prediction for the wind
#'energy sector: methods and tools for the development of a climate service.
#'Thesis. Available online: \url{https://eprints.ucm.es/56841/}.
#'@param data An 's2dv_cube' object.
#'@param ncenters Number of clusters to be calculated with the clustering
#' function.
#'@param EOFs Whether to compute the EOFs (default = 'TRUE') or not (FALSE) to
#' filter the data.
#'@param neofs Number of modes to be kept (default = 30).
#'@param varThreshold Value with the percentage of variance to be explained by
#' the PCs. Only sufficient PCs to explain this much variance will be used in
#' the clustering.
#'@param method Different options to estimate the clusters. The most traditional
#' approach is the k-means analysis (default=’kmeans’) but the function also
#' support the different methods included in the hclust . These methods are:
#' "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty"
#' (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). For more details
#' about these methods see the hclust function documentation included in the
#' stats package.
#'@param iter.max Parameter to select the maximum number of iterations allowed
#' (Only if method='kmeans' is selected).
#'@param nstart Parameter for the cluster analysis determining how many random
#' sets to choose (Only if method='kmeans' is selected).
#'@param ncores The number of multicore threads to use for parallel computation.
#'@return A list with two elements \code{$data} (a 's2dv_cube' object containing
#'the composites cluster = 1,..,K for case (*1) or only k = 1 for any specific
#'cluster, i.e., case (*2)) and \code{$statistics} that includes \code{$pvalue}
#'(array with the same structure as \code{$data} containing the pvalue of the
#'composites obtained through a t-test that accounts for the serial dependence.),
#'\code{cluster} (A matrix or vector with integers (from 1:k) indicating the
#'cluster to which each time step is allocated.), \code{persistence} (Percentage
#'of days in a month/season before a cluster is replaced for a new one (only if
#'method=’kmeans’ has been selected.)), \code{frequency} (Percentage of days in
#'a month/season belonging to each cluster (only if method=’kmeans’ has been
#'selected).),
#'@examples
#'data <- array(abs(rnorm(1280, 283.7, 6)), dim = c(dataset = 2, member = 2,
#' sdate = 3, ftime = 3,
#' lat = 4, lon = 4))
#'coords <- list(lon = seq(0, 3), lat = seq(47, 44))
#'obs <- list(data = data, coords = coords)
#'class(obs) <- 's2dv_cube'
#'
#'res1 <- CST_WeatherRegimes(data = obs, EOFs = FALSE, ncenters = 4)
#'res2 <- CST_WeatherRegimes(data = obs, EOFs = TRUE, ncenters = 3)
#'
#'@import multiApply
#'@export
CST_WeatherRegimes <- function(data, ncenters = NULL,
EOFs = TRUE, neofs = 30,
varThreshold = NULL,
method = "kmeans",
iter.max = 100, nstart = 30,
ncores = NULL) {
# Check 's2dv_cube'
if (!inherits(data, 's2dv_cube')) {
stop("Parameter 'data' must be of the class 's2dv_cube', ",
"as output by CSTools::CST_Load.")
}
# Check 'exp' object structure
if (!all(c('data', 'coords') %in% names(data))) {
stop("Parameter 'data' must have 'data' and 'coords' elements ",
"within the 's2dv_cube' structure.")
}
if (!any(names(data$coords) %in% .KnownLonNames()) |
!any(names(data$coords) %in% .KnownLatNames())) {
stop("Spatial coordinate names do not match any of the names accepted ",
"the package.")
} else {
lon_name <- names(data$coords)[[which(names(data$coords) %in% .KnownLonNames())]]
lat_name <- names(data$coords)[[which(names(data$coords) %in% .KnownLatNames())]]
lon <- as.vector(data$coords[[lon_name]])
lat <- as.vector(data$coords[[lat_name]])
result <- WeatherRegime(data$data, ncenters = ncenters,
lat = lat, method = method,
iter.max = iter.max, nstart = nstart,
ncores = ncores)
data$data <- result$composite
data$statistics <- result[-1]
#'@rdname WeatherRegimes
#'@title Function for Calculating the Cluster analysis
#'@author Verónica Torralba - BSC, \email{veronica.torralba@bsc.es}
#'@description This function computes the weather regimes from a cluster analysis.
#'It can be applied over the dataset with dimensions c(year/month, month/day,
#'lon, lat), or by using PCs obtained from the application of the EOFs analysis
#'to filter the dataset. The cluster analysis can be performed with the
#'traditional k-means or those methods included in the hclust (stats package).
#'@references Cortesi, N., V., Torralba, N., González-Reviriego, A., Soret, and
#'F.J., Doblas-Reyes (2019). Characterization of European wind speed variability
#'using weather regimes. Climate Dynamics,53, 4961–4976, doi:10.1007/s00382-019-04839-5.
#'@references Torralba, V. (2019) Seasonal climate prediction for the wind
#'energy sector: methods and tools for the development of a climate service.
#'Thesis. Available online: \url{https://eprints.ucm.es/56841/}
#'@param data An array containing anomalies with named dimensions with at least
#' start date 'sdate', forecast time 'ftime', latitude 'lat' and longitude
#' 'lon'.
#'@param ncenters Number of clusters to be calculated with the clustering
#' function.
#'@param EOFs Whether to compute the EOFs (default = 'TRUE') or not (FALSE) to
#' filter the data.
#'@param neofs Number of modes to be kept only if EOFs = TRUE has been selected.
#' (default = 30).
#'@param varThreshold Value with the percentage of variance to be explained by
#' the PCs. Only sufficient PCs to explain this much variance will be used in
#' the clustering.
#'@param lon Vector of longitudes.
#'@param lat Vector of latitudes.
#'@param method Different options to estimate the clusters. The most traditional
#' approach is the k-means analysis (default=’kmeans’) but the function also
#' support the different methods included in the hclust . These methods are:
#' "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty"
#' (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). For more details
#' about these methods see the hclust function documentation included in the
#' stats package.
#'@param iter.max Parameter to select the maximum number of iterations allowed
#' (Only if method = 'kmeans' is selected).
#'@param nstart Parameter for the cluster analysis determining how many random
#' sets to choose (Only if method='kmeans' is selected).
#'@param ncores The number of multicore threads to use for parallel computation.
#'@return A list with elements \code{$composite} (array with at least 3-d ('lat',
#''lon', 'cluster') containing the composites k = 1,..,K for case (*1) or only k = 1
#'for any specific cluster, i.e., case (*2)), \code{pvalue} (array with at least
#'3-d ('lat','lon','cluster') with the pvalue of the composites obtained through
#'a t-test that accounts for the serial dependence of the data with the same
#'structure as Composite.), \code{cluster} (A matrix or vector with integers
#'(from 1:k) indicating the cluster to which each time step is allocated.),
#'\code{persistence} (Percentage of days in a month/season before a cluster is
#'replaced for a new one (only if method=’kmeans’ has been selected.)),
#'\code{frequency} (Percentage of days in a month/season belonging to each
#'cluster (only if method=’kmeans’ has been selected).),
#'@examples
#'data <- array(abs(rnorm(1280, 283.7, 6)), dim = c(dataset = 2, member = 2,
#' sdate = 3, ftime = 3,
#' lat = 4, lon = 4))
#'lat <- seq(47, 44)
#'res <- WeatherRegime(data = data, lat = lat,
#' EOFs = FALSE, ncenters = 4)
#'@import multiApply
#'@export
WeatherRegime <- function(data, ncenters = NULL,
EOFs = TRUE, neofs = 30,
varThreshold = NULL, lon = NULL,
lat = NULL, method = "kmeans",
iter.max=100, nstart = 30,
ncores = NULL) {
## Check inputs
# data
if (is.null(names(dim(data)))) {
stop("Parameter 'data' must be an array with named dimensions.")
if (EOFs == TRUE && is.null(lon)) {
stop("Parameter 'lon' must be specified.")
}
stop("Parameter 'lat' must be specified.")
dimData <- names(dim(data))
# temporal dimensions
if ('sdate' %in% dimData && 'ftime' %in% dimData) {
nsdates <- dim(data)['sdate']
nftimes <- dim(data)['ftime']
merge_dims = c('ftime', 'sdate'),
} else if ('sdate' %in% dimData | 'ftime' %in% dimData) {
names(dim(data))[which(dimData == 'sdate' | dimData == 'ftime') ] = 'time'
} else {
if (!('time' %in% dimData)) {
stop("Parameter 'data' must have temporal dimensions.")
# spatial dimensions
if (!any(names(dim(data)) %in% .KnownLonNames()) |
!any(names(dim(data)) %in% .KnownLatNames())) {
stop("Spatial coordinate names do not match any of the names accepted ",
"by the package.")
}
lon_name <- names(dim(data))[[which(names(dim(data)) %in% .KnownLonNames())]]
lat_name <- names(dim(data))[[which(names(dim(data)) %in% .KnownLatNames())]]
if (!is.null(lat) && dim(data)[lat_name] != length(lat)) {
stop("The length of the paramter 'lat' does not match with the ['lat'] dimension of
the parameter 'data'.")
}
# ncenters
if (is.null(ncenters)) {
stop("Parameter 'ncenters' must be specified.")
}
output <- Apply(data = list(data),
target_dims = c('time', lat_name, lon_name),
varThreshold = varThreshold,
lon = lon, lat = lat,
ncenters = ncenters,
method = method,
ncores = ncores,
lon_name = lon_name, lat_name = lat_name)
if (method == 'kmeans' && 'sdate' %in% dimData && 'ftime' %in% dimData) {
# The frequency and the persistency are computed as they are useful
# parameters in the cluster analysis
extra_output <- Apply(data = output$cluster,
target_dims = 'time',
fun = .freqPer,
nsdates = nsdates,
nftimes = nftimes ,
ncenters = ncenters)
output$cluster <- t(array(output$cluster, dim = c(nftimes, nsdates)))
names(dim(output$cluster)) <- c('sdate', 'ftime')
output <- list(composite = output$composite,
pvalue = output$pvalue,
cluster = output$cluster,
frequency = extra_output$frequency,
persistence = extra_output$persistence)
}
return(output)
}
.WeatherRegime <- function(data, ncenters = NULL, EOFs = TRUE, neofs = 30,
varThreshold = NULL, lon = NULL,
lat = NULL, method = "kmeans",
iter.max = 100, nstart = 30, lon_name = 'lon',
lat_name = 'lat') {
nlon <- dim(data)[lat_name]
nlat <- dim(data)[lon_name]
nas_test <- MergeDims(data, merge_dims = c(lat_name,lon_name),
rename_dim = 'space', na.rm = TRUE)
if (dim(nas_test)['space']== c(nlat*nlon)){
stop("Parameter 'data' contains NAs in the 'time' dimensions.")
}
}
if (is.null(varThreshold)) {
dataPC <- EOF(data,
lat = as.vector(lat),
lon = as.vector(lon),
cluster_input <- dataPC$PC
} else {
dataPC <- EOF(data,
lat = as.vector(lat),
lon = as.vector(lon),
minPC <-
head(as.numeric(which(cumsum(dataPC$var) > varThreshold)), 1)
cluster_input <- dataPC$PC[, 1:minPC]
}
} else {
dataW <- aperm(Apply(data, target_dims = lat_name,
function (x, la) {
x * cos(la * pi / 180)},
la = lat)[[1]], c(2, 1, 3))
cluster_input <- MergeDims(dataW, merge_dims = c(lat_name, lon_name),
}
if (method == "kmeans") {
clust <- kmeans(
cluster_input,
centers = ncenters,
iter.max = iter.max,
nstart = nstart,
trace = FALSE)
result <- array(0, c(ncenters, nlat, nlon))
# the order of the data dimensions is changed ('lat','lon','time')
result <- Composite(aperm(data,c(2, 3, 1)), clust$cluster)
} else {
result <- hclust(dist(cluster_input), method = method)
clusterCut <- cutree(result, ncenters)
result <- Composite(aperm(data, c(2, 3, 1)), clusterCut)
}
result <- lapply(1:length(result),
names(dim(result[[n]])) <- c(lat_name, lon_name, "cluster")
names(result) <- c('composite','pvalue')
if (method == "kmeans") {
clust <- as.array(clust$cluster)
names(dim(clust)) <- 'time'
composite = result$composite,
pvalue = result$pvalue,
cluster = clust))
} else {
clust <- as.array(clusterCut)
names(dim(clust)) <- 'time'
return(list(
composite = result$composite,
pvalue = result$pvalue,
cluster = clust))
}
}
.freqPer<- function (clust, nsdates, nftimes, ncenters){
frequency <- persistence <- matrix(NA, nsdates, ncenters)
x <- as.vector(clust)
for (i in 1:nsdates) {
occurences <-rle(x[((i * nftimes) + 1 - nftimes):(i * nftimes)])
for (j in 1:ncenters) {
frequency[i, j] <-(sum(occurences$lengths[occurences$values == j]) / nftimes) * 100
persistence[i, j] <- mean(occurences$lengths[occurences$values == j])
}
}
return(list(frequency = frequency,
persistence = persistence))