# WeatherRegimes: computation of weather regimes through cluster analysis.
#' @rdname CST_WeatherRegimes
#' @title Function for Calculating the Cluster analysis
#'
#' @author Verónica Torralba - BSC, \email{veronica.torralba@bsc.es}
#'
#' @description This function computes the weather regimes from a cluster analysis.
#'It is applied on the array \code{data} in a 's2dv_cube' object. The dimensionality of this object can also be reduced
#'by using PCs obtained from the application of the EOFs analysis to filter the dataset.
#'The cluster analysis can be performed with the traditional k-means or those methods
#'included in the hclust (stats package).
#'
#'@references Cortesi, N., V., Torralba, N., González-Reviriego, A., Soret, and F.J., Doblas-Reyes (2019).
#' Characterization of European wind speed variability using weather regimes. Climate Dynamics,53,
#' 4961–4976, doi:10.1007/s00382-019-04839-5.
#'@references Torralba, V. (2019) Seasonal climate prediction for the wind energy sector: methods and tools
#' for the development of a climate service. Thesis. Available online: \url{https://eprints.ucm.es/56841/}
#'
#'@param data a 's2dv_cube' object
#'@param EOFS Whether to compute the EOFs (default = 'TRUE') or not (FALSE) over data.
#'@param neofs number of modes to be kept (default = 30).
#'@param varThreshold Value with the percentage of variance to be explained by the PCs.
#' Only sufficient PCs to explain this much variance will be used in the clustering.
#'@param lon Vector of longitudes.
#'@param lat Vector of latitudes.
#'@param ncenters Number of clusters to be calculated with the clustering function.
#'@param method Different options to estimate the clusters. The most traditional approach is the k-means analysis (default='kmeans')
#'but the function also support the different methods included in the hclust . These methods are:
#'"ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).
#' For more details about these methods see the hclust function documentation included in the stats package.
#'@param nstart Parameter for the cluster analysis determining how many random sets to choose (Only if method='kmeans' is selected).
#'@param iter.max Parameter to select the maximum number of iterations allowed (Only if method='kmeans' is selected).
#'@param ncores The number of multicore threads to use for parallel computation.
#'@return A list with elements \code{$composite} (3-d array (lon, lat, k) containing the composites k=1,..,K for case (*1)
#' or only k=1 for any specific cluster, i.e., case (*2)),
#' \code{pvalue} (3-d array (lon, lat, k) containing the pvalue of the composites obtained through a t-test that accounts for the serial
#' dependence of the data with the same structure as Composite.),
#' \code{cluster} (A time series of integers (from 1:k) indicating the cluster to which each point is allocated.),
#' \code{persistence} (The value of the regime whose length is given in cluster_lengths (only if method='kmeans' has been selected)),
#' \code{frequency} (Percentage of days in a month/season belonging to each cluster (only if method='kmeans' has been selected).),
#'@import s2dverification
#'@import multiApply
#'@examples
#'res1 <- CST_WeatherRegimes(data = lonlat_data$obs, EOFS = FALSE, ncenters = 4)
#'res2 <- CST_WeatherRegimes(data = lonlat_data$obs, EOFS = TRUE, ncenters = 3)
#'@export
CST_WeatherRegimes <- function(data, ncenters = NULL,
                               EOFS = TRUE, neofs = 30,
                               varThreshold = NULL,
                               method = "kmeans",
                               iter.max = 100, nstart = 30,
                               ncores = NULL) {
  # Only 's2dv_cube' inputs are accepted: the regimes are computed on
  # data$data and the results are written back into the same object.
  if (!inherits(data, 's2dv_cube')) {
    stop("Parameter 'data' must be of the class 's2dv_cube', ",
         "as output by CSTools::CST_Load.")
  }
  # 'lon' may be absent from the cube; WeatherRegime accepts lon = NULL
  # (it is only required when EOFS = TRUE).
  if ('lon' %in% names(data)) {
    lon <- data$lon
  } else {
    lon <- NULL
  }
  # EOFS and neofs are now explicit parameters (previously referenced here
  # without being in the signature, which raised "object 'EOFS' not found").
  result <- WeatherRegime(data$data, ncenters = ncenters,
                          EOFS = EOFS, neofs = neofs,
                          varThreshold = varThreshold, lon = lon,
                          lat = data$lat, method = method,
                          iter.max = iter.max, nstart = nstart,
                          ncores = ncores)
  # The composites replace the cube's data; the remaining outputs
  # (pvalue, cluster, and optionally frequency/persistence) are kept
  # under $statistics.
  data$data <- result$composite
  data$statistics <- result[-1]
  return(data)
}
#' @rdname WeatherRegimes
#' @title Function for Calculating the Cluster analysis
#'
#' @author Verónica Torralba - BSC, \email{veronica.torralba@bsc.es}
#'
#' @description This function computes the weather regimes from a cluster analysis.
#'It can be applied over the dataset with dimensions
#'c(year/month, month/day, lon, lat), or by using PCs obtained from the application of the
#'EOFs analysis to filter the dataset.
#'The cluster analysis can be performed with the traditional k-means or those methods
#'included in the hclust (stats package).
#'
#'@references Cortesi, N., V., Torralba, N., González-Reviriego, A., Soret, and F.J., Doblas-Reyes (2019).
#' Characterization of European wind speed variability using weather regimes. Climate Dynamics,53,
#' 4961–4976, doi:10.1007/s00382-019-04839-5.
#'@references Torralba, V. (2019) Seasonal climate prediction for the wind energy sector: methods and tools
#' for the development of a climate service. Thesis. Available online: \url{https://eprints.ucm.es/56841/}
#'
#'@param data an array containing anomalies with named dimensions with at least start date 'sdate', forecast time 'ftime', latitude 'lat' and longitude 'lon'.
#'@param EOFS Whether to compute the EOFs (default = 'TRUE') or not (FALSE) over data.
#'@param neofs number of modes to be kept only if EOFs = TRUE has been selected. (default = 30).
#'@param varThreshold Value with the percentage of variance to be explained by the PCs.
#' Only sufficient PCs to explain this much variance will be used in the clustering.
#'@param lon Vector of longitudes.
#'@param lat Vector of latitudes.
#'@param ncenters Number of clusters to be calculated with the clustering function.
#'@param method Different options to estimate the clusters. The most traditional approach is the k-means analysis (default='kmeans')
#'but the function also support the different methods included in the hclust . These methods are:
#'"ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).
#' For more details about these methods see the hclust function documentation included in the stats package.
#'@param nstart Parameter for the cluster analysis determining how many random sets to choose (Only if method='kmeans' is selected).
#'@param iter.max Parameter to select the maximum number of iterations allowed (Only if method='kmeans' is selected).
#'@param ncores The number of multicore threads to use for parallel computation.
#'@return A list with elements \code{$composite} (3-d array (lon, lat, k) containing the composites k=1,..,K for case (*1)
#' or only k=1 for any specific cluster, i.e., case (*2)),
#' \code{pvalue} (3-d array (lon, lat, k) containing the pvalue of the composites obtained through a t-test that accounts for the serial
#' dependence of the data with the same structure as Composite.),
#' \code{cluster} (A time series of integers (from 1:k) indicating the cluster to which each point is allocated.),
#' \code{persistence} (The value of the regime whose length is given in cluster_lengths (only if method='kmeans' has been selected)),
#' \code{frequency} (Percentage of days in a month/season belonging to each cluster (only if method='kmeans' has been selected).),
#'@import s2dverification
#'@import multiApply
#'@examples
#'res <- WeatherRegime(data=lonlat_data$obs$data, lat= lonlat_data$obs$lat, EOFS = FALSE, ncenters = 4)
#'@export
WeatherRegime <- function(data, ncenters = NULL,
                          EOFS = TRUE, neofs = 30,
                          varThreshold = NULL, lon = NULL,
                          lat = NULL, method = "kmeans",
                          iter.max = 100, nstart = 30,
                          ncores = NULL) {
  # The temporal dimensions are located by name, so named dimensions
  # are mandatory.
  if (is.null(names(dim(data)))) {
    stop("Parameter 'data' must be an array with named dimensions.")
  }
  if (is.null(lat)) {
    stop("Parameter 'lat' must be specified.")
  }
  dimData <- names(dim(data))
  if ('sdate' %in% dimData && 'ftime' %in% dimData) {
    # Collapse the two temporal dimensions ('ftime' x 'sdate') into the
    # single 'time' dimension that the cluster analysis works on; keep the
    # original sizes to recover per-start-date statistics afterwards.
    nsdates <- dim(data)['sdate']
    nftimes <- dim(data)['ftime']
    data <- MergeDims(data,
                      merge_dims = c('ftime', 'sdate'),
                      rename_dim = 'time')
  } else if ('sdate' %in% dimData | 'ftime' %in% dimData) {
    # A single temporal dimension is simply renamed to 'time'.
    names(dim(data))[which(dimData == 'sdate' | dimData == 'ftime')] <- 'time'
  } else {
    if (!('time' %in% dimData)) {
      stop("Parameter 'data' must have temporal dimensions.")
    }
  }
  # Apply the core clustering over each ('time','lat','lon') slice.
  # NOTE(review): the corrupted original omitted 'fun' entirely and never
  # forwarded iter.max/nstart although they are documented parameters.
  output <- Apply(data = list(data),
                  target_dims = c('time', 'lat', 'lon'),
                  fun = .WeatherRegime,
                  EOFS = EOFS, neofs = neofs,
                  varThreshold = varThreshold,
                  lon = lon, lat = lat,
                  ncenters = ncenters,
                  method = method,
                  iter.max = iter.max, nstart = nstart,
                  ncores = ncores)
  if (method == 'kmeans' && 'sdate' %in% dimData && 'ftime' %in% dimData) {
    # The frequency and the persistence are computed as they are useful
    # parameters in the cluster analysis
    extra_output <- Apply(data = output$cluster,
                          target_dims = 'time',
                          fun = .freqPer,
                          nsdates = nsdates,
                          nftimes = nftimes,
                          ncenters = ncenters)
    output <- list(composite = output$composite,
                   pvalue = output$pvalue,
                   cluster = output$cluster,
                   frequency = extra_output$frequency,
                   persistence = extra_output$persistence)
  }
  return(output)
}
# Core (internal) weather-regime computation over one ('time','lat','lon')
# field: optionally reduce dimensionality with EOFs, then cluster with
# k-means or hclust, and build the cluster composites.
.WeatherRegime <- function(data, ncenters = NULL, EOFS = TRUE, neofs = 30,
                           varThreshold = NULL, lon = NULL,
                           lat = NULL, method = "kmeans",
                           iter.max = 100, nstart = 30) {
  if (is.null(names(dim(data)))) {
    stop("Parameter 'data' must be an array with 'time', 'lat' and 'lon' dimensions.")
  }
  if (!is.null(lat) && dim(data)['lat'] != length(lat)) {
    stop("The length of the parameter 'lat' does not match with the ",
         "['lat'] dimension of the parameter 'data'.")
  }
  if (is.null(ncenters)) {
    stop("Parameter 'ncenters' must be specified.")
  }
  # Longitudes are only needed by the EOF decomposition; latitudes are
  # always needed (area weighting or EOF).
  if (EOFS == TRUE && is.null(lon)) {
    stop("Parameter 'lon' must be specified.")
  }
  if (is.null(lat)) {
    stop("Parameter 'lat' must be specified.")
  }
  nlon <- dim(data)['lon']
  nlat <- dim(data)['lat']
  if (any(is.na(data))) {
    # MergeDims with na.rm = TRUE drops only spatial positions that are
    # entirely NA; if the merged spatial dimension keeps its full size while
    # NAs are present, the NAs must be intermittent along 'time', which the
    # clustering cannot handle.
    nas_test <- MergeDims(data, merge_dims = c('lat', 'lon'),
                          rename_dim = 'space', na.rm = TRUE)
    if (dim(nas_test)['space'] == c(nlat * nlon)) {
      stop("Parameter 'data' contains NAs in the 'time' dimensions.")
    }
  }
  if (EOFS == TRUE) {
    if (is.null(varThreshold)) {
      # Keep the first 'neofs' principal components as clustering input.
      dataPC <- EOF(data,
                    lat = as.vector(lat),
                    lon = as.vector(lon),
                    neofs = neofs)
      cluster_input <- dataPC$PC
    } else {
      # Keep only as many PCs as needed to explain 'varThreshold' % of
      # the variance.
      dataPC <- EOF(data,
                    lat = as.vector(lat),
                    lon = as.vector(lon),
                    neofs = neofs)
      minPC <-
        head(as.numeric(which(cumsum(dataPC$var) > varThreshold)), 1)
      cluster_input <- dataPC$PC[, 1:minPC]
    }
  } else {
    # No EOF filtering: latitude-weight the field (cos(lat)) and flatten
    # the spatial dimensions so each time step is one clustering sample.
    dataW <- aperm(Apply(data, target_dims = 'lat',
                         function(x, la) {
                           x * cos(la * pi / 180)
                         },
                         la = lat)[[1]], c(2, 1, 3))
    cluster_input <- MergeDims(dataW, merge_dims = c('lat', 'lon'),
                               rename_dim = 'space')
  }
  if (method == "kmeans") {
    clust <- kmeans(
      cluster_input,
      centers = ncenters,
      iter.max = iter.max,
      nstart = nstart,
      trace = FALSE)
    # Composite expects the data as ('lat','lon','time'); permute before
    # compositing with the k-means cluster assignments.
    result <- Composite(aperm(data, c(2, 3, 1)), clust$cluster)
  } else {
    # Hierarchical clustering on the pairwise distances, cut at 'ncenters'.
    result <- hclust(dist(cluster_input), method = method)
    clusterCut <- cutree(result, ncenters)
    result <- Composite(aperm(data, c(2, 3, 1)), clusterCut)
  }
  # Name the dimensions of both Composite outputs (composite and pvalue).
  result <- lapply(seq_along(result), function(n) {
    names(dim(result[[n]])) <- c("lat", "lon", "cluster")
    return(result[[n]])
  })
  names(result) <- c('composite', 'pvalue')
  if (method == "kmeans") {
    clust <- as.array(clust$cluster)
    names(dim(clust)) <- 'time'
    return(list(
      composite = result$composite,
      pvalue = result$pvalue,
      cluster = clust))
  } else {
    clust <- as.array(clusterCut)
    names(dim(clust)) <- 'time'
    return(list(
      composite = result$composite,
      pvalue = result$pvalue,
      cluster = clust))
  }
}
# Internal helper: per-start-date frequency and mean persistence of each
# cluster along the forecast-time dimension.
#
# clust:    integer cluster labels (values in 1:ncenters), laid out as
#           consecutive runs of 'nftimes' forecast times per start date.
# nsdates:  number of start dates (rows of the output matrices).
# nftimes:  number of forecast times per start date.
# ncenters: number of clusters (columns of the output matrices).
#
# Returns a list of two (nsdates x ncenters) matrices:
#   frequency   - percentage of the nftimes steps assigned to each cluster.
#   persistence - mean length of the consecutive runs of each cluster
#                 (NaN when a cluster never occurs in a start date, since
#                 mean() of an empty vector is NaN).
.freqPer<- function (clust, nsdates, nftimes, ncenters){
frequency <- persistence <- matrix(NA, nsdates, ncenters)
x <- as.vector(clust)
for (i in 1:nsdates) {
# Run-length encoding of the i-th start date's label sequence; the index
# range ((i*nftimes)+1-nftimes):(i*nftimes) is the i-th block of nftimes.
occurences <-rle(x[((i * nftimes) + 1 - nftimes):(i * nftimes)])
for (j in 1:ncenters) {
frequency[i, j] <-(sum(occurences$lengths[occurences$values == j]) / nftimes) * 100
persistence[i, j] <- mean(occurences$lengths[occurences$values == j])
}
}
return(list(frequency = frequency,
persistence = persistence))