Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#' @rdname CST_WeatherRegimes
#' @title Function for Calculating the Cluster analysis
#'
#' @author Verónica Torralba - BSC, \email{veronica.torralba@bsc.es}
#'
#' @description This function computes the weather regimes from a cluster analysis.
#'It is applied on the array \code{data} in a 's2dv_cube' object. The dimensionality of this object can be also reduced
#'by using PCs obtained from the application of the #'EOFs analysis to filter the dataset.
#'The cluster analysis can be performed with the traditional k-means or those methods
#'included in the hclust (stats package).
#'
#'@references Cortesi, N., V., Torralba, N., González-Reviriego, A., Soret, and F.J., Doblas-Reyes (2019).
#' Characterization of European wind speed variability using weather regimes. Climate Dynamics,53,
#' 4961–4976, doi:10.1007/s00382-019-04839-5.
#'@references Torralba, V. (2019) Seasonal climate prediction for the wind energy sector: methods and tools
#' for the development of a climate service. Thesis. Available online: \url{https://eprints.ucm.es/56841/}
#'
#'@param data a 's2dv_cube' object
#'@param EOFs Whether to compute the EOFs (default = 'TRUE') or not (FALSE) over data.
#'@param neofs number of modes to be kept (default = 30).
#'@param varThreshold Value with the percentage of variance to be explained by the PCs.
#' Only sufficient PCs to explain this much variance will be used in the clustering.
#'@param lon Vector of longitudes.
#'@param lat Vector of latitudes.
#'@param ncenters Number of clusters to be calculated with the clustering function.
#'@param method Different options to estimate the clusters. The most traditional approach is the k-means analysis (default=’kmeans’)
#'but the function also support the different methods included in the hclust . These methods are:
#'"ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).
#' For more details about these methods see the hclust function documentation included in the stats package.
#'@param nstarts Parameter for the cluster analysis determining how many random sets to choose (Only if method='kmeans' is selected).
#'@param iter.max Parameter to select the maximum number of iterations allowed (Only if method='kmeans' is selected).
#'@param ncores The number of multicore threads to use for parallel computation}.
#'@return A list with elements \code{$composite} (3-d array (lon, lat, k) containing the composites k=1,..,K for case (*1)
# or only k=1 for any specific cluster, i.e., case (*2)),
#' \code{pvalue} (3-d array (lon, lat, k) containing the pvalue of the composites obtained through a t-test that accounts for the serial
# dependence of the data with the same structure as Composite.),
#' \code{cluster} (A time series of integers (from 1:k) indicating the cluster to which each point is allocated.),
#' \code{persistence} (The value of the regime whose length is given in cluster_lengths (only if method=’kmeans’ has been selected)),
#' \code{frequency} (Percentage of days in a month/season belonging to each cluster (only if method=’kmeans’ has been selected).),
#'@import s2dverification
#'@import multiApply
#'@examples
#'@export
#'
CST_WeatherRegimes <- function(data,ncenters = NULL,
varThreshold = NULL,
method = "kmeans",
iter.max=100, nstart = 30,
ncores = NULL) {
if (!inherits(data, 's2dv_cube')) {
stop("Parameter 'data' must be of the class 's2dv_cube', ",
"as output by CSTools::CST_Load.")
}
if ('lon' %in% names(data)){
lon <- data$lon
}else {
lon <- NULL
}
result <- WeatherRegime(data$data,ncenters = ncenters,
EOFS = EOFS, neofs = neofs,
varThreshold = varThreshold, lon = lon,
lat = data$lat, method = method,
iter.max=iter.max, nstart = nstart,
ncores = ncores)
data$data <- result$composite
data$statistics <- result[-1]
return(data)
}
#' @rdname WeatherRegimes
#' @title Function for Calculating the Cluster analysis
#'
#' @author Verónica Torralba - BSC, \email{veronica.torralba@bsc.es}
#'
#' @description This function computes the weather regimes from a cluster analysis.
#'It can be applied over the dataset with dimensions
#'c(year/month, month/day, lon, lat), or by using PCs obtained from the application of the
#'EOFs analysis to filter the dataset.
#'The cluster analysis can be performed with the traditional k-means or those methods
#'included in the hclust (stats package).
#'
#'@references Cortesi, N., V., Torralba, N., González-Reviriego, A., Soret, and F.J., Doblas-Reyes (2019).
#' Characterization of European wind speed variability using weather regimes. Climate Dynamics,53,
#' 4961–4976, doi:10.1007/s00382-019-04839-5.
#'@references Torralba, V. (2019) Seasonal climate prediction for the wind energy sector: methods and tools
#' for the development of a climate service. Thesis. Available online: \url{https://eprints.ucm.es/56841/}
#'
#'@param data an array containing anomalies with named dimensions with at least start date 'sdate', forecast time 'ftime', latitude 'lat' and longitude 'lon'.
#'@param EOFs Whether to compute the EOFs (default = 'TRUE') or not (FALSE) over data.
#'@param neofs number of modes to be kept only if EOFs = TRUE has been selected. (default = 30).
#'@param varThreshold Value with the percentage of variance to be explained by the PCs.
#' Only sufficient PCs to explain this much variance will be used in the clustering.
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#'@param lon Vector of longitudes.
#'@param lat Vector of latitudes.
#'@param ncenters Number of clusters to be calculated with the clustering function.
#'@param method Different options to estimate the clusters. The most traditional approach is the k-means analysis (default=’kmeans’)
#'but the function also support the different methods included in the hclust . These methods are:
#'"ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).
#' For more details about these methods see the hclust function documentation included in the stats package.
#'@param nstarts Parameter for the cluster analysis determining how many random sets to choose (Only if method='kmeans' is selected).
#'@param iter.max Parameter to select the maximum number of iterations allowed (Only if method='kmeans' is selected).
#'@param ncores The number of multicore threads to use for parallel computation}.
#'@return A list with elements \code{$composite} (3-d array (lon, lat, k) containing the composites k=1,..,K for case (*1)
# or only k=1 for any specific cluster, i.e., case (*2)),
#' \code{pvalue} (3-d array (lon, lat, k) containing the pvalue of the composites obtained through a t-test that accounts for the serial
# dependence of the data with the same structure as Composite.),
#' \code{cluster} (A time series of integers (from 1:k) indicating the cluster to which each point is allocated.),
#' \code{persistence} (The value of the regime whose length is given in cluster_lengths (only if method=’kmeans’ has been selected)),
#' \code{frequency} (Percentage of days in a month/season belonging to each cluster (only if method=’kmeans’ has been selected).),
#'@import s2dverification
#'@import multiApply
#'@examples
#'@export
WeatherRegime <- function(data, ncenters = NULL,
EOFS = TRUE,neofs = 30,
varThreshold = NULL, lon = NULL,
lat = NULL, method = "kmeans",
iter.max=100, nstart = 30,
ncores = NULL) {
if (is.null(names(dim(data)))) {
stop("Parameter 'data' must be an array with named dimensions")
stop("Parameter 'lat' must be specified.")
dimData <- names(dim(data))
if ('sdate' %in% dimData && 'ftime' %in% dimData) {
nsdates <- dim(data)['sdate']
nftimes <- dim(data)['ftime']
data <- MergeDims(data,
merge_dims = c('ftime','sdate'),
rename_dim = 'time')
} else if ('sdate' %in% dimData | 'ftime' %in% dimData) {
names(dim(data))[which(dimData == 'sdate' | dimData == 'ftime') ] = 'time'
} else {
if (!('time' %in% dimData)) {
stop("Parameter 'data' must have temporal dimension(s).")
}
}
output <- Apply(data = list(data),
target_dims = c('time','lat','lon'),
fun = ".WeatherRegime",
EOFS = EOFS, neofs = neofs,
varThreshold = varThreshold,
lon = lon, lat = lat,
ncenters = ncenters,
method = method,
ncores = ncores)
if (method=='kmeans' && 'sdate' %in% dimData && 'ftime' %in% dimData) {
# The frequency and the persistency are computed as they are useful
# parameters in the cluster analysis
extra_output <- Apply(data = output$cluster,
target_dims = 'time',
fun = .freqPer,
nsdates = nsdates,
nftimes = nftimes ,
ncenters = ncenters)
output <- list(composite=output$composite,
pvalue=output$pvalue,
cluster=output$cluster,
frequency=extra_output$frequency,
persistence=extra_output$persistence)
}
return(output)
}
.WeatherRegime <- function(data, ncenters = NULL, EOFS = TRUE,neofs = 30,
varThreshold = NULL, lon = NULL,
lat = NULL, method = "kmeans",
iter.max=100, nstart = 30) {
if (is.null(names(dim(data)))) {
stop("Parameter 'data' must be an array with 'time', 'lat' and 'lon' dimensions.")
}
if (!is.null(lat) && dim(data)['lat'] != length(lat)) {
stop("The length of the paramter 'lat' does not match with the ['lat'] dimension of
the parameter 'data'")
}
if (is.null(ncenters)) {
stop("Parameter 'ncenters' must be specified")
}
if (EOFS == TRUE && is.null(lon)) {
stop("Parameter 'lon' must be specified")
stop("Parameter 'lat' must be specified")
}
nlon <- dim(data)['lat']
nlat <- dim(data)['lon']
if (EOFS == TRUE) {
if (is.null(varThreshold)) {
dataPC <- EOF(data,
lat = as.vector(lat),
lon = as.vector(lon),
neofs = neofs)
cluster_input <- dataPC$PC
} else {
dataPC <- EOF(data,
lat = as.vector(lat),
lon = as.vector(lon),
minPC <-
head(as.numeric(which(cumsum(dataPC$var) > varThreshold)), 1)
cluster_input <- dataPC$PC[, 1:minPC]
}
} else {
dataW <- aperm(Apply(data, target_dims = 'lat',
function (x, la) {
x * cos(la * pi / 180)},
la = lat)[[1]], c(2, 1, 3))
cluster_input <- MergeDims(dataW, merge_dims = c('lat','lon'),
}
if (method == "kmeans") {
clust <- kmeans(
cluster_input,
centers = ncenters,
iter.max = iter.max,
nstart = nstart,
trace = FALSE)
result <- array(0, c(ncenters, nlat, nlon))
# the order of the data dimensions is changed ('lat','lon','time')
result <- Composite(aperm(data,c(2, 3, 1)), clust$cluster)
} else {
result <- hclust(dist(cluster_input), method = method)
clusterCut <- cutree(result, ncenters)
result <- Composite(aperm(data, c(2, 3, 1)), clusterCut)
}
result <- lapply(1:length(result),
function (n) {
names(dim(result[[n]])) <- c("lon", "lat", "cluster")
return (result[[n]])
})
names(result) <- c('composite','pvalue')
if (method == "kmeans") {
clust <- as.array(clust$cluster)
names(dim(clust)) <- 'time'
composite = result$composite,
pvalue = result$pvalue,
cluster = clust))
} else {
clust <- as.array(clusterCut)
names(dim(clust)) <- 'time'
}
}
.freqPer<- function (clust, nsdates, nftimes, ncenters){
frequency <- persistence <- matrix(NA, nsdates, ncenters)
x <- as.vector(clust)
for (i in 1:nsdates) {
occurences <-rle(x[((i * nftimes) + 1 - nftimes):(i * nftimes)])
for (j in 1:ncenters) {
frequency[i, j] <-(sum(occurences$lengths[occurences$values == j]) / nftimes) * 100
persistence[i, j] <- mean(occurences$lengths[occurences$values == j])
}
}
return(list(frequency = frequency,
persistence = persistence))