Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#' @rdname CST_WeatherRegimes
#' @title Function for Calculating the Cluster analysis
#'
#' @author Verónica Torralba - BSC, \email{veronica.torralba@bsc.es}
#'
#' @description This function computes the weather regimes from a cluster analysis.
#'It is applied on the array \code{data} in a 's2dv_cube' object. The dimensionality of this object can be also reduced
#'by using PCs obtained from the application of the #'EOFs analysis to filter the dataset.
#'The cluster analysis can be performed with the traditional k-means or those methods
#'included in the hclust (stats package).
#'
#'@references Cortesi, N., V., Torralba, N., González-Reviriego, A., Soret, and F.J., Doblas-Reyes (2019).
#' Characterization of European wind speed variability using weather regimes. Climate Dynamics,53,
#' 4961–4976, doi:10.1007/s00382-019-04839-5.
#'@references Torralba, V. (2019) Seasonal climate prediction for the wind energy sector: methods and tools
#' for the development of a climate service. Thesis. Available online: \url{https://eprints.ucm.es/56841/}
#'
#'@param data a 's2dv_cube' object
#'@param EOFs Whether to compute the EOFs (default = 'TRUE') or not (FALSE) over data.
#'@param neofs number of modes to be kept (default = 30).
#'@param varThreshold Value with the percentage of variance to be explained by the PCs.
#' Only sufficient PCs to explain this much variance will be used in the clustering.
#'@param lon Vector of longitudes.
#'@param lat Vector of latitudes.
#'@param ncenters Number of clusters to be calculated with the clustering function.
#'@param method Different options to estimate the clusters. The most traditional approach is the k-means analysis (default=’kmeans’)
#'but the function also support the different methods included in the hclust . These methods are:
#'"ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).
#' For more details about these methods see the hclust function documentation included in the stats package.
#'@param nstarts Parameter for the cluster analysis determining how many random sets to choose (Only if method='kmeans' is selected).
#'@param iter.max Parameter to select the maximum number of iterations allowed (Only if method='kmeans' is selected).
#'@param ncores The number of multicore threads to use for parallel computation}.
#'@return A list with elements \code{$composite} (3-d array (lon, lat, k) containing the composites k=1,..,K for case (*1)
# or only k=1 for any specific cluster, i.e., case (*2)),
#' \code{pvalue} (3-d array (lon, lat, k) containing the pvalue of the composites obtained through a t-test that accounts for the serial
# dependence of the data with the same structure as Composite.),
#' \code{cluster} (A time series of integers (from 1:k) indicating the cluster to which each point is allocated.),
#' \code{persistence} (The value of the regime whose length is given in cluster_lengths (only if method=’kmeans’ has been selected)),
#' \code{frequency} (Percentage of days in a month/season belonging to each cluster (only if method=’kmeans’ has been selected).),
#'@import s2dverification
#'@import multiApply
#'@examples
#'@export
#'
CST_WeatherRegimes <- function(data,ncenters = NULL,
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
EOFS = TRUE,neofs = 30,
varThreshold = NULL, lon = NULL,
lat = NULL, method = "kmeans",
iter.max=100, nstart = 30,
ncores = NULL) {
if (!inherits(data, 's2dv_cube')) {
stop("Parameter 'data' must be of the class 's2dv_cube', ",
"as output by CSTools::CST_Load.")
}
data$data <- WeatherRegime(data$data,ncenters = ncenters ,
EOFS = EOFS, neofs = neofs,
varThreshold = varThreshold, lon = lon,
lat = lat, method = method,
iter.max=iter.max, nstart = nstart,
ncores = ncores)
return(data)
}
#' @rdname WeatherRegimes
#' @title Function for Calculating the Cluster analysis
#'
#' @author Verónica Torralba - BSC, \email{veronica.torralba@bsc.es}
#'
#' @description This function computes the weather regimes from a cluster analysis.
#'It can be applied over the dataset with dimensions
#'c(year/month, month/day, lon, lat), or by using PCs obtained from the application of the
#'EOFs analysis to filter the dataset.
#'The cluster analysis can be performed with the traditional k-means or those methods
#'included in the hclust (stats package).
#'
#'@references Cortesi, N., V., Torralba, N., González-Reviriego, A., Soret, and F.J., Doblas-Reyes (2019).
#' Characterization of European wind speed variability using weather regimes. Climate Dynamics,53,
#' 4961–4976, doi:10.1007/s00382-019-04839-5.
#'@references Torralba, V. (2019) Seasonal climate prediction for the wind energy sector: methods and tools
#' for the development of a climate service. Thesis. Available online: \url{https://eprints.ucm.es/56841/}
#'
#'@param data n array containing anomalies with named dimensions: dataset, member, sdate, ftime, lat and lon.
#'@param EOFs Whether to compute the EOFs (default = 'TRUE') or not (FALSE) over data.
#'@param neofs number of modes to be kept (default = 30).
#'@param varThreshold Value with the percentage of variance to be explained by the PCs.
#' Only sufficient PCs to explain this much variance will be used in the clustering.
#'@param lon Vector of longitudes.
#'@param lat Vector of latitudes.
#'@param ncenters Number of clusters to be calculated with the clustering function.
#'@param method Different options to estimate the clusters. The most traditional approach is the k-means analysis (default=’kmeans’)
#'but the function also support the different methods included in the hclust . These methods are:
#'"ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).
#' For more details about these methods see the hclust function documentation included in the stats package.
#'@param nstarts Parameter for the cluster analysis determining how many random sets to choose (Only if method='kmeans' is selected).
#'@param iter.max Parameter to select the maximum number of iterations allowed (Only if method='kmeans' is selected).
#'@param ncores The number of multicore threads to use for parallel computation}.
#'@return A list with elements \code{$composite} (3-d array (lon, lat, k) containing the composites k=1,..,K for case (*1)
# or only k=1 for any specific cluster, i.e., case (*2)),
#' \code{pvalue} (3-d array (lon, lat, k) containing the pvalue of the composites obtained through a t-test that accounts for the serial
# dependence of the data with the same structure as Composite.),
#' \code{cluster} (A time series of integers (from 1:k) indicating the cluster to which each point is allocated.),
#' \code{persistence} (The value of the regime whose length is given in cluster_lengths (only if method=’kmeans’ has been selected)),
#' \code{frequency} (Percentage of days in a month/season belonging to each cluster (only if method=’kmeans’ has been selected).),
#'@import s2dverification
#'@import multiApply
#'@examples
#'@export
WeatherRegime <- function(data, ncenters = NULL,
EOFS = TRUE,neofs = 30,
varThreshold = NULL, lon = NULL,
lat = NULL, method = "kmeans",
iter.max=100, nstart = 30,
ncores = NULL) {
if (is.null(names(dim(data)))) {
stop('data must be an array with named dimensions')
}
dimData <- names(dim(data))
nsdates <- dim(data)['sdate']
nftimes <- dim(data)['ftime']
if ('sdate' %in% dimData && 'ftime' %in% dimData){
data <- MergeDims(data,
merge_dims = c('ftime','sdate'),
rename_dim = 'time')
}
output <- Apply(data = list(data),
target_dims = c('time','lat','lon'),
fun = ".WeatherRegime",
EOFS = EOFS, neofs = neofs,
varThreshold = varThreshold,
lon = lon, lat = lat,
ncenters = ncenters,
method = method,
ncores = ncores)
if (method=='kmeans' && 'sdate' %in% dimData && 'ftime' %in% dimData){
# The frequency and the persistency are computed as they are useful
# parameters in the cluster analysis
extra_output <- Apply(data=output$cluster,
target_dims = 'time',
fun=.freqPer,
nsdates=nsdates,
nftimes=nftimes ,
ncenters = ncenters)
output <- list(composite=output$composite,
pvalue=output$pvalue,
cluster=output$cluster,
frequency=extra_output$frequency,
persistence=extra_output$persistence)
}
return(output)
}
.WeatherRegime <- function(data, ncenters = NULL, EOFS = TRUE,neofs = 30,
varThreshold = NULL, lon = NULL,
lat = NULL, method = "kmeans",
iter.max=100, nstart = 30) {
if (is.null(names(dim(data)))) {
stop('data must be an array with named dimensions')
}
if (!is.null(lat) && dim(data)['lat'] != length(lat)) {
stop('the latitudes do not match with the lat dimension of data')
}
if (is.null(ncenters)) {
stop("ncenters must be specified")
}
if (EOFS == TRUE && is.null(lon)) {
stop("longitudes must be specified")
}
if (EOFS == TRUE && is.null(lat)) {
stop("latitudes must be specified")
}
nlon <- dim(data)['lat']
nlat <- dim(data)['lon']
if (EOFS == TRUE) {
if (is.null(varThreshold)) {
dataPC <- EOF(data,
lat = as.vector(lat),
lon = as.vector(lon),
neofs = neofs)
cluster_input <- dataPC$PC
} else {
dataPC <- EOF(data,
lat = as.vector(lat),
lon = as.vector(lon),
neofs = 30)
minPC <-
head(as.numeric(which(cumsum(dataPC$var) > varThreshold)), 1)
cluster_input <- dataPC$PC[, 1:minPC]
}
} else {
#if (latWeights){
# latitude weights are applied on the data
dataW <- aperm(Apply(data, target_dims = 'lat',
function (x, la) {
x * cos(la * pi / 180)},
la = lat)[[1]], c(2, 1, 3))
cluster_input <- MergeDims(dataW, merge_dims = c('lat','lon'),
rename_dim = 'space',na.rm=T)
#}else {
#cluster_input <- MergeDims(data2, merge_dims = c('lat','lon'),
# rename_dim = 'space',na.rm=T)
#}
}
if (method == "kmeans") {
if (any(is.na(cluster_input))) {
posnas <- unique(which(is.na(cluster_input), arr.ind = T)[, 2])
cluster_input <- cluster_input[, -posnas]
}
clust <- kmeans(
cluster_input,
centers = ncenters,
iter.max = iter.max,
nstart = nstart,
trace = FALSE)
if (any(is.na(cluster_input))) {
centers <- array(NA, dim = c(ncenters, nlon * nlat))
centers [, -c(posnas)] <- as.array(clust$center)
} else{
centers <- as.array(clust$center)
}
result <- array(0, c(ncenters, nlat, nlon))
# the order of the data dimensions is changed ('lat','lon','time')
result <- Composite(aperm(data,c(3,2,1)), clust$cluster)
} else {
result <- hclust(dist(cluster_input), method = method)
clusterCut <- cutree(result, ncenters)
result <- Composite(aperm(data, c(2, 3, 1)), clusterCut)
}
result <- lapply(1:length(result),
function (n) {names(dim(result[[n]])) <- c("lon", "lat", "cluster")
return (result[[n]])})
names(result) <- c('composite','pvalue')
if (method == "kmeans") {
clust <- as.array(clust$cluster)
names(dim(clust)) <- 'time'
: return(list(
composite = result$composite,
pvalue = result$pvalue,
cluster = clust))
} else {
clust <- as.array(clusterCut)
names(dim(clust)) <- 'time'
return(list(
composite = result$composite,
pvalue = result$pvalue,
cluster = clust))
}
}
.freqPer<- function (clust, nsdates, nftimes, ncenters){
frequency <- persistence <- matrix(NA, nsdates, ncenters)
x <- as.vector(clust)
for (i in 1:nsdates) {
occurences <-rle(x[((i * nftimes) + 1 - nftimes):(i * nftimes)])
for (j in 1:ncenters) {
frequency[i, j] <-(sum(occurences$lengths[occurences$values == j]) / nftimes) * 100
persistence[i, j] <- mean(occurences$lengths[occurences$values == j])
}
}
return(list(frequency=frequency,
persistence=persistence))
}