From 47799e175771d32f31fd381af1bfa483c91a7e5b Mon Sep 17 00:00:00 2001 From: Eva Rifa Date: Fri, 15 Sep 2023 10:28:03 +0200 Subject: [PATCH 1/2] Add changes to a new branch as a fix, the branch merged without the changes being applied --- R/CST_MultiEOF.R | 377 ++++++++++++++++++++++------- man/CST_MultiEOF.Rd | 55 ++++- man/MultiEOF.Rd | 65 ++++- tests/testthat/test-CST_MultiEOF.R | 143 +++++++++-- 4 files changed, 517 insertions(+), 123 deletions(-) diff --git a/R/CST_MultiEOF.R b/R/CST_MultiEOF.R index bd218423..62f6b991 100644 --- a/R/CST_MultiEOF.R +++ b/R/CST_MultiEOF.R @@ -17,20 +17,55 @@ #' "lon" and "lat", a dimension "ftime" and a dimension "sdate". Latitudinal #' dimension accepted names: 'lat', 'lats', 'latitude', 'y', 'j', 'nav_lat'. #' Longitudinal dimension accepted names: 'lon', 'lons','longitude', 'x', 'i', -#' 'nav_lon'. -#'@param neof_composed Number of composed eofs to return in output. -#'@param minvar Minimum variance fraction to be explained in first decomposition. +#' 'nav_lon'. NAs can exist but it should be consistent along 'time_dim'. That +#' is, if one grid point has NAs for each variable, all the time steps at this +#' point should be NAs. +#'@param lon_dim A character string indicating the name of the longitudinal +#' dimension. By default, it is set to 'lon'. +#'@param lat_dim A character string indicating the name of the latitudinal +#' dimension. By default, it is set to 'lat'. +#'@param time_dim A character string indicating the name of the temporal +#' dimension. By default, it is set to 'time'. +#'@param sdate_dim A character string indicating the name of the start date +#' dimension. By default, it is set to 'sdate'. +#'@param var_dim A character string indicating the name of the variable +#' dimension. By default, it is set to 'var'. #'@param neof_max Maximum number of single eofs considered in the first #' decomposition. +#'@param neof_composed Number of composed eofs to return in output. +#'@param minvar Minimum variance fraction to be explained in first decomposition. #'@param lon_lim Vector with longitudinal range limits for the EOF calculation #' for all input variables. #'@param lat_lim Vector with latitudinal range limits for the EOF calculation #' for all input variables. -#'@return A list with elements \code{$coeff} (an array of time-varying principal -#'component coefficients), \code{$variance} (a matrix of explained variances), -#'\code{eof_pattern} (a matrix of EOF patterns obtained by regression for each -#'variable). -#'@import abind +#'@return +#'A list containing: +#'\item{coeff}{ +#' An 's2dv_cube' with the data element being an array of principal components +#' with dimensions 'time_dim', 'sdate_dim', number of eof, rest of the +#' dimensions of 'data' except 'lon_dim' and 'lat_dim'. +#'} +#'\item{variance}{ +#' An 's2dv_cube' with the data element being an array of explained variances +#' with dimensions 'eof' and the rest of the dimensions of 'data' except +#' 'time_dim', 'sdate_dim', 'lon_dim' and 'lat_dim'. +#'} +#'\item{eof_pattern}{ +#' An 's2dv_cube' with the data element being an array of EOF patterns obtained +#' by regression with dimensions: 'eof' and the rest of the dimensions of +#' 'data' except 'time_dim' and 'sdate_dim'. +#'} +#'\item{mask}{ +#' An 's2dv_cube' with the data element being an array of the mask with +#' dimensions ('lon_dim', 'lat_dim', rest of the dimensions of 'data' except +#' 'time_dim'). It is made from 'data', 1 for the positions that 'data' has +#' value and NA for the positions that 'data' has NA. It is used to replace NAs +#' with 0s for EOF calculation and mask the result with NAs again after the +#' calculation. +#'} +#'\item{coordinates}{ +#' Longitudinal and latitudinal coordinates vectors. +#'} #'@examples #'seq <- 1 : (2 * 3 * 4 * 5 * 6 * 8) #'mod1 <- sin( 0.7 + seq )^2 + cos( seq ^ 2 * 1.22 ) @@ -54,34 +89,27 @@ #'exp2$attrs$Dates = d #' #'cal <- CST_MultiEOF(datalist = list(exp1, exp2), neof_composed = 2) +#'@import abind #'@export -CST_MultiEOF <- function(datalist, neof_max = 40, neof_composed = 5, +CST_MultiEOF <- function(datalist, lon_dim = "lon", lat_dim = "lat", + time_dim = 'ftime', sdate_dim = 'sdate', + var_dim = 'var', neof_max = 40, neof_composed = 5, minvar = 0.6, lon_lim = NULL, lat_lim = NULL) { - # Check s2dv_cube + # Check 's2dv_cube' if (!(all(sapply(datalist, inherits, 's2dv_cube')))) { - stop("Elements of the list in parameter 'datalist' must be of the class ", - "'s2dv_cube', as output by CSTools::CST_Load.") - } - - # Check if all dims equal - adims = lapply(lapply(datalist, function(x) x$data), dim) - if(!all(apply(apply(abind(adims, along = 0), 2, duplicated), 2, sum) == - (length(adims)-1))) { - stop("Input data fields must all have the same dimensions.") - } - - exp <- abind(lapply(datalist, '[[', 'data'), along = 0) - dim(exp) <- c(var = length(datalist), dim(datalist[[1]]$data)) - - if (any(is.na(exp))) { - stop("Input data contain NA values.") + stop("Elements of the list in parameter 'datalist' must be of the ", + "class 's2dv_cube'.") } - - # Check coordinates if (!all(c('data', 'coords', 'attrs') %in% names(datalist[[1]]))) { stop("Parameter 'datalist' must have 'data', 'coords' and 'attrs' elements ", "within the 's2dv_cube' structure.") } + # Dates + dates <- datalist[[1]]$attrs$Dates + if (is.null(dates)) { + stop("Element 'Dates' is not found in 'attrs' list of the first array.") + } + # coordinates if (!any(names(datalist[[1]]$coords) %in% .KnownLonNames()) | !any(names(datalist[[1]]$coords) %in% .KnownLatNames())) { stop("Spatial coordinate names do not match any of the names accepted by the ", @@ -89,28 +117,47 @@ CST_MultiEOF <- function(datalist, neof_max = 40, neof_composed = 5, "'nav_lat'. Longitudes accepted names: 'lon', 'lons', 'longitude', 'x',", " 'i', 'nav_lon'.") } - # Check dimensions - if (!any(names(dim(datalist[[1]]$data)) %in% .KnownLonNames()) | - !any(names(dim(datalist[[1]]$data)) %in% .KnownLatNames())) { - stop("Spatial dimension names do not match any of the names accepted by ", - "the package.") - } - lon <- names(datalist[[1]]$coords)[[which(names(datalist[[1]]$coords) %in% .KnownLonNames())]] - lat <- names(datalist[[1]]$coords)[[which(names(datalist[[1]]$coords) %in% .KnownLatNames())]] - - lon_name <- names(dim(datalist[[1]]$data))[[which(names(dim(datalist[[1]]$data)) %in% .KnownLonNames())]] - lat_name <- names(dim(datalist[[1]]$data))[[which(names(dim(datalist[[1]]$data)) %in% .KnownLatNames())]] + # Check if all dims equal + adims = lapply(lapply(datalist, function(x) x$data), dim) + if (!all(apply(apply(abind(adims, along = 0), 2, duplicated), 2, sum) == + (length(adims)-1))) { + stop("Input data fields must all have the same dimensions.") + } - result <- MultiEOF(exp, - lon = as.vector(datalist[[1]]$coords[[lon]]), - lat = as.vector(datalist[[1]]$coords[[lat]]), - lon_dim = lon_name, lat_dim = lat_name, - time = datalist[[1]]$attrs$Dates, minvar = minvar, + exp <- abind(lapply(datalist, '[[', 'data'), along = 0) + dim(exp) <- c(length(datalist), dim(datalist[[1]]$data)) + names(dim(exp)) <- c(var_dim, names(dim(datalist[[1]]$data))) + + lon_name <- names(datalist[[1]]$coords)[[which(names(datalist[[1]]$coords) %in% .KnownLonNames())]] + lat_name <- names(datalist[[1]]$coords)[[which(names(datalist[[1]]$coords) %in% .KnownLatNames())]] + lon <- as.vector(datalist[[1]]$coords[[lon_name]]) + lat <- as.vector(datalist[[1]]$coords[[lat_name]]) + + result <- MultiEOF(exp, lon = lon, lat = lat, + lon_dim = lon_dim, lat_dim = lat_dim, time_dim = time_dim, + sdate_dim = sdate_dim, var_dim = var_dim, + dates = dates, minvar = minvar, neof_max = neof_max, neof_composed = neof_composed, lon_lim = lon_lim, lat_lim = lat_lim) - - return(result) + names_res <- names(result[1:4]) + res <- lapply(seq_along(result)[1:4], function(i) { + coords = list(lon, lat) + names(coords) <- c(lon_dim, lat_dim) + dates <- dates + varName <- names(result)[[i]] + metadata <- lapply(datalist, function(x) x$attrs$Variable$metadata) + metadata <- unlist(metadata, recursive=FALSE) + metadata <- metadata[unique(names(metadata))] + suppressWarnings( + cube <- s2dv_cube(data = result[[i]], coords = coords, varName = varName, Dates = dates, + source_files = unlist(sapply(datalist, function(x) x$attrs$source_files)), + metadata = metadata, when = Sys.time()) + ) + return(cube) + }) + names(res) <- names_res + return(c(res, result[5:6])) } #'@rdname MultiEOF #'@title EOF analysis of multiple variables starting from an array (reduced @@ -132,24 +179,61 @@ CST_MultiEOF <- function(datalist, neof_max = 40, neof_composed = 5, #' as the \code{"exp"} element of a 's2dv_cube' object. Latitudinal #' dimension accepted names: 'lat', 'lats', 'latitude', 'y', 'j', 'nav_lat'. #' Longitudinal dimension accepted names: 'lon', 'lons','longitude', 'x', 'i', -#' 'nav_lon'. +#' 'nav_lon'. NAs can exist but it should be consistent along 'time_dim'. That +#' is, if one grid point has NAs for each variable, all the time steps at this +#' point should be NAs. #'@param lon Vector of longitudes. #'@param lat Vector of latitudes. -#'@param time Vector or matrix of dates in POSIXct format. -#'@param lon_dim String with dimension name of longitudinal coordinate. -#'@param lat_dim String with dimension name of latitudinal coordinate. -#'@param neof_composed Number of composed eofs to return in output. -#'@param minvar Minimum variance fraction to be explained in first decomposition. +#'@param dates Vector or matrix of dates in POSIXct format. +#'@param time Deprecated parameter, it has been substituted by 'dates'. It will +#' be removed in the next release. +#'@param lon_dim A character string indicating the name of the longitudinal +#' dimension. By default, it is set to 'lon'. +#'@param lat_dim A character string indicating the name of the latitudinal +#' dimension. By default, it is set to 'lat'. +#'@param time_dim A character string indicating the name of the temporal +#' dimension. By default, it is set to 'time'. +#'@param sdate_dim A character string indicating the name of the start date +#' dimension. By default, it is set to 'sdate'. +#'@param var_dim A character string indicating the name of the variable +#' dimension. By default, it is set to 'var'. #'@param neof_max Maximum number of single eofs considered in the first #' decomposition. +#'@param neof_composed Number of composed eofs to return in output. +#'@param minvar Minimum variance fraction to be explained in first decomposition. #'@param lon_lim Vector with longitudinal range limits for the calculation for #' all input variables. #'@param lat_lim Vector with latitudinal range limits for the calculation for #' all input variables. -#'@return A list with elements \code{$coeff} (an array of time-varying principal -#'component coefficients), \code{$variance} (a matrix of explained variances), -#'\code{eof_pattern} (a matrix of EOF patterns obtained by regression for each -#'variable). +#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. +#'@return +#'A list containing: +#'\item{coeff}{ +#' An array of principal components with dimensions 'time_dim', 'sdate_dim', +#' number of eof, rest of the dimensions of 'data' except 'lon_dim' and +#' 'lat_dim'. +#'} +#'\item{variance}{ +#' An array of explained variances with dimensions 'eof' and the rest of the +#' dimensions of 'data' except 'time_dim', 'sdate_dim', 'lon_dim' and +#' 'lat_dim'. +#'} +#'\item{eof_pattern}{ +#' An array of EOF patterns obtained by regression with dimensions: 'eof' and +#' the rest of the dimensions of 'data' except 'time_dim' and 'sdate_dim'. +#'} +#'\item{mask}{ +#' An array of the mask with dimensions ('lon_dim', 'lat_dim', rest of the +#' dimensions of 'data' except 'time_dim'). It is made from 'data', 1 for the +#' positions that 'data' has value and NA for the positions that 'data' has NA. +#' It is used to replace NAs with 0s for EOF calculation and mask the result +#' with NAs again after the calculation. +#'} +#'\item{coordinates}{ +#' Longitudinal and latitudinal coordinates vectors. +#'} +#' #'@examples #'exp <- array(runif(1280)*280, dim = c(dataset = 2, member = 2, sdate = 3, #' ftime = 3, lat = 4, lon = 4, var = 1)) @@ -162,38 +246,130 @@ CST_MultiEOF <- function(datalist, neof_max = 40, neof_composed = 5, #'cal <- MultiEOF(data = exp, lon = lon, lat = lat, time = Dates) #'@import multiApply #'@export -MultiEOF <- function(data, lon, lat, time, +MultiEOF <- function(data, lon, lat, dates, time = NULL, lon_dim = "lon", lat_dim = "lat", + time_dim = 'ftime', sdate_dim = 'sdate', var_dim = 'var', neof_max = 40, neof_composed = 5, minvar = 0.6, - lon_lim = NULL, lat_lim = NULL) { - - # Know spatial coordinates names - if (!any(lon_dim %in% .KnownLonNames()) | - !any(lat_dim %in% .KnownLatNames())) { - stop("Spatial coordinate names do not match any of the names accepted by ", - "the package.") + lon_lim = NULL, lat_lim = NULL, ncores = NULL) { + # Check inputs + # data + if (is.null(data)) { + stop("Parameter 'data' cannot be NULL.") + } + if (!is.numeric(data)) { + stop("Parameter 'data' must be a numeric array.") + } + if (any(is.null(names(dim(data))))| any(nchar(names(dim(data))) == 0)) { + stop("Parameter 'data' must have dimension names.") + } + # dates + if (!is.null(time)) { + warning("The parameter 'time' is deprecated, use 'dates' instead.") + dates <- time + } + # lon_dim + if (!is.character(lon_dim) | length(lon_dim) != 1) { + stop("Parameter 'lon_dim' must be a character string.") + } + if (!lon_dim %in% names(dim(data))) { + stop("Parameter 'lon_dim' is not found in 'data' dimension.") + } + # lat_dim + if (!is.character(lat_dim) | length(lat_dim) != 1) { + stop("Parameter 'lat_dim' must be a character string.") + } + if (!lat_dim %in% names(dim(data))) { + stop("Parameter 'lat_dim' is not found in 'data' dimension.") + } + # lon + if (!is.numeric(lon) | length(lon) != dim(data)[lon_dim]) { + stop(paste0("Parameter 'lon' must be a numeric vector with the same ", + "length as the longitude dimension of 'data'.")) + } + if (any(lon > 360 | lon < -360)) { + warning("Some 'lon' is out of the range [-360, 360].") + } + # lat + if (!is.numeric(lat) | length(lat) != dim(data)[lat_dim]) { + stop(paste0("Parameter 'lat' must be a numeric vector with the same ", + "length as the latitude dimension of 'data'.")) + } + if (any(lat > 90 | lat < -90)) { + stop("Parameter 'lat' must contain values within the range [-90, 90].") + } + # time_dim + if (!is.character(time_dim) | length(time_dim) != 1) { + stop("Parameter 'time_dim' must be a character string.") + } + if (!time_dim %in% names(dim(data))) { + stop("Parameter 'time_dim' is not found in 'data' dimension.") + } + # sdate_dim + if (!is.character(sdate_dim) | length(sdate_dim) != 1) { + stop("Parameter 'sdate_dim' must be a character string.") + } + if (!sdate_dim %in% names(dim(data))) { + stop("Parameter 'sdate_dim' is not found in 'data' dimension.") + } + # var_dim + if (!is.character(var_dim) | length(var_dim) != 1) { + stop("Parameter 'var_dim' must be a character string.") + } + if (!var_dim %in% names(dim(data))) { + stop("Parameter 'var_dim' is not found in 'data' dimension.") + } + # neof_max + if (!is.numeric(neof_max)) { + stop("Parameter 'neof_max' must be a positive integer.") + } + # neof_composed + if (!is.numeric(neof_composed)) { + stop("Parameter 'neof_composed' must be a positive integer.") + } + # minvar + if (!is.numeric(minvar)) { + stop("Parameter 'minvar' must be a positive number between 0 and 1.") + } + # lon_lim + if (!is.null(lon_lim)) { + if (!is.numeric(lon_lim)) { + stop("Parameter 'lon_lim' must be numeric.") + } + } + # lat_lim + if (!is.null(lat_lim)) { + if (!is.numeric(lat_lim)) { + stop("Parameter 'lat_lim' must be numeric.") + } + } + # ncores + if (!is.null(ncores)) { + if (!is.numeric(ncores) | ncores %% 1 != 0 | ncores <= 0 | + length(ncores) > 1) { + stop("Parameter 'ncores' must be a positive integer.") + } } # Reorder and group ftime and sdate together at the end in that order - cdim0 <- dim(data) - imaskt <- names(cdim0) %in% "ftime" - imasks <- names(cdim0) %in% "sdate" + imaskt <- names(dim(data)) %in% time_dim + imasks <- names(dim(data)) %in% sdate_dim data <- .aperm2(data, c(which(!(imasks | imaskt)), which(imaskt), which(imasks))) - cdim <- dim(data) + dims <- dim(data) ind <- 1:length(which(!(imaskt | imasks))) # compact (multiply) time_dim dimensions - dim(data) <- c(cdim[ind], samples = prod(cdim[-ind])) + dim(data) <- c(dims[ind], samples = prod(dims[-ind])) # Repeatedly apply .multi.eofs - result <- Apply(data, c("var", lon_dim, lat_dim, "samples"), - .multi.eofs, lon, lat, time, neof_max = neof_max, - neof_composed = neof_composed, minvar = minvar, - xlim = lon_lim, ylim = lat_lim, - lon_dim = lon_dim, lat_dim = lat_dim) + result <- Apply(data = data, + target_dims = c(var_dim, lon_dim, lat_dim, "samples"), + fun = .multi.eofs, lon = lon, lat = lat, dates = dates, + neof_max = neof_max, neof_composed = neof_composed, + minvar = minvar, xlim = lon_lim, ylim = lat_lim, + lon_dim = lon_dim, lat_dim = lat_dim, ncores = ncores) # Expand back samples to compacted dims - dim(result$coeff) <- c(cdim[-ind], dim(result$coeff)[-1]) + dim(result$coeff) <- c(dims[-ind], dim(result$coeff)[-1]) # Recover first lon and first lat list dd = dim(result[[lon_dim]])[1]; m = matrix(1, nrow = dd, ncol = length(dim(result[[lon_dim]]))); m[1:dd] = 1:dd; result[[lon_dim]] = result[[lon_dim]][m] @@ -222,7 +398,7 @@ MultiEOF <- function(data, lon, lat, time, #'variable). #'@noRd -.multi.eofs <- function(field_arr_raw, lon, lat, time, neof_max = 40, +.multi.eofs <- function(field_arr_raw, lon, lat, dates, neof_max = 40, neof_composed = 5, minvar = 0.6, xlim = NULL, ylim = NULL, lon_dim = "lon", lat_dim = "lat") { @@ -231,9 +407,14 @@ MultiEOF <- function(data, lon, lat, time, } else { lin.fit <- lm.fit } - + + # Dimensions n_field <- dim(field_arr_raw)[1] - etime <- .power.date(time) + n_lon <- dim(field_arr_raw)[2] + n_lat <- dim(field_arr_raw)[3] + nt <- dim(field_arr_raw)[4] + + etime <- .power.date(dates) field_arr <- array(dim = dim(field_arr_raw)) for (k in seq(1, n_field, 1)) { @@ -243,8 +424,33 @@ MultiEOF <- function(data, lon, lat, time, # area weighting, based on the root of cosine ww <- .area.weight(lon, lat, root = T) + # create a mask + mask_arr <- array(dim = c(n_lon, n_lat, n_field)) + for (k in seq(1, n_field, 1)) { field_orig <- field_arr[k, , , ] + + # Check if all the time steps at one grid point are NA-consistent + # The grid point should have all NAs or no NA along time dim. + if (anyNA(field_orig)) { + field_latlon <- array(field_orig, dim = c(n_lon*n_lat, nt)) # [lon*lat, time] + na_ind <- which(is.na(field_latlon), arr.ind = T) + if (dim(na_ind)[1] != nt * length(unique(na_ind[,1]))) { + stop("Detected certain grid points have NAs but not consistent across time ", + "dimension. If the grid point is NA, it should have NA at all time step.") + } + } + # Build the mask + mask <- field_orig[, , 1] + mask[!is.finite(mask)] <- NA + mask[is.finite(mask)] <- 1 + dim(mask) <- c(n_lon, n_lat) + mask_arr[,,k] <- mask + + # Replace mask of NAs with 0s for EOF analysis. + field_arr[k, , , ][!is.finite(field_orig)] <- 0 + field_orig[!is.finite(field_orig)] <- 0 + # calculate the area weight field <- sweep(field_orig, c(1, 2), ww, "*") idx <- .selbox(lon, lat, xlim, ylim) @@ -274,7 +480,7 @@ MultiEOF <- function(data, lon, lat, time, for (i in 1:reqPC) { regression[, , i] <- apply(field_orig, c(1, 2), function(x) lin.fit(as.matrix(coefficient[, i], - ncol = 1), x)$coefficients) + ncol = 1), x)$coefficients)*mask } assign(paste0("pc", k), list(coeff = coefficient, variance = variance, wcoeff = sweep(coefficient, c(2), variance, "*"), @@ -300,19 +506,20 @@ MultiEOF <- function(data, lon, lat, time, for (i in 1:neof_composed) { regression[k, , , i] <- apply(field_arr[k, , , ], c(1, 2), function(x) lin.fit(as.matrix(coefficient[, i], - ncol = 1), x)$coefficients) + ncol = 1), x)$coefficients)*mask_arr[,,k] } } - names(dim(coefficient)) <- c("time", "eof") + names(dim(coefficient)) <- c("dates", "eof") variance <- array(variance) names(dim(variance)) <- "eof" - names(dim(regression)) <- c("var", "lon", "lat", "eof") + names(dim(regression)) <- c(names(dim(field_arr_raw))[1:3], "eof") - out <- list(coeff = coefficient, variance = variance, eof_pattern = regression) + out <- list(coeff = coefficient, variance = variance, eof_pattern = regression, + mask = mask_arr) - out[[lon_dim]] <- slon - out[[lat_dim]] <- slat + out[[names(n_lon)]] <- slon + out[[names(n_lat)]] <- slat return(out) } diff --git a/man/CST_MultiEOF.Rd b/man/CST_MultiEOF.Rd index 11f8877f..265eabd4 100644 --- a/man/CST_MultiEOF.Rd +++ b/man/CST_MultiEOF.Rd @@ -6,6 +6,11 @@ \usage{ CST_MultiEOF( datalist, + lon_dim = "lon", + lat_dim = "lat", + time_dim = "ftime", + sdate_dim = "sdate", + var_dim = "var", neof_max = 40, neof_composed = 5, minvar = 0.6, @@ -20,7 +25,24 @@ an element named \code{$data} with at least two spatial dimensions named "lon" and "lat", a dimension "ftime" and a dimension "sdate". Latitudinal dimension accepted names: 'lat', 'lats', 'latitude', 'y', 'j', 'nav_lat'. Longitudinal dimension accepted names: 'lon', 'lons','longitude', 'x', 'i', -'nav_lon'.} +'nav_lon'. NAs can exist but it should be consistent along 'time_dim'. That +is, if one grid point has NAs for each variable, all the time steps at this +point should be NAs.} + +\item{lon_dim}{A character string indicating the name of the longitudinal +dimension. By default, it is set to 'lon'.} + +\item{lat_dim}{A character string indicating the name of the latitudinal +dimension. By default, it is set to 'lat'.} + +\item{time_dim}{A character string indicating the name of the temporal +dimension. By default, it is set to 'time'.} + +\item{sdate_dim}{A character string indicating the name of the start date +dimension. By default, it is set to 'sdate'.} + +\item{var_dim}{A character string indicating the name of the variable +dimension. By default, it is set to 'var'.} \item{neof_max}{Maximum number of single eofs considered in the first decomposition.} @@ -36,10 +58,33 @@ for all input variables.} for all input variables.} } \value{ -A list with elements \code{$coeff} (an array of time-varying principal -component coefficients), \code{$variance} (a matrix of explained variances), -\code{eof_pattern} (a matrix of EOF patterns obtained by regression for each -variable). +A list containing: +\item{coeff}{ + An 's2dv_cube' with the data element being an array of principal components + with dimensions 'time_dim', 'sdate_dim', number of eof, rest of the + dimensions of 'data' except 'lon_dim' and 'lat_dim'. +} +\item{variance}{ + An 's2dv_cube' with the data element being an array of explained variances + with dimensions 'eof' and the rest of the dimensions of 'data' except + 'time_dim', 'sdate_dim', 'lon_dim' and 'lat_dim'. +} +\item{eof_pattern}{ + An 's2dv_cube' with the data element being an array of EOF patterns obtained + by regression with dimensions: 'eof' and the rest of the dimensions of + 'data' except 'time_dim' and 'sdate_dim'. +} +\item{mask}{ + An 's2dv_cube' with the data element being an array of the mask with + dimensions ('lon_dim', 'lat_dim', rest of the dimensions of 'data' except + 'time_dim'). It is made from 'data', 1 for the positions that 'data' has + value and NA for the positions that 'data' has NA. It is used to replace NAs + with 0s for EOF calculation and mask the result with NAs again after the + calculation. +} +\item{coordinates}{ + Longitudinal and latitudinal coordinates vectors. +} } \description{ This function performs EOF analysis over multiple variables, diff --git a/man/MultiEOF.Rd b/man/MultiEOF.Rd index 04963e1a..dbefda9d 100644 --- a/man/MultiEOF.Rd +++ b/man/MultiEOF.Rd @@ -9,14 +9,19 @@ MultiEOF( data, lon, lat, - time, + dates, + time = NULL, lon_dim = "lon", lat_dim = "lat", + time_dim = "ftime", + sdate_dim = "sdate", + var_dim = "var", neof_max = 40, neof_composed = 5, minvar = 0.6, lon_lim = NULL, - lat_lim = NULL + lat_lim = NULL, + ncores = NULL ) } \arguments{ @@ -25,17 +30,33 @@ the variables to be analysed. The other diemnsions follow the same structure as the \code{"exp"} element of a 's2dv_cube' object. Latitudinal dimension accepted names: 'lat', 'lats', 'latitude', 'y', 'j', 'nav_lat'. Longitudinal dimension accepted names: 'lon', 'lons','longitude', 'x', 'i', -'nav_lon'.} +'nav_lon'. NAs can exist but it should be consistent along 'time_dim'. That +is, if one grid point has NAs for each variable, all the time steps at this +point should be NAs.} \item{lon}{Vector of longitudes.} \item{lat}{Vector of latitudes.} -\item{time}{Vector or matrix of dates in POSIXct format.} +\item{dates}{Vector or matrix of dates in POSIXct format.} -\item{lon_dim}{String with dimension name of longitudinal coordinate.} +\item{time}{Deprecated parameter, it has been substituted by 'dates'. It will +be removed in the next release.} -\item{lat_dim}{String with dimension name of latitudinal coordinate.} +\item{lon_dim}{A character string indicating the name of the longitudinal +dimension. By default, it is set to 'lon'.} + +\item{lat_dim}{A character string indicating the name of the latitudinal +dimension. By default, it is set to 'lat'.} + +\item{time_dim}{A character string indicating the name of the temporal +dimension. By default, it is set to 'time'.} + +\item{sdate_dim}{A character string indicating the name of the start date +dimension. By default, it is set to 'sdate'.} + +\item{var_dim}{A character string indicating the name of the variable +dimension. By default, it is set to 'var'.} \item{neof_max}{Maximum number of single eofs considered in the first decomposition.} @@ -49,12 +70,36 @@ all input variables.} \item{lat_lim}{Vector with latitudinal range limits for the calculation for all input variables.} + +\item{ncores}{An integer indicating the number of cores to use for parallel +computation. The default value is NULL.} } \value{ -A list with elements \code{$coeff} (an array of time-varying principal -component coefficients), \code{$variance} (a matrix of explained variances), -\code{eof_pattern} (a matrix of EOF patterns obtained by regression for each -variable). +A list containing: +\item{coeff}{ + An array of principal components with dimensions 'time_dim', 'sdate_dim', + number of eof, rest of the dimensions of 'data' except 'lon_dim' and + 'lat_dim'. +} +\item{variance}{ + An array of explained variances with dimensions 'eof' and the rest of the + dimensions of 'data' except 'time_dim', 'sdate_dim', 'lon_dim' and + 'lat_dim'. +} +\item{eof_pattern}{ + An array of EOF patterns obtained by regression with dimensions: 'eof' and + the rest of the dimensions of 'data' except 'time_dim' and 'sdate_dim'. +} +\item{mask}{ + An array of the mask with dimensions ('lon_dim', 'lat_dim', rest of the + dimensions of 'data' except 'time_dim'). It is made from 'data', 1 for the + positions that 'data' has value and NA for the positions that 'data' has NA. + It is used to replace NAs with 0s for EOF calculation and mask the result + with NAs again after the calculation. +} +\item{coordinates}{ + Longitudinal and latitudinal coordinates vectors. +} } \description{ This function performs EOF analysis over multiple variables, diff --git a/tests/testthat/test-CST_MultiEOF.R b/tests/testthat/test-CST_MultiEOF.R index 66a63525..a2f66257 100644 --- a/tests/testthat/test-CST_MultiEOF.R +++ b/tests/testthat/test-CST_MultiEOF.R @@ -32,6 +32,9 @@ mod3 <- cos( 0.5 + seq ) + sin( seq ^ 2 * 0.2 ) dim(mod3) <- c(dataset = 2, member = 3, sdate = 4, ftime = 5, lat = 6, lon = 8) exp3$data <- mod3 +dat3 <- mod3 +dim(dat3) <- c(var = 1, dim(mod3)) + # dat0 dat0 <- exp1 dat01 <- exp2 @@ -39,27 +42,39 @@ dat0$coords <- NULL dat01$coords <- NULL dat02 <- dat0 dat03 <- dat01 +dat04 <- exp1 +dat04$attrs$Dates <- NULL dat02$coords <- list(long = seq(1:4), lati = seq(1:4)) dat03$coords <- list(long = seq(1:4), lati = seq(1:4)) +# dat4 +exp4 <- array(runif(1280)*280, dim = c(dataset = 2, member = 2, sdates = 3, + time = 3, latitude = 4, longitude = 4, vars = 1)) +lon4 <- seq(0, 3) +lat4 <- seq(47, 44) +dates4 <- c("2000-11-01", "2000-12-01", "2001-01-01", "2001-11-01", + "2001-12-01", "2002-01-01", "2002-11-01", "2002-12-01", "2003-01-01") +Dates4 <- as.POSIXct(dates4, format = "%Y-%m-%d") +dim(Dates4) <- c(ftime = 3, sdate = 3) + ############################################## test_that("1. Input checks", { expect_error( CST_MultiEOF(datalist = 1), - paste0("Elements of the list in parameter 'datalist' must be of the class ", - "'s2dv_cube', as output by CSTools::CST_Load.") + paste0("Elements of the list in parameter 'datalist' must be of the class.", + "'s2dv_cube'.") ) - # Check if all dims equal - expect_error( - CST_MultiEOF(list(exp1, exp03)), - "Input data fields must all have the same dimensions." - ) - # Know spatial coordinates names expect_error( CST_MultiEOF(list(dat0, dat01)), paste0("Parameter 'datalist' must have 'data', 'coords' and 'attrs' elements ", "within the 's2dv_cube' structure.") ) + # Dates + expect_error( + CST_MultiEOF(list(dat04)), + "Element 'Dates' is not found in 'attrs' list of the first array." + ) + # coordinates expect_error( CST_MultiEOF(list(dat02, dat03)), paste0("Spatial coordinate names do not match any of the names accepted by ", @@ -67,16 +82,69 @@ test_that("1. Input checks", { " 'y', 'j', 'nav_lat'. Longitudes accepted names: 'lon', 'lons',", " 'longitude', 'x', 'i', 'nav_lon'.") ) + # Check if all dims equal + expect_error( + CST_MultiEOF(list(exp1, exp03)), + "Input data fields must all have the same dimensions." + ) + expect_error( + CST_MultiEOF(list(exp1, exp2, exp3), lon_lim = c(-250, -245), lat_lim = c(10, 25)), + "No intersection between longitude bounds and data domain." + ) +}) + +############################################## + +test_that("2. Input checks MultiEOF", { + # time + expect_warning( + MultiEOF(data = dat3, lon = lon, lat = lat, time = d), + "The parameter 'time' is deprecated, use 'dates' instead." + ) + expect_error( + MultiEOF(data = 1, lon = lon, lat = lat, time = d), + "Parameter 'data' must have dimension names." + ) + # lon_dim + expect_error( + MultiEOF(data = dat3, lon = lon, lat = lat, dates = d, lon_dim = 'lons'), + "Parameter 'lon_dim' is not found in 'data' dimension." + ) + # lat_dim + expect_error( + MultiEOF(data = dat3, lon = lon, lat = lat, dates = d, lat_dim = 1), + "Parameter 'lat_dim' must be a character string." + ) + # lon expect_error( MultiEOF(data = array(rnorm(96), dim = c(var = 2, lonss = 8, latss = 6)), lon = seq(1:7), lat = seq(1:5), lon_dim = 'lonss', lat_dim = 'latss'), - paste0("Spatial coordinate names do not match any of the names accepted by ", - "the package.") + paste0("Parameter 'lon' must be a numeric vector with the same ", + "length as the longitude dimension of 'data'.") ) + # lat expect_error( - CST_MultiEOF(list(exp1, exp2, exp3), lon_lim = c(-250, -245), lat_lim = c(10, 25)), - "No intersection between longitude bounds and data domain.") -}) + MultiEOF(data = array(rnorm(96), dim = c(var = 2, lonss = 8, latss = 6)), + lon = seq(1:8), lat = seq(1:5), lon_dim = 'lonss', lat_dim = 'latss'), + paste0("Parameter 'lat' must be a numeric vector with the same ", + "length as the latitude dimension of 'data'.") + ) + # time_dim + expect_error( + MultiEOF(data = dat3, lon = lon, lat = lat, dates = d, time_dim = 'lons'), + "Parameter 'time_dim' is not found in 'data' dimension." + ) + # sdate_dim + expect_error( + MultiEOF(data = dat3, lon = lon, lat = lat, dates = d, sdate_dim = 1), + "Parameter 'sdate_dim' must be a character string." + ) + # var_dim + expect_error( + MultiEOF(data = dat3, lon = lon, lat = lat, dates = d, var_dim = 'vars'), + "Parameter 'var_dim' is not found in 'data' dimension." + ) +}) ############################################## @@ -84,47 +152,47 @@ test_that("2. Output checks", { cal <- CST_MultiEOF(datalist = list(exp1, exp2, exp3), neof_composed=2) expect_equal( length(cal), - 5 + 6 ) dimexp = dim(exp1$data) expect_equal( - dim(cal$coeff), + dim(cal$coeff$data), c(dimexp["ftime"], dimexp["sdate"], eof=2, dimexp["dataset"], dimexp["member"]) ) expect_equal( - dim(cal$variance), + dim(cal$variance$data), c(eof = 2, dimexp["dataset"], dimexp["member"]) ) expect_equal( - dim(cal$eof_pattern), + dim(cal$eof_pattern$data), c(var = 3, dimexp["lon"], dimexp["lat"], eof = 2, dimexp["dataset"], dimexp["member"]) ) expect_equal( - cal$variance[1, 1, 1], + cal$variance$data[1, 1, 1], 0.2909419, tolerance = .00001 ) expect_equal( - cal$coeff[2, 1, 1, 1, 1], + cal$coeff$data[2, 1, 1, 1, 1], 0.5414261, tolerance = .00001 ) expect_equal( - cal$eof_pattern[1, 2, 2, 2, 1, 1], + cal$eof_pattern$data[1, 2, 2, 2, 1, 1], 0.3932484, tolerance = .00001 ) cal <- CST_MultiEOF(list(exp1, exp2, exp3), neof_max = 5, neof_composed = 2, minvar = 0.2) expect_equal( - cal$coeff[2, 1, 1, 1, 1], + cal$coeff$data[2, 1, 1, 1, 1], -0.6117927, tolerance = .00001 ) cal <- CST_MultiEOF(list(exp1, exp2, exp3), lon_lim = c(5, 30), lat_lim = c(10, 25)) expect_equal( - cal$coeff[2, 1, 1, 1, 1], + cal$coeff$data[2, 1, 1, 1, 1], 0.8539488, tolerance = .00001 ) @@ -152,6 +220,35 @@ test_that("2. Output checks", { exp3$data[1, 1, 1, 1, 1, 1] = NaN expect_error( CST_MultiEOF(list(exp1, exp3), neof_max = 8, neof_composed=2), - "Input data contain NA values." + paste0("Detected certain grid points have NAs but not consistent across time ", + "dimension. If the grid point is NA, it should have NA at all time step.") ) }) + +############################################## + +test_that("3. Output checks II", { + # time_dim, sdate_dim, var_dim, lon_dim, lat_dim + cal <- MultiEOF(data = exp4, lon = lon4, lat = lat4, dates = Dates4, + time_dim = 'time', sdate_dim = 'sdates', var_dim = 'vars', + lon_dim = 'longitude', lat_dim = 'latitude') + expect_equal( + dim(cal[[3]]), + c(vars = 1, longitude = 4, latitude = 4, eof = 5, dataset = 2, member = 2) + ) + # NA + exp4_1 <- exp4 + exp4_1[1,2,1,1,1:2,1,1] <- NA # random NA + expect_error( + MultiEOF(data = exp4_1, lon = lon4, lat = lat4, dates = Dates4, + time_dim = 'time', sdate_dim = 'sdates', var_dim = 'vars', + lon_dim = 'longitude', lat_dim = 'latitude'), + paste0("Detected certain grid points have NAs but not consistent across time ", + "dimension. If the grid point is NA, it should have NA at all time step.") + ) + exp4_2 <- exp4 + exp4_2[,,,,1,1,] <- NA # spatial NA + cal <- MultiEOF(data = exp4_2, lon = lon4, lat = lat4, dates = Dates4, + time_dim = 'time', sdate_dim = 'sdates', var_dim = 'vars', + lon_dim = 'longitude', lat_dim = 'latitude') +}) -- GitLab From ad0c47b1feba2246bd89906c8cdf156762837634 Mon Sep 17 00:00:00 2001 From: Eva Rifa Date: Fri, 15 Sep 2023 10:44:40 +0200 Subject: [PATCH 2/2] Add ncores parameter and correct example in MultiEOF --- R/CST_MultiEOF.R | 11 +++++++---- man/CST_MultiEOF.Rd | 6 +++++- man/MultiEOF.Rd | 2 +- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/R/CST_MultiEOF.R b/R/CST_MultiEOF.R index 62f6b991..3a6c9026 100644 --- a/R/CST_MultiEOF.R +++ b/R/CST_MultiEOF.R @@ -38,6 +38,8 @@ #' for all input variables. #'@param lat_lim Vector with latitudinal range limits for the EOF calculation #' for all input variables. +#'@param ncores An integer indicating the number of cores to use for parallel +#' computation. The default value is NULL. #'@return #'A list containing: #'\item{coeff}{ @@ -94,7 +96,8 @@ CST_MultiEOF <- function(datalist, lon_dim = "lon", lat_dim = "lat", time_dim = 'ftime', sdate_dim = 'sdate', var_dim = 'var', neof_max = 40, neof_composed = 5, - minvar = 0.6, lon_lim = NULL, lat_lim = NULL) { + minvar = 0.6, lon_lim = NULL, lat_lim = NULL, + ncores = NULL) { # Check 's2dv_cube' if (!(all(sapply(datalist, inherits, 's2dv_cube')))) { stop("Elements of the list in parameter 'datalist' must be of the ", @@ -134,12 +137,12 @@ CST_MultiEOF <- function(datalist, lon_dim = "lon", lat_dim = "lat", lon <- as.vector(datalist[[1]]$coords[[lon_name]]) lat <- as.vector(datalist[[1]]$coords[[lat_name]]) - result <- MultiEOF(exp, lon = lon, lat = lat, + result <- MultiEOF(data = exp, lon = lon, lat = lat, lon_dim = lon_dim, lat_dim = lat_dim, time_dim = time_dim, sdate_dim = sdate_dim, var_dim = var_dim, dates = dates, minvar = minvar, neof_max = neof_max, neof_composed = neof_composed, - lon_lim = lon_lim, lat_lim = lat_lim) + lon_lim = lon_lim, lat_lim = lat_lim, ncores = ncores) names_res <- names(result[1:4]) res <- lapply(seq_along(result)[1:4], function(i) { coords = list(lon, lat) @@ -243,7 +246,7 @@ CST_MultiEOF <- function(datalist, lon_dim = "lon", lat_dim = "lat", #' "2001-12-01", "2002-01-01", "2002-11-01", "2002-12-01", "2003-01-01") #'Dates <- as.POSIXct(dates, format = "%Y-%m-%d") #'dim(Dates) <- c(ftime = 3, sdate = 3) -#'cal <- MultiEOF(data = exp, lon = lon, lat = lat, time = Dates) +#'cal <- MultiEOF(data = exp, lon = lon, lat = lat, dates = Dates) #'@import multiApply #'@export MultiEOF <- function(data, lon, lat, dates, time = NULL, diff --git a/man/CST_MultiEOF.Rd b/man/CST_MultiEOF.Rd index 265eabd4..7162c947 100644 --- a/man/CST_MultiEOF.Rd +++ b/man/CST_MultiEOF.Rd @@ -15,7 +15,8 @@ CST_MultiEOF( neof_composed = 5, minvar = 0.6, lon_lim = NULL, - lat_lim = NULL + lat_lim = NULL, + ncores = NULL ) } \arguments{ @@ -56,6 +57,9 @@ for all input variables.} \item{lat_lim}{Vector with latitudinal range limits for the EOF calculation for all input variables.} + +\item{ncores}{An integer indicating the number of cores to use for parallel +computation. The default value is NULL.} } \value{ A list containing: diff --git a/man/MultiEOF.Rd b/man/MultiEOF.Rd index dbefda9d..fb8eefa6 100644 --- a/man/MultiEOF.Rd +++ b/man/MultiEOF.Rd @@ -119,7 +119,7 @@ dates <- c("2000-11-01", "2000-12-01", "2001-01-01", "2001-11-01", "2001-12-01", "2002-01-01", "2002-11-01", "2002-12-01", "2003-01-01") Dates <- as.POSIXct(dates, format = "\%Y-\%m-\%d") dim(Dates) <- c(ftime = 3, sdate = 3) -cal <- MultiEOF(data = exp, lon = lon, lat = lat, time = Dates) +cal <- MultiEOF(data = exp, lon = lon, lat = lat, dates = Dates) } \author{ Jost von Hardenberg - ISAC-CNR, \email{j.vonhardenberg@isac.cnr.it} -- GitLab