diff --git a/R/Analogs.R b/R/Analogs.R index feb063ef258c1524d1e860843356e0c931798bd3..062425faddf7bb068e1c7058db0ddae32dd79b14 100644 --- a/R/Analogs.R +++ b/R/Analogs.R @@ -1,16 +1,24 @@ #'@rdname CST_Analogs -#'@title Downscaling using Analogs based on large scale fields. +#'@title Downscaling using Analogs based on coarse scale fields. #' #'@author J. Ramon, \email{jaume.ramon@bsc.es} #' #'@description This function performs a downscaling using Analogs. To compute -#'the analogs given a coarse-scale field, the function looks for days with -#'similar conditions in the historical observations. The coarse scale and -#'observation data can be either global or regional. In the latter case, the -#'region is defined by the user. In principle, the coarse and observation data -#'should be of the same variable, although different variables can also be admitted. -#'The analogs function will find the N best analogs based in Minimum Euclidean -#'distance. +#'the analogs given a coarse-scale field, the function looks for days with similar conditions +#'in the historical observations. The analogs function determines the N best analogs based +#'on Euclidean distance, distance correlation, or Spearman's correlation metrics. To downscale +#'a local-scale variable, either the variable itself or another large-scale variable +#'can be utilized as the predictor. In the first scenario, analogs are examined between +#'the observation and model data of the same local-scale variable. In the latter scenario, +#'the function identifies the day in the observation data that closely resembles +#'the large-scale pattern of interest in the model. When it identifies the date of +#'the best analog, the function extracts the corresponding local-scale variable for that day +#'from the observation of the local scale variable. The used local-scale and large-scale +#'variables can be retrieved from independent regions. 
The input data for the first case must +#'include 'exp' and 'obs,' while in the second case, 'obs,' 'obsL,' and 'exp' are the +#'required input fields. Users can perform the downscaling process over the subregions +#'that can be identified through the 'region' argument, instead of focusing +#'on the entire area of the loaded data. #' #'The search of analogs must be done in the longest dataset posible, but might #'require high-memory computational resources. This is important since it is @@ -21,25 +29,24 @@ #'is intended to downscale climate prediction data (i.e., sub-seasonal, seasonal #'and decadal predictions) but can admit climate projections or reanalyses. It does #'not have constrains of specific region or variables to downscale. -#'@param exp an 's2dv' object with named dimensions containing the experimental field on -#'the coarse scale for which the downscaling is aimed. The object must have, at least, -#'the dimensions latitude, longitude, start date and time. The object is expected to be -#'already subset for the desired region. Data can be in one or two integrated regions, e.g., -#'crossing the Greenwich meridian. To get the correct results in the latter case, -#'the borders of the region should be specified in the parameter 'region'. See parameter -#''region'. -#'@param obs an 's2dv' object with named dimensions containing the observational field. -#'The object must have, at least, the dimensions latitude, longitude and start date. -#'The object is expected to be already subset for the desired region. -#'@param obs2 an 's2dv' object with named dimensions containing a different observational -#'field to that in 'obs'. If provided, these observations will be used in the training, -#'i.e. the searching for analogs, so that they should be in a coarser grid to those in -#''obs'. Training with observations on a grid with a spatial resolution closer to that -#'in 'exp', will in principle ensure better results. 
The object must have, at least, the -#'dimensions latitude, longitude and start date. The object is expected to be already -#'subset for the desired region. -#'@param grid_exp a character vector with a path to an example file of the exp data. -#'It can be either a path to another NetCDF file which to read the target grid from +#'@param exp an 's2dv_cube' object with named dimensions containing the experimental field +#'on the coarse scale for the variable targeted for downscaling (in case obsL is not provided) +#'or for the large-scale variable used as the predictor (if obsL is provided). +#'The object must have, at least, the dimensions latitude, longitude, start date and time. +#'The object is expected to be already subset for the desired region. Data can be in one +#'or two integrated regions, e.g., crossing the Greenwich meridian. To get the correct +#'results in the latter case, the borders of the region should be specified in the parameter +#''region'. See parameter 'region'. +#'@param obs an 's2dv_cube' object with named dimensions containing the observational field +#'for the variable targeted for downscaling. The object must have, at least, the dimensions +#'latitude, longitude and start date. The object is expected to be already subset for the +#'desired region. +#'@param obsL an 's2dv_cube' object with named dimensions containing the observational +#'field of the large-scale variable. The object must have, at least, the dimensions latitude, +#'longitude and start date. The object is expected to be already subset for the desired region. +#'@param grid_exp a character vector with a path to an example file of the exp (if the +#'predictor is the local scale variable) or expL (if the predictor is a large scale variable) +#'data. 
It can be either a path to another NetCDF file which to read the target grid from #'(a single grid must be defined in such file) or a character vector indicating the #'coarse grid to be passed to CDO, and it must be a grid recognised by CDO. #'@param nanalogs an integer indicating the number of analogs to be searched @@ -56,6 +63,9 @@ #''data' in exp and obs. Default set to "time". #'@param member_dim a character vector indicating the member dimension name in the element #''data' in exp and obs. Default set to "member". +#'@param metric a character vector to select the analog specification method. Only these +#'options are valid: "dist" (i.e., Euclidean distance), "dcor" (i.e., distance correlation) +#'or "cor" (i.e., Spearman's correlation). The default metric is "dist". #'@param region a numeric vector indicating the borders of the region defined in exp. #'It consists of four elements in this order: lonmin, lonmax, latmin, latmax. lonmin refers #'to the left border, while lonmax refers to the right border. latmin indicates the lower #'border, whereas latmax indicates the upper one. If set to NULL (default), the function #'takes the first and last elements of the latitudes and longitudes. #'@param return_indices a logical vector indicating whether to return the indices of the #'analogs together with the downscaled field. Default to FALSE. #'@param loocv_window a logical vector only to be used if 'obs' does not have the dimension #''window'. It indicates whether to apply leave-one-out cross-validation in the creation #'of the window. It is recommended to be set to TRUE. Default to TRUE. #'@param ncores an integer indicating the number of cores to use in parallel computation. #'The default value is NULL. -#'@return An 's2dv' object. The element 'data' contains the dowscaled field, 'lat' the +#'@return An 's2dv_cube' object. The element 'data' contains the downscaled field, 'lat' the #'downscaled latitudes, and 'lon' the downscaled longitudes. If fun_analog is set to NULL #'(default), the output array in 'data' also contains the dimension 'analog' with the best #'analog days. 
@@ -81,31 +91,40 @@ #'dim(obs) <- c(lat = 12, lon = 15, sdate = 5, time = 30) #'obs_lons <- seq(0,6, 6/14) #'obs_lats <- seq(0,6, 6/11) -#'exp <- s2dv_cube(data = exp, lat = exp_lats, lon = exp_lons) -#'obs <- s2dv_cube(data = obs, lat = obs_lats, lon = obs_lons) +#'exp <- s2dv_cube(data = exp, coords = list(lat = exp_lats, lon = exp_lons)) +#'obs <- s2dv_cube(data = obs, coords = list(lat = obs_lats, lon = obs_lons)) #'downscaled_field <- CST_Analogs(exp = exp, obs = obs, grid_exp = 'r360x180') #'@export -CST_Analogs <- function(exp, obs, grid_exp, obs2 = NULL, nanalogs = 3, fun_analog = NULL, lat_dim = "lat", - lon_dim = "lon", sdate_dim = "sdate", time_dim = "time", member_dim = "member", - region = NULL, return_indices = FALSE, loocv_window = TRUE, ncores = NULL) { - - # input exp and obs must be s2dv_cube objects - if (!inherits(exp,'s2dv_cube')) { - stop("Parameter 'exp' must be of the class 's2dv_cube'") - } +CST_Analogs <- function(exp, obs, obsL = NULL, grid_exp, nanalogs = 3, + fun_analog = NULL, lat_dim = "lat", lon_dim = "lon", sdate_dim = "sdate", + time_dim = "time", member_dim = "member", metric = "dist", region = NULL, + return_indices = FALSE, loocv_window = TRUE, ncores = NULL) { # input exp and obs must be s2dv_cube objects if (!inherits(obs,'s2dv_cube')) { stop("Parameter 'obs' must be of the class 's2dv_cube'") } - res <- Analogs(exp = exp$data, obs = obs$data, exp_lats = exp$coords[[lat_dim]], - exp_lons = exp$coords[[lon_dim]], obs_lats = obs$coords[[lat_dim]], - obs_lons = obs$coords[[lon_dim]], grid_exp = grid_exp, nanalogs = nanalogs, - fun_analog = fun_analog, lat_dim = lat_dim, lon_dim = lon_dim, - sdate_dim = sdate_dim, time_dim = time_dim, member_dim = member_dim, - region = region, return_indices = return_indices, loocv_window = loocv_window, - ncores = ncores) + if (!is.null(obsL)) { + # input obs must be s2dv_cube objects + if (!inherits(obsL,'s2dv_cube')) { + stop("Parameter 'obsL' must be of the class 's2dv_cube'") + } + } + # 
input exp and obs must be s2dv_cube objects + if (!inherits(exp,'s2dv_cube')) { + stop("Parameter 'exp' must be of the class 's2dv_cube'") + } + + res <- Analogs(exp = exp$data, obs = obs$data, obsL = obsL$data, + exp_lats = exp$coords[[lat_dim]], exp_lons = exp$coords[[lon_dim]], + obs_lats = obs$coords[[lat_dim]], obs_lons = obs$coords[[lon_dim]], + obsL_lats = obsL$coords[[lat_dim]], obsL_lons = obsL$coords[[lon_dim]], + grid_exp = grid_exp, nanalogs = nanalogs, fun_analog = fun_analog, + lat_dim = lat_dim, lon_dim = lon_dim, sdate_dim = sdate_dim, + time_dim = time_dim, member_dim = member_dim, metric = metric, + region = region, return_indices = return_indices, + loocv_window = loocv_window, ncores = ncores) # Modify data, lat and lon in the origina s2dv_cube, adding the downscaled data exp$data <- res$data @@ -129,13 +148,21 @@ CST_Analogs <- function(exp, obs, grid_exp, obs2 = NULL, nanalogs = 3, fun_analo #'@author Ll. Lledó, \email{llorenc.lledo@ecmwf.int} #' #'@description This function performs a downscaling using Analogs. To compute -#'the analogs given a coarse-scale field, the function looks for days with -#'similar conditions in the historical observations. The coarse scale and -#'observation data can be either global or regional. In the latter case, the -#'region is defined by the user. In principle, the coarse and observation data -#'should be of the same variable, although different variables can also be admitted. -#'The analogs function will find the N best analogs based in Minimum Euclidean -#'distance. +#'the analogs given a coarse-scale field, the function looks for days with similar conditions +#'in the historical observations. The analogs function determines the N best analogs based +#'on RMSE, distance correlation, or Spearman's correlation metrics. To downscale +#'a local-scale variable, either the variable itself or another large-scale variable +#'can be utilized as the predictor. 
In the first scenario, analogs are examined between +#'the observation and model data of the same local-scale variable. In the latter scenario, +#'the function identifies the day in the observation data that closely resembles +#'the large-scale pattern of interest in the model. When it identifies the date of +#'the best analog, the function extracts the corresponding local-scale variable for that day +#'from the observation of the local scale variable. The used local-scale and large-scale +#'variables can be retrieved from independent regions. The input data for the first case must +#'include 'exp' and 'obs,' while in the second case, 'obs,' 'obsL,' and 'expL' are the +#'required input fields. Users can perform the downscaling process over the subregions +#'that can be identified through the 'region' and 'regionL' arguments, instead of focusing +#'on the entire area of the loaded data. #' #'The search of analogs must be done in the longest dataset posible, but might #'require high-memory computational resources. This is important since it is @@ -146,19 +173,20 @@ CST_Analogs <- function(exp, obs, grid_exp, obs2 = NULL, nanalogs = 3, fun_analo #'is intended to downscale climate prediction data (i.e., sub-seasonal, seasonal #'and decadal predictions) but can admit climate projections or reanalyses. It does #'not have constrains of specific region or variables to downscale. -#'@param exp an array with named dimensions containing the experimental field on the -#'coarse scale for which the downscaling is aimed. The object must have, at least, -#'the dimensions latitude, longitude, start date and time. The object is expected to be -#'already subset for the desired region. Data can be in one or two integrated regions, e.g., -#'crossing the Greenwich meridian. To get the correct results in the latter case, -#'the borders of the region should be specified in the parameter 'region'. See parameter -#''region'. 
-#'@param obs an array with named dimensions containing the observational field. The object -#'must have, at least, the dimensions latitude, longitude, start date and time. The object -#'is expected to be already subset for the desired region. Optionally, 'obs' can have the -#'dimension 'window', containing the sampled fields into which the function will look for -#'the analogs. See function 'generate_window()'. Otherwise, the function will look for -#'analogs using all the possible fields contained in obs. +#'@param exp an array with named dimensions containing the experimental field +#'on the coarse scale for the variable targeted for downscaling (in case obsL is not provided) +#'or for the large-scale variable used as the predictor (if obsL is provided). +#'The object must have, at least, the dimensions latitude, longitude, start date and time. +#'The object is expected to be already subset for the desired region. Data can be in one +#'or two integrated regions, e.g., crossing the Greenwich meridian. To get the correct +#'results in the latter case, the borders of the region should be specified in the parameter +#''region'. See parameter 'region'. +#'@param obs an array with named dimensions containing the observational field for the variable +#'targeted for downscaling. The object must have, at least, the dimensions latitude, longitude, +#'start date and time. The object is expected to be already subset for the desired region. +#'Optionally, 'obs' can have the dimension 'window', containing the sampled fields into which +#'the function will look for the analogs. See function 'generate_window()'. Otherwise, +#'the function will look for analogs using all the possible fields contained in obs. #'@param exp_lats a numeric vector containing the latitude values in 'exp'. Latitudes must #'range from -90 to 90. #'@param exp_lons a numeric vector containing the longitude values in 'exp'. 
Longitudes @@ -167,17 +195,21 @@ CST_Analogs <- function(exp, obs, grid_exp, obs2 = NULL, nanalogs = 3, fun_analo #'range from -90 to 90. #'@param obs_lons a numeric vector containing the longitude values in 'obs'. Longitudes #'can range from -180 to 180 or from 0 to 360. -#'@param grid_exp a character vector with a path to an example file of the exp data. +#'@param obsL an 's2dv_cube' object with named dimensions containing the observational +#'field of the large-scale variable.The object must have, at least, the dimensions latitude, +#'longitude and start date. The object is expected to be already subset for the desired region. +#'Optionally, 'obsL' can have the dimension 'window', containing the sampled fields into which +#'the function will look for the analogs. See function 'generate_window()'. Otherwise, +#'the function will look for analogs using all the possible fields contained in obs. +#'@param obsL_lats a numeric vector containing the latitude values in 'obsL'. Latitudes must +#'range from -90 to 90. +#'@param obsL_lons a numeric vector containing the longitude values in 'obsL'. Longitudes +#'can range from -180 to 180 or from 0 to 360. +#'@param grid_exp a character vector with a path to an example file of the exp (if the +#'predictor is the local scale variable) or expL (if the predictor is a large scale variable) data. #'It can be either a path to another NetCDF file which to read the target grid from #'(a single grid must be defined in such file) or a character vector indicating the #'coarse grid to be passed to CDO, and it must be a grid recognised by CDO. -#'@param obs2 an 's2dv' object with named dimensions containing a different observational -#'field to that in 'obs'. If provided, these observations will be used in the training, -#'i.e. the searching for analogs, so that they should be in a coarser grid to those in -#''obs'. 
Training with observations on a grid with a spatial resolution closer to that -#'in 'exp', will in principle ensure better results. The object must have, at least, the -#'dimensions latitude, longitude and start date. The object is expected to be already -#'subset for the desired region. #'@param nanalogs an integer indicating the number of analogs to be searched. #'@param fun_analog a function to be applied over the found analogs. Only these options #'are valid: "mean", "wmean", "max", "min", "median" or NULL. If set to NULL (default), #'the function returns the found analogs. @@ -192,6 +224,9 @@ CST_Analogs <- function(exp, obs, grid_exp, obs2 = NULL, nanalogs = 3, fun_analo #''data' in exp and obs. Default set to "time". #'@param member_dim a character vector indicating the member dimension name in the element #''data' in exp and obs. Default set to "member". +#'@param metric a character vector to select the analog specification method. Only these +#'options are valid: "dist" (i.e., Euclidean distance), "dcor" (i.e., distance correlation) +#'or "cor" (i.e., Spearman's correlation). The default metric is "dist". #'@param region a numeric vector indicating the borders of the region defined in exp. #'It consists of four elements in this order: lonmin, lonmax, latmin, latmax. lonmin refers #'to the left border, while lonmax refers to the right border. 
latmin indicates the lower @@ -209,7 +244,7 @@ CST_Analogs <- function(exp, obs, grid_exp, obs2 = NULL, nanalogs = 3, fun_analo #'@import multiApply #'@import CSTools #'@importFrom s2dv InsertDim CDORemap -#'@importFrom FNN get.knnx +#'@importFrom energy dcor #' #'@seealso \code{\link[s2dverification]{CDORemap}} #' @@ -229,11 +264,11 @@ CST_Analogs <- function(exp, obs, grid_exp, obs2 = NULL, nanalogs = 3, fun_analo #'downscaled_field <- Analogs(exp = exp, obs = obs, exp_lats = exp_lats, exp_lons = exp_lons, #'obs_lats = obs_lats, obs_lons = obs_lons, grid_exp = 'r360x180') #'@export -Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, obs2 = NULL, - obs2_lats = NULL, obs2_lons = NULL, nanalogs = 3, fun_analog = NULL, - lat_dim = "lat", lon_dim = "lon", sdate_dim = "sdate", time_dim = "time", - member_dim = "member", region = NULL, return_indices = FALSE, - loocv_window = TRUE, ncores = NULL) { +Analogs <- function(exp, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, obs_lons, + grid_exp, obsL = NULL, obsL_lats = NULL, obsL_lons = NULL, nanalogs = 3, + fun_analog = NULL, lat_dim = "lat", lon_dim = "lon", sdate_dim = "sdate", + time_dim = "time", member_dim = "member", metric = "dist", region = NULL, + return_indices = FALSE, loocv_window = TRUE, ncores = NULL) { #----------------------------------- # Checkings #----------------------------------- @@ -264,7 +299,7 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, stop("Parameter 'time_dim' must be of the class 'character'") } - # Do not allow synonims for lat (latitude), lon (longitude) and time (sdate) dimension names + # Do not allow synonims for lat (latitude), lon (longitude) and time (sdate) dimension names if (is.na(match(lon_dim, names(dim(exp)))) | is.na(match(lon_dim, names(dim(obs))))) { stop("Missing longitude dimension in 'exp' and/or 'obs', or does not match the parameter ", "'lon_dim'") @@ -305,38 +340,41 @@ Analogs <- function(exp, obs, 
exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, } } - if (!is.null(obs2)) { + if (!is.null(obsL) ) { + # the code is not yet prepared to handle members in the observations - if (member_dim %in% names(dim(obs2))) { - if (identical(as.numeric(dim(obs2)[member_dim]), 1)) { - obs2 <- ClimProjDiags::Subset(x = obs2, along = member_dim, indices = 1, drop = 'selected') + if (member_dim %in% names(dim(obsL))) { + if (identical(as.numeric(dim(obsL)[member_dim]), 1)) { + obsL <- ClimProjDiags::Subset(x = obsL, along = member_dim, indices = 1, drop = 'selected') } else { - stop("Not implemented for observations with members ('obs2' can have 'member_dim', ", + stop("Not implemented for observations with members ('obsL' can have 'member_dim', ", "but it should be of length = 1).") } } - if (is.null(obs2_lats) | is.null(obs2_lons)) { + if (is.null(obsL_lats) | is.null(obsL_lons)) { stop("Missing latitudes and/or longitudes for the provided training observations. Please ", - "provide them with the parametres 'obs2_lats' and 'obs2_lons'") + "provide them with the parametres 'obsL_lats' and 'obsL_lons'") } - if (is.na(match(lon_dim, names(dim(obs2))))) { - stop("Missing longitude dimension in 'obs2', or does not match the parameter 'lon_dim'") + if (is.na(match(lon_dim, names(dim(obsL))))) { + stop("Missing longitude dimension in 'obsL', or does not match the parameter 'lon_dim'") } - if (is.na(match(lat_dim, names(dim(obs2))))) { - stop("Missing latitude dimension in 'obs2', or does not match the parameter 'lat_dim'") + if (is.na(match(lat_dim, names(dim(obsL))))) { + stop("Missing latitude dimension in 'obsL', or does not match the parameter 'lat_dim'") } - if (is.na(match(sdate_dim, names(dim(obs2))))) { - stop("Missing start date dimension in 'obs2', or does not match the parameter 'sdate_dim'") + if (is.na(match(sdate_dim, names(dim(obsL))))) { + stop("Missing start date dimension in 'obsL', or does not match the parameter 'sdate_dim'") } - if (is.na(match(time_dim, 
names(dim(obs2))))) { - stop("Missing time dimension in 'obs2', or does not match the parameter 'time_dim'") + if (is.na(match(time_dim, names(dim(obsL))))) { + stop("Missing time dimension in 'obsL', or does not match the parameter 'time_dim'") } + } + ## ncores if (!is.null(ncores)) { if (!is.numeric(ncores) | any(ncores %% 1 != 0) | any(ncores < 0) | @@ -350,15 +388,20 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, stopifnot(fun_analog %in% c("mean", "wmean", "max", "min", "median")) } - if (!is.null(obs2)) { - obs_train <- obs2 - obs_train_lats <- obs2_lats - obs_train_lons <- obs2_lons + # metric method to be used to specify the analogs + stopifnot(metric %in% c("cor", "dcor", "dist")) + + + + if (!is.null(obsL)) { + obs_train <- obsL + obs_train_lats <- obsL_lats + obs_train_lons <- obsL_lons } else { obs_train <- obs obs_train_lats <- obs_lats obs_train_lons <- obs_lons - } + } # Correct indices later if cross-validation loocv_correction <- FALSE @@ -366,24 +409,40 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, loocv_correction <- TRUE } - #----------------------------------- - # Interpolate high-res observations to the coarse grid - #----------------------------------- + # crop downscaling region, if the argument region is provided. + if (!is.null(region) & is.null(obsL)) { + # if a border is equally distant from two different grids, the map will be cropped from the grid having smaller coordinate + + a <- which.min(abs((region[1]-obs_lons))) + b <- which.min(abs((region[2]-obs_lons))) + c <- which.min(abs((region[3]-obs_lats))) + d <- which.min(abs((region[4]-obs_lats))) + obs <- ClimProjDiags::Subset(x = obs, along = list(lon_dim,lat_dim), + indices = list(a:b,c:d), drop = 'selected') + } + if (is.null(region)) { - warning("The borders of the downscaling region have not been provided. 
Assuming the four borders of the ", - "downscaling region are defined by the first and last elements of the parametres 'exp_lats' and ", - "'exp_lons'.") - region <- c(exp_lons[1], exp_lons[length(exp_lons)], exp_lats[1], exp_lats[length(exp_lats)]) + warning("The borders of the downscaling region have not been provided. ", + "Assuming the four borders of the downscaling region are defined by the ", + "first and last elements of the parameters 'exp_lats' and 'exp_lons'.") + region <- c(exp_lons[1], exp_lons[length(exp_lons)], exp_lats[1], + exp_lats[length(exp_lats)]) } - obs_interpolated <- Interpolation(exp = obs_train, lats = obs_train_lats, lons = obs_train_lons, - target_grid = grid_exp, lat_dim = lat_dim, lon_dim = lon_dim, - method_remap = "conservative", region = region, ncores = ncores) + obs_interpolated <- Interpolation(exp = obs_train, lats = obs_train_lats, lons = obs_train_lons, + target_grid = grid_exp, lat_dim = lat_dim, lon_dim = lon_dim, + method_remap = "conservative", region = region, + ncores = ncores) + # If after interpolating 'obs' data the coordinates do not match, the exp data is interpolated to # the same grid to force the matching - if (!.check_coords(lat1 = as.numeric(obs_interpolated$lat), lat2 = exp_lats, lon1 = as.numeric(obs_interpolated$lon), lon2 = exp_lons)) { - exp_interpolated <- Interpolation(exp = exp, lats = exp_lats, lons = exp_lons, target_grid = grid_exp, - lat_dim = lat_dim, lon_dim = lon_dim, method_remap = "conservative", + if (!.check_coords(lat1 = as.numeric(obs_interpolated$lat), + lat2 = exp_lats, + lon1 = as.numeric(obs_interpolated$lon), + lon2 = exp_lons)) { + exp_interpolated <- Interpolation(exp = exp, lats = exp_lats, lons = exp_lons, + target_grid = grid_exp, lat_dim = lat_dim, + lon_dim = lon_dim, method_remap = "conservative", region = region, ncores = ncores)$data } else { exp_interpolated <- exp @@ -391,40 +450,57 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, # Create 
window if user does not have it in the training observations if ( !("window" %in% names(dim(obs_interpolated$data))) ) { - obs_train_interpolated <- .generate_window(obj = obs_interpolated$data, sdate_dim = sdate_dim, - time_dim = time_dim, loocv = loocv_window, ncores = ncores) - obs_hres <- .generate_window(obj = obs, sdate_dim = sdate_dim, time_dim = time_dim, loocv = loocv_window, ncores = ncores) + obs_train_interpolated <- .generate_window(obj = obs_interpolated$data, sdate_dim = sdate_dim, + time_dim = time_dim, loocv = loocv_window, + ncores = ncores) + if (!is.null(obsL)) { + if ( ("window" %in% names(dim(obs))) ) { + stop("Either both obs and obsL should include 'window' dimension or none.") + } + } + obs_hres <- .generate_window(obj = obs, sdate_dim = sdate_dim, time_dim = time_dim, + loocv = loocv_window, ncores = ncores) + } else { obs_train_interpolated <- obs_interpolated$data - dim(obs_train_interpolated) <- dim(obs_train_interpolated)[-which(names(dim(obs_train_interpolated))=="time")] + dim(obs_train_interpolated) <- dim(ClimProjDiags::Subset(x = obs_train_interpolated, + along = time_dim, indices = 1, drop = 'selected')) + + if (!is.null(obsL)) { + if ( !("window" %in% names(dim(obs))) ) { + stop("Either both obs and obsL should include 'window' dimension or none.") + } + } obs_hres <- obs - dim(obs_hres) <- dim(obs_hres)[-which(names(dim(obs_hres))=="time")] + dim(obs_hres) <- dim(ClimProjDiags::Subset(x = obs_hres, + along = time_dim, indices = 1, drop = 'selected')) + } #----------------------------------- # Reshape train and test #----------------------------------- - res.data <- Apply(list(obs_train_interpolated, exp_interpolated, obs_hres), - target_dims = list(c("window", lat_dim, lon_dim), c(lat_dim, lon_dim), - c("window", lat_dim, lon_dim)), - fun = function(tr, te, ob) .analogs(train = tr, test = te, obs_hres = ob, k = nanalogs, - fun_analog = fun_analog), ncores = ncores)$output1 - # Return the indices of the best analogs - if 
(return_indices) { - res.ind <- Apply(list(obs_train_interpolated, exp_interpolated, obs_hres), + RES <- Apply(list(obs_train_interpolated, exp_interpolated, obs_hres), target_dims = list(c("window", lat_dim, lon_dim), c(lat_dim, lon_dim), c("window", lat_dim, lon_dim)), - fun = function(tr, te, ob) .analogs(train = tr, test = te, obs_hres = ob, k = nanalogs, - fun_analog = fun_analog, return_indices = TRUE), ncores = ncores, output_dims = 'ind')$output1 + fun = function(tr, te, ob) .analogs(train = tr, test = te, obs_hres = ob, + k = nanalogs, metric = metric, fun_analog = fun_analog), + ncores = ncores) ## output1 -> data, output2 -> index, output3 -> metric + + res.data <- RES$output1 + + # Return the indices of the best analogs + if (return_indices) { + res.ind <- RES$output2 # If cross-validation has been applied, correct the indices if (loocv_correction) { nsdates <- dim(res.ind)[names(dim(res.ind)) == sdate_dim] ntimes <- dim(res.ind)[names(dim(res.ind)) == time_dim] - res.ind <- Apply(res.ind, target_dims = c("ind", sdate_dim), function(x) + res.ind <- Apply(res.ind, target_dims = c("index", sdate_dim), function(x) sapply(1:nsdates, function(s) seq(ntimes * nsdates)[ - (ntimes * (s - 1) + 1:ntimes)][x[, s]]), - output_dims = c("ind", sdate_dim), ncores = ncores)$output1 + output_dims = c("index", sdate_dim), ncores = ncores)$output1 } # restore ensemble dimension in observations if it existed originally @@ -433,25 +509,21 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, } res <- list(data = res.data, ind = res.ind, obs = obs, lon = obs_lons, lat = obs_lats) - } - else { + } else { # restore ensemble dimension in observations if it existed originally - if (restore_ens) { - obs <- s2dv::InsertDim(obs, posdim = 1, lendim = 1, name = member_dim) - } - - res <- list(data = res.data, obs = obs, lon = obs_lons, lat = obs_lats) + if (restore_ens) { + obs <- s2dv::InsertDim(obs, posdim = 1, lendim = 1, name = member_dim) + } + res 
<- list(data = res.data, obs = obs, lon = obs_lons, lat = obs_lats) } return(res) } # For each element in test, find the indices of the k nearest neigbhors in train -.analogs <- function(train, test, obs_hres, k, fun_analog, return_indices = FALSE) { - - require(FNN) - - # train and obs_hres dim: 3 dimensions window, lat and lon (in this order) +.analogs <- function(train, test, obs_hres, k, fun_analog, metric = NULL, return_indices = FALSE) { + + # train and obs_rs dim: 3 dimensions window, lat and lon (in this order) # test dim: 2 dimensions lat and lon (in this order) # Number of lats/lons of the high-resolution data space_dims_hres <- dim(obs_hres)[c(2,3)] @@ -472,40 +544,51 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, te_wo_na <- InsertDim(data = te_wo_na, posdim = 1, lendim = 1, name = "time") if (all(is.na(test))) { - res <- array(NA,space_dims_hres) + res <- array(NA, space_dims_hres) } else { - knn.ind <- get.knnx(tr_wo_na, te_wo_na, k) - - dist <- knn.ind$nn.dist - idx <- knn.ind$nn.index - + if (metric == "dist") { + dist_all <- sqrt(rowSums((sweep(tr_wo_na, 2, te_wo_na[1,]))^2)) # euc dist + best_vals <- head(sort(dist_all), k) + idx <- match(best_vals, dist_all) + } else if (metric == "cor") { + cor_all <- apply(tr_wo_na, 1, function (x) cor(x,te_wo_na[1, ], method = "spearman")) # cor + best_vals <- head(sort(cor_all, decreasing = TRUE), k) + idx <- match(best_vals, cor_all) + } else if (metric == "dcor") { +# require(energy) + dcor_all <- apply(tr_wo_na, 1, function (x) .dcor(x,te_wo_na[1, ])) # dcor + best_vals <- head(sort(dcor_all, decreasing = TRUE), k,) + idx <- match(best_vals, dcor_all) + } if (isTRUE(any(idy_na_tr))) { dum <-(1:length(idy_na_tr))[!idy_na_tr] idx <- dum[idx] } - - # Either return only the indices or the analogs - if (return_indices) { - res <- as.numeric(idx) - } else { - res <- obs_hres[ , idx] - dim(res) <- c(space_dims_hres, analogs = k) - - if (!is.null(fun_analog)) { - if (fun_analog 
== "wmean") { - weight <- 1 / dist - res <- apply(res, c(1,2), function(x) weighted.mean(x, weight)) - } else if (fun_analog == "min") { - res<-res[,,which.min(dist)] - } else if (fun_analog == "max") { - res<-res[,,which.max(dist)] + res_ind <- array(idx, k) + names(dim(res_ind)) <- c("index") + res_metric <- array(best_vals, c(k)) + names(dim(res_metric)) <- c("metric") + res <- obs_hres[ , idx] + dim(res) <- c(space_dims_hres, analogs = k) + + if (!is.null(fun_analog)) { + if (fun_analog == "wmean") { + if (metric == "dist") { + weight <- 1 / best_vals } else { - res <- apply(res, c(1,2), fun_analog) + weight <- best_vals } + res <- apply(res, c(1,2), function(x) weighted.mean(x, weight)) + } else if (fun_analog == "min") { + res <- res[, , which.min(best_vals)] + } else if (fun_analog == "max") { + res <- res[, , which.max(best_vals)] + } else { + res <- apply(res, c(1,2), fun_analog) } } + return(list(res, res_ind, res_metric)) } - return(res) } # Add the dimension window to an array that contains, at least, the start date and time @@ -557,3 +640,38 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, return(obj_window) } + +# Distance correlation function +.dcor <- function(x, y) { + n <- length(x) + + # Calculate Euclidean distances for x + dist_x <- as.matrix(dist(matrix(x))) + # Calculate Euclidean distances for y + dist_y <- as.matrix(dist(matrix(y))) + + # Centering matrices + H <- diag(n) - 1/n + + # Centered distance matrices + dist_centered_x <- H %*% dist_x %*% H + dist_centered_y <- H %*% dist_y %*% H + + # Calculate the product of mean-centered distance matrices + B <- dist_centered_x %*% t(dist_centered_y) + C <- dist_centered_x %*% t(dist_centered_x) + D <- dist_centered_y %*% t(dist_centered_y) + + # Calculate the distance covariance + dcov_xy <- sqrt(sum(diag(B))) + + # Calculate the variances + cov_xx <- sqrt(sum(diag(C))) + cov_yy <- sqrt(sum(diag(D))) + + # Calculate the distance correlation + dcor_xy <- dcov_xy 
/ sqrt(cov_xx * cov_yy) + + return(dcor_xy) +} +