From 00f24fbcba67585b5d4745cae354e35cfd3c03f6 Mon Sep 17 00:00:00 2001 From: eduzenli Date: Wed, 11 Oct 2023 17:48:25 +0200 Subject: [PATCH 1/6] working on new metrics and LSV integration --- R/Analogs.R | 228 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 180 insertions(+), 48 deletions(-) diff --git a/R/Analogs.R b/R/Analogs.R index 1cbea15..7b6b55f 100644 --- a/R/Analogs.R +++ b/R/Analogs.R @@ -230,9 +230,11 @@ CST_Analogs <- function(exp, obs, grid_exp, obs2 = NULL, nanalogs = 3, fun_analo #'obs_lats = obs_lats, obs_lons = obs_lons, grid_exp = 'r360x180') #'@export Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, obs2 = NULL, - obs2_lats = NULL, obs2_lons = NULL, nanalogs = 3, fun_analog = NULL, - lat_dim = "lat", lon_dim = "lon", sdate_dim = "sdate", time_dim = "time", - member_dim = "member", region = NULL, return_indices = FALSE, + obs2_lats = NULL, obs2_lons = NULL, expL = NULL, obsL = NULL, + expL_lats = NULL, expL_lons = NULL, obsL_lats = NULL, obsL_lons = NULL, + nanalogs = 3, fun_analog = NULL, lat_dim = "lat", lon_dim = "lon", + sdate_dim = "sdate", time_dim = "time", member_dim = "member", + metric = "rmse", region = NULL, regionL = NULL, return_indices = FALSE, loocv_window = TRUE, ncores = NULL) { #----------------------------------- # Checkings @@ -337,6 +339,54 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, stop("Missing time dimension in 'obs2', or does not match the parameter 'time_dim'") } } + + if (!is.null(obsL) | !is.null(expL)) { + + # the code is not yet prepared to handle members in the observations + if (member_dim %in% names(dim(obsL))) { + if (identical(as.numeric(dim(obsL)[member_dim]), 1)) { + obsL <- ClimProjDiags::Subset(x = obsL, along = member_dim, indices = 1, drop = 'selected') + } else { + stop("Not implemented for observations with members ('obsL' can have 'member_dim', ", + "but it should be of length = 1).") + } + } + + if (is.null(obsL_lats) | is.null(obsL_lons) | is.null(expL_lats) | is.null(obsL_lats)) { + stop("Missing latitudes and/or longitudes for the provided large scale observations and/or ", + "experimental fields Please provide them with the parametres 'obsL_lats, '", + "'obsL_lons', 'expL_lats' and/or 'expL_lons'.") + } + + if (is.na(match(lon_dim, names(dim(obsL)))) | is.na(match(lon_dim, names(dim(expL))))) { + stop("Missing longitude dimension in 'obsL' and/or 'expL', or does not match ", + "the parameter 'lon_dim'") + } + + if (is.na(match(lat_dim, names(dim(obsL)))) | is.na(match(lat_dim, names(dim(expL))))) { + stop("Missing latitude dimension in 'obsL' and/or 'expL', or does not match ", + "the parameter 'lat_dim'") + } + + if (is.na(match(sdate_dim, names(dim(obsL)))) | is.na(match(sdate_dim, names(dim(expL))))) { + stop("Missing start date dimension in 'obsL' and/or 'expL', or does not match ", + "the parameter 'sdate_dim'") + } + + if (is.na(match(time_dim, names(dim(obsL)))) | is.na(match(time_dim, names(dim(expL))))) { + stop("Missing time dimension in 'obsL' and/or 'expL', or does not match ", + "the parameter 'time_dim'") + } + + if (is.null(regionL)) { + warning("The borders of the large scale variable region have not been provided. ", + "Assuming the four borders of the large scale variable region are defined ", + "by the first and last elements of the parameters 'exp_lats' and 'exp_lons'.") + regionL <- c(expL_lons[1], expL_lons[length(expL_lons)], expL_lats[1], + expL_lats[length(expL_lats)]) + } + } + ## ncores if (!is.null(ncores)) { if (!is.numeric(ncores) | any(ncores %% 1 != 0) | any(ncores < 0) | @@ -370,9 +420,9 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, # Interpolate high-res observations to the coarse grid #----------------------------------- if (is.null(region)) { - warning("The borders of the downscaling region have not been provided. Assuming the four borders of the ", - "downscaling region are defined by the first and last elements of the parametres 'exp_lats' and ", - "'exp_lons'.") + warning("The borders of the downscaling region have not been provided. ", + "Assuming the four borders of the downscaling region are defined by the ", + "first and last elements of the parameters 'exp_lats' and 'exp_lons'.") region <- c(exp_lons[1], exp_lons[length(exp_lons)], exp_lats[1], exp_lats[length(exp_lats)]) } @@ -382,8 +432,9 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, # If after interpolating 'obs' data the coordinates do not match, the exp data is interpolated to # the same grid to force the matching if (!.check_coords(lat1 = as.numeric(obs_interpolated$lat), lat2 = exp_lats, lon1 = as.numeric(obs_interpolated$lon), lon2 = exp_lons)) { - exp_interpolated <- Interpolation(exp = exp, lats = exp_lats, lons = exp_lons, target_grid = grid_exp, - lat_dim = lat_dim, lon_dim = lon_dim, method_remap = "conservative", + exp_interpolated <- Interpolation(exp = exp, lats = exp_lats, lons = exp_lons, + target_grid = grid_exp, lat_dim = lat_dim, + lon_dim = lon_dim, method_remap = "conservative", region = region, ncores = ncores)$data } else { exp_interpolated <- exp @@ -392,39 +443,42 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, # Create window if user does not have it in the training observations if ( !("window" %in% names(dim(obs_interpolated$data))) ) { obs_train_interpolated <- .generate_window(obj = obs_interpolated$data, sdate_dim = sdate_dim, - time_dim = time_dim, loocv = loocv_window, ncores = ncores) - obs_hres <- .generate_window(obj = obs, sdate_dim = sdate_dim, time_dim = time_dim, loocv = loocv_window, ncores = ncores) + time_dim = time_dim, loocv = loocv_window, + ncores = ncores) + obs_hres <- .generate_window(obj = obs, sdate_dim = sdate_dim, time_dim = time_dim, + loocv = loocv_window, ncores = ncores) } else { obs_train_interpolated <- obs_interpolated$data - dim(obs_train_interpolated) <- dim(obs_train_interpolated)[-which(names(dim(obs_train_interpolated))=="time")] + dim(obs_train_interpolated) <- dim(ClimProjDiags::Subset(x = obs_train_interpolated, + along = time_dim, indices = 1, drop = 'selected')) obs_hres <- obs - dim(obs_hres) <- dim(obs_hres)[-which(names(dim(obs_hres))=="time")] + dim(obs_hres) <- dim(ClimProjDiags::Subset(x = obs_hres, + along = time_dim, indices = 1, drop = 'selected')) } + #----------------------------------- # Reshape train and test #----------------------------------- - res.data <- Apply(list(obs_train_interpolated, exp_interpolated, obs_hres), + RES <- Apply(list(obs_train_interpolated, exp_interpolated, obs_hres), target_dims = list(c("window", lat_dim, lon_dim), c(lat_dim, lon_dim), c("window", lat_dim, lon_dim)), - fun = function(tr, te, ob) .analogs(train = tr, test = te, obs_hres = ob, k = nanalogs, - fun_analog = fun_analog), ncores = ncores)$output1 + fun = function(tr, te, ob) .analogs(train = tr, test = te, obs_hres = ob, + k = nanalogs, metric = metric, fun_analog = fun_analog), + ncores = ncores) ## output1 -> data, output2 -> index, output3 -> metric + res.data <- RES$output1 # Return the indices of the best analogs if (return_indices) { - res.ind <- Apply(list(obs_train_interpolated, exp_interpolated, obs_hres), - target_dims = list(c("window", lat_dim, lon_dim), c(lat_dim, lon_dim), - c("window", lat_dim, lon_dim)), - fun = function(tr, te, ob) .analogs(train = tr, test = te, obs_hres = ob, k = nanalogs, - fun_analog = fun_analog, return_indices = TRUE), ncores = ncores, output_dims = 'ind')$output1 + res.ind <- RES$output2 # If cross-validation has been applied, correct the indices if (loocv_correction) { nsdates <- dim(res.ind)[names(dim(res.ind)) == sdate_dim] ntimes <- dim(res.ind)[names(dim(res.ind)) == time_dim] - res.ind <- Apply(res.ind, target_dims = c("ind", sdate_dim), function(x) + res.ind <- Apply(res.ind, target_dims = c("index", sdate_dim), function(x) sapply(1:nsdates, function(s) seq(ntimes * nsdates)[ - (ntimes * (s - 1) + 1:ntimes)][x[, s]]), - output_dims = c("ind", sdate_dim), ncores = ncores)$output1 + output_dims = c("index", sdate_dim), ncores = ncores)$output1 } # restore ensemble dimension in observations if it existed originally @@ -433,57 +487,89 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, } res <- list(data = res.data, ind = res.ind, obs = obs, lon = obs_lons, lat = obs_lats) - } - else { + } else { # restore ensemble dimension in observations if it existed originally - if (restore_ens) { - obs <- s2dv::InsertDim(obs, posdim = 1, lendim = 1, name = member_dim) - } - - res <- list(data = res.data, obs = obs, lon = obs_lons, lat = obs_lats) + if (restore_ens) { + obs <- s2dv::InsertDim(obs, posdim = 1, lendim = 1, name = member_dim) + } + res <- list(data = res.data, obs = obs, lon = obs_lons, lat = obs_lats) } return(res) } # For each element in test, find the indices of the k nearest neigbhors in train -.analogs <- function(train, test, obs_hres, k, fun_analog, return_indices = FALSE) { - - require(FNN) - +.analogs <- function(train, test, obs_hres, k, fun_analog, metric = NULL, return_indices = FALSE) { + + # require(FNN) + # train and obs_hres dim: 3 dimensions window, lat and lon (in this order) # test dim: 2 dimensions lat and lon (in this order) # Number of lats/lons of the high-resolution data space_dims_hres <- dim(obs_hres)[c(2,3)] # Reformat train and test as an array with (time, points) - train <- apply(train, 1, as.vector); names(dim(train))[1] <- "space" + train <- apply(train, 1, as.vector); names(dim(train))[1] <- "space" test <- as.vector(test) obs_hres <- apply(obs_hres, 1, as.vector); names(dim(obs_hres))[1] <- "space" - + # Identify and remove NA's - idx_na_tr <- is.na(train[ , 1]) + dum<-which(!apply(train,2,function (x) all(is.na(x))))[1] ## the column in which NA in space will be investigated. it shouldn't be all-NA time-step + idx_na_tr <- is.na(train[ , dum]) # NA in space + idy_na_tr <- is.na(train[1, ]) # NA in time idx_na_te <- is.na(test) idx_na <- idx_na_tr | idx_na_te - tr_wo_na <- t(train[!idx_na , ]) + tr_wo_na <- t(train[!idx_na , !idy_na_tr ]) te_wo_na <- test[!idx_na] te_wo_na <- InsertDim(data = te_wo_na, posdim = 1, lendim = 1, name = "time") - - knn.ind <- get.knnx(tr_wo_na, te_wo_na, k) - - dist <- knn.ind$nn.dist - idx <- knn.ind$nn.index - # Either return only the indices or the analogs - if (return_indices) { - res <- as.numeric(idx) + if (all(is.na(test))) { + res <- array(NA,space_dims_hres) } else { + if (metric == "dist") { + knn.ind <- get.knnx(tr_wo_na, te_wo_na, k) # dist + best_vals <- knn.ind$nn.dist + idx <- knn.ind$nn.index + } else if (metric == "rmse") { + rmse_all <- sqrt(rowMeans((sweep(tr_wo_na, 2, te_wo_na[1,]))^2)) # rmse + best_vals <- head(sort(rmse_all), k) + idx <- match(best_vals, rmse_all) + } else if (metric == "cor") { + cor_all <- apply(tr_wo_na, 1, function (x) cor(x,te_wo_na[1, ], method = "spearman")) # cor + best_vals <- head(sort(cor_all, decreasing = TRUE), k) + idx <- match(best_vals, cor_all) + } else if (metric == "dcor") { + dcor_all <- apply(tr_wo_na, 1, function (x) .dcor(x,te_wo_na[1, ])) # dcor + best_vals <- head(sort(dcor_all, decreasing = TRUE), k,) + idx <- match(best_vals, dcor_all) + } else if (metric == "lm") { + r2_all <- apply(tr_wo_na, 1, function (x) { + df = data.frame (x, te_wo_na[1, ]) + names(df) <- c("x", "y") + model <- lm (y~x, df) + summary(model)$r.squared + }) + best_vals <- head(sort(r2_all, decreasing = TRUE), k) + idx <- match(best_vals, r2_all) + } + if (isTRUE(any(idy_na_tr))) { + dum <-(1:length(idy_na_tr))[!idy_na_tr] + idx <- dum[idx] + } + res_ind <- array(idx, k) + names(dim(res_ind)) <- c("index") + res_metric <- array(best_vals, c(k)) + names(dim(res_metric)) <- c("metric") res <- obs_hres[ , idx] dim(res) <- c(space_dims_hres, analogs = k) if (!is.null(fun_analog)) { if (fun_analog == "wmean") { - weight <- 1 / dist + if (metric == "dist" | metric == "rmse") { + weight <- 1 / best_vals + } else { + weight <- best_vals + } res <- apply(res, c(1,2), function(x) weighted.mean(x, weight)) } else if (fun_analog == "min") { res<-res[,,which.min(dist)] @@ -493,9 +579,8 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, res <- apply(res, c(1,2), fun_analog) } } + return(list(res, res_ind, res_metric)) } - - return(res) } # Add the dimension window to an array that contains, at least, the start date and time @@ -547,3 +632,50 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, return(obj_window) } + +# Euclidean distances function +.euclidean_distances <- function(a) { + if (!(is.matrix(a) | is.array(a))) { + a <- array(a) + } + n <- nrow(a) + H <- diag(n) - 1/n + dist_m <- outer(a, a, "-") + d_centered <- H %*% dist_m %*% H + return(d_centered) +} + +# Distance correlation function +.dcor <- function(x, y) { + + if (!(is.numeric(x) & is.numeric(y))) { + stop("x and y should be numeric.") + } + if (!(is.matrix(x) | is.array(x))) { + x <- as.matrix(x) + } + if (!(is.matrix(y) | is.array(y))) { + y <- as.matrix(y) + } + + # Euclidean distances + dist_x <- .euclidean_distances(x) + dist_y <- .euclidean_distances(y) + + # Calculate the product of mean-centered distance matrices + B <- dist_x %*% t(dist_y) + C <- dist_x %*% t(dist_x) + D <- dist_y %*% t(dist_y) + + # Calculate the distance covariance + dcov_xy <- sqrt(sum(diag(B))) + + # Calculate the variances + cov_xx <- sqrt(sum(diag(C))) + cov_yy <- sqrt(sum(diag(D))) + + # Calculate the distance correlation + dcor_xy <- dcov_xy / sqrt(cov_xx * cov_yy) + + return(dcor_xy) +} -- GitLab From 4761acb295f4c52b667dd932aa2f826c74213e4a Mon Sep 17 00:00:00 2001 From: eduzenli Date: Fri, 13 Oct 2023 17:58:57 +0200 Subject: [PATCH 2/6] continues --- R/Analogs.R | 127 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 77 insertions(+), 50 deletions(-) diff --git a/R/Analogs.R b/R/Analogs.R index 7b6b55f..80f1fd2 100644 --- a/R/Analogs.R +++ b/R/Analogs.R @@ -229,7 +229,7 @@ CST_Analogs <- function(exp, obs, grid_exp, obs2 = NULL, nanalogs = 3, fun_analo #'downscaled_field <- Analogs(exp = exp, obs = obs, exp_lats = exp_lats, exp_lons = exp_lons, #'obs_lats = obs_lats, obs_lons = obs_lons, grid_exp = 'r360x180') #'@export -Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, obs2 = NULL, +Analogs <- function(exp = NULL, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, obs2 = NULL, obs2_lats = NULL, obs2_lons = NULL, expL = NULL, obsL = NULL, expL_lats = NULL, expL_lons = NULL, obsL_lats = NULL, obsL_lons = NULL, nanalogs = 3, fun_analog = NULL, lat_dim = "lat", lon_dim = "lon", @@ -266,25 +266,27 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, stop("Parameter 'time_dim' must be of the class 'character'") } - # Do not allow synonims for lat (latitude), lon (longitude) and time (sdate) dimension names - if (is.na(match(lon_dim, names(dim(exp)))) | is.na(match(lon_dim, names(dim(obs))))) { - stop("Missing longitude dimension in 'exp' and/or 'obs', or does not match the parameter ", - "'lon_dim'") - } + if (is.null(obsL) | is.null(expL)) { + # Do not allow synonims for lat (latitude), lon (longitude) and time (sdate) dimension names + if (is.na(match(lon_dim, names(dim(exp)))) | is.na(match(lon_dim, names(dim(obs))))) { + stop("Missing longitude dimension in 'exp' and/or 'obs', or does not match the parameter ", + "'lon_dim'") + } - if (is.na(match(lat_dim, names(dim(exp)))) | is.na(match(lat_dim, names(dim(obs))))) { - stop("Missing latitude dimension in 'exp' and/or 'obs', or does not match the parameter ", - "'lat_dim'") - } + if (is.na(match(lat_dim, names(dim(exp)))) | is.na(match(lat_dim, names(dim(obs))))) { + stop("Missing latitude dimension in 'exp' and/or 'obs', or does not match the parameter ", + "'lat_dim'") + } - if (is.na(match(sdate_dim, names(dim(exp)))) | is.na(match(sdate_dim, names(dim(obs))))) { - stop("Missing start date dimension in 'exp' and/or 'obs', or does not match the parameter ", - "'sdate_dim'") - } + if (is.na(match(sdate_dim, names(dim(exp)))) | is.na(match(sdate_dim, names(dim(obs))))) { + stop("Missing start date dimension in 'exp' and/or 'obs', or does not match the parameter ", + "'sdate_dim'") + } - if (is.na(match(time_dim, names(dim(exp)))) | is.na(match(time_dim, names(dim(obs))))) { - stop("Missing time dimension in 'exp' and/or 'obs', or does not match the parameter ", - "'time_dim'") + if (is.na(match(time_dim, names(dim(exp)))) | is.na(match(time_dim, names(dim(obs))))) { + stop("Missing time dimension in 'exp' and/or 'obs', or does not match the parameter ", + "'time_dim'") + } } # Ensure we have enough data to interpolate from high-res to coarse grid @@ -352,6 +354,10 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, } } + if (is.null(obsL) | is.null(expL)) { + stop("obsL and/or expL is missing. Both must be provided for the large scale variable.") + } + if (is.null(obsL_lats) | is.null(obsL_lons) | is.null(expL_lats) | is.null(obsL_lats)) { stop("Missing latitudes and/or longitudes for the provided large scale observations and/or ", "experimental fields Please provide them with the parametres 'obsL_lats, '", @@ -377,14 +383,6 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, stop("Missing time dimension in 'obsL' and/or 'expL', or does not match ", "the parameter 'time_dim'") } - - if (is.null(regionL)) { - warning("The borders of the large scale variable region have not been provided. ", - "Assuming the four borders of the large scale variable region are defined ", - "by the first and last elements of the parameters 'exp_lats' and 'exp_lons'.") - regionL <- c(expL_lons[1], expL_lons[length(expL_lons)], expL_lats[1], - expL_lats[length(expL_lats)]) - } } ## ncores @@ -400,15 +398,25 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, stopifnot(fun_analog %in% c("mean", "wmean", "max", "min", "median")) } - if (!is.null(obs2)) { - obs_train <- obs2 - obs_train_lats <- obs2_lats - obs_train_lons <- obs2_lons + + if (is.null(obsL)) { + if (!is.null(obs2)) { + obs_train <- obs2 + obs_train_lats <- obs2_lats + obs_train_lons <- obs2_lons + } else { + obs_train <- obs + obs_train_lats <- obs_lats + obs_train_lons <- obs_lons + } } else { - obs_train <- obs - obs_train_lats <- obs_lats - obs_train_lons <- obs_lons - } + obs_train <- obsL + obs_train_lats <- obsL_lats + obs_train_lons <- obsL_lons + exp <- expL + exp_lats <- expL_lats + exp_lons <- expL_lons + } # Correct indices later if cross-validation loocv_correction <- FALSE @@ -416,26 +424,44 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, loocv_correction <- TRUE } - #----------------------------------- - # Interpolate high-res observations to the coarse grid - #----------------------------------- - if (is.null(region)) { - warning("The borders of the downscaling region have not been provided. ", - "Assuming the four borders of the downscaling region are defined by the ", - "first and last elements of the parameters 'exp_lats' and 'exp_lons'.") - region <- c(exp_lons[1], exp_lons[length(exp_lons)], exp_lats[1], exp_lats[length(exp_lats)]) + if (is.null(obsL) | is.null(expL)) { + #----------------------------------- + # Interpolate high-res observations to the coarse grid + #----------------------------------- + if (is.null(region)) { + warning("The borders of the downscaling region have not been provided. ", + "Assuming the four borders of the downscaling region are defined by the ", + "first and last elements of the parameters 'exp_lats' and 'exp_lons'.") + region <- region_train <- c(exp_lons[1], exp_lons[length(exp_lons)], exp_lats[1], + exp_lats[length(exp_lats)]) + } + } else { + if (is.null(region)) { + warning("The borders of the downscaling region have not been provided. ", + "Assuming the four borders of the downscaling region are defined by the ", + "first and last elements of the parameters 'obs_lats' and 'obs_lons'.") + # For large-scale case, downscaling region is the region of local scale variable obs + region <- c(obs_lons[1], obs_lons[length(obs_lons)], obs_lats[1], obs_lats[length(obs_lats)]) + # For large-scale case, training region is the region of large scale variable exp + region_train <- c(expL_lons[1], expL_lons[length(expL_lons)], expL_lats[1], + expL_lats[length(expL_lats)]) + } } obs_interpolated <- Interpolation(exp = obs_train, lats = obs_train_lats, lons = obs_train_lons, target_grid = grid_exp, lat_dim = lat_dim, lon_dim = lon_dim, - method_remap = "conservative", region = region, ncores = ncores) + method_remap = "conservative", region = region_train, + ncores = ncores) # If after interpolating 'obs' data the coordinates do not match, the exp data is interpolated to # the same grid to force the matching - if (!.check_coords(lat1 = as.numeric(obs_interpolated$lat), lat2 = exp_lats, lon1 = as.numeric(obs_interpolated$lon), lon2 = exp_lons)) { + if (!.check_coords(lat1 = as.numeric(obs_interpolated$lat), + lat2 = exp_lats, + lon1 = as.numeric(obs_interpolated$lon), + lon2 = exp_lons)) { exp_interpolated <- Interpolation(exp = exp, lats = exp_lats, lons = exp_lons, target_grid = grid_exp, lat_dim = lat_dim, lon_dim = lon_dim, method_remap = "conservative", - region = region, ncores = ncores)$data + region = region_train, ncores = ncores)$data } else { exp_interpolated <- exp } @@ -456,10 +482,11 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, along = time_dim, indices = 1, drop = 'selected')) } - #----------------------------------- # Reshape train and test #----------------------------------- + + RES <- Apply(list(obs_train_interpolated, exp_interpolated, obs_hres), target_dims = list(c("window", lat_dim, lon_dim), c(lat_dim, lon_dim), c("window", lat_dim, lon_dim)), @@ -526,14 +553,14 @@ Analogs <- function(exp, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, if (all(is.na(test))) { res <- array(NA,space_dims_hres) } else { - if (metric == "dist") { - knn.ind <- get.knnx(tr_wo_na, te_wo_na, k) # dist - best_vals <- knn.ind$nn.dist - idx <- knn.ind$nn.index - } else if (metric == "rmse") { + if (metric == "rmse") { rmse_all <- sqrt(rowMeans((sweep(tr_wo_na, 2, te_wo_na[1,]))^2)) # rmse best_vals <- head(sort(rmse_all), k) idx <- match(best_vals, rmse_all) +# } else if (metric == "dist") { +# knn.ind <- get.knnx(tr_wo_na, te_wo_na, k) # dist +# best_vals <- knn.ind$nn.dist +# idx <- knn.ind$nn.index } else if (metric == "cor") { cor_all <- apply(tr_wo_na, 1, function (x) cor(x,te_wo_na[1, ], method = "spearman")) # cor best_vals <- head(sort(cor_all, decreasing = TRUE), k) -- GitLab From f544747ca879a2e4706bc861d448c10e60694e09 Mon Sep 17 00:00:00 2001 From: eduzenli Date: Tue, 31 Oct 2023 15:14:53 +0100 Subject: [PATCH 3/6] first draft is ready --- R/Analogs.R | 362 +++++++++++++++++++++++++--------------------------- 1 file changed, 177 insertions(+), 185 deletions(-) diff --git a/R/Analogs.R b/R/Analogs.R index 80f1fd2..3070e54 100644 --- a/R/Analogs.R +++ b/R/Analogs.R @@ -1,16 +1,24 @@ #'@rdname CST_Analogs -#'@title Downscaling using Analogs based on large scale fields. +#'@title Downscaling using Analogs based on coarse scale fields. #' #'@author J. Ramon, \email{jaume.ramon@bsc.es} #' #'@description This function performs a downscaling using Analogs. To compute -#'the analogs given a coarse-scale field, the function looks for days with -#'similar conditions in the historical observations. The coarse scale and -#'observation data can be either global or regional. In the latter case, the -#'region is defined by the user. In principle, the coarse and observation data -#'should be of the same variable, although different variables can also be admitted. -#'The analogs function will find the N best analogs based in Minimum Euclidean -#'distance. +#'the analogs given a coarse-scale field, the function looks for days with similar conditions +#'in the historical observations. The analogs function determines the N best analogs based +#'on RMSE, distance correlation, or Spearman's correlation metrics. To downscale +#'a local-scale variable, either the variable itself or another large-scale variable +#'can be utilized as the predictor. In the first scenario, analogs are examined between +#'the observation and model data of the same local-scale variable. In the latter scenario, +#'the function identifies the day in the observation data that closely resembles +#'the large-scale pattern of interest in the model. When it identifies the date of +#'the best analog, the function extracts the corresponding local-scale variable for that day +#'from the observation of the local scale variable. The used local-scale and large-scale +#'variables can be retrieved from independent regions. The input data for the first case must +#'include 'exp' and 'obs,' while in the second case, 'obs,' 'obsL,' and 'expL' are the +#'required input fields. Users can perform the downscaling process over the subregions +#'that can be identified through the 'region' and 'regionL' arguments, instead of focusing +#'on the entire area of the loaded data. #' #'The search of analogs must be done in the longest dataset posible, but might #'require high-memory computational resources. This is important since it is @@ -21,25 +29,29 @@ #'is intended to downscale climate prediction data (i.e., sub-seasonal, seasonal #'and decadal predictions) but can admit climate projections or reanalyses. It does #'not have constrains of specific region or variables to downscale. -#'@param exp an 's2dv' object with named dimensions containing the experimental field on +#'@param exp an 's2dv_cube' object with named dimensions containing the experimental field on #'the coarse scale for which the downscaling is aimed. The object must have, at least, #'the dimensions latitude, longitude, start date and time. The object is expected to be #'already subset for the desired region. Data can be in one or two integrated regions, e.g., #'crossing the Greenwich meridian. To get the correct results in the latter case, #'the borders of the region should be specified in the parameter 'region'. See parameter #''region'. -#'@param obs an 's2dv' object with named dimensions containing the observational field. +#'@param obs an 's2dv_cube' object with named dimensions containing the observational field. #'The object must have, at least, the dimensions latitude, longitude and start date. #'The object is expected to be already subset for the desired region. -#'@param obs2 an 's2dv' object with named dimensions containing a different observational -#'field to that in 'obs'. If provided, these observations will be used in the training, -#'i.e. the searching for analogs, so that they should be in a coarser grid to those in -#''obs'. Training with observations on a grid with a spatial resolution closer to that -#'in 'exp', will in principle ensure better results. The object must have, at least, the -#'dimensions latitude, longitude and start date. The object is expected to be already -#'subset for the desired region. -#'@param grid_exp a character vector with a path to an example file of the exp data. -#'It can be either a path to another NetCDF file which to read the target grid from +#'@param expL an 's2dv_cube' object with named dimensions containing the experimental +#'field of the large-scale variable on the coarse scale. The object must have, at least, +#'the dimensions latitude, longitude, start date and time. The object is expected to be +#'already subset for the desired region. Data can be in one or two integrated regions, +#'e.g., crossing the Greenwich meridian. To get the correct results in the latter case, +#'the borders of the region should be specified in the parameter regionL'. See +#''parameter regionL'. +#'@param obsL an 's2dv_cube' object with named dimensions containing the observational +#'field of the large-scale variable.The object must have, at least, the dimensions latitude, +#'longitude and start date. The object is expected to be already subset for the desired region. +#'@param grid_exp a character vector with a path to an example file of the exp (if the +#'predictor is the local scale variable) or expL (if the predictor is a large scale variable) +#'data. It can be either a path to another NetCDF file which to read the target grid from #'(a single grid must be defined in such file) or a character vector indicating the #'coarse grid to be passed to CDO, and it must be a grid recognised by CDO. #'@param nanalogs an integer indicating the number of analogs to be searched @@ -56,6 +68,9 @@ #''data' in exp and obs. Default set to "time". #'@param member_dim a character vector indicating the member dimension name in the element #''data' in exp and obs. Default set to "member". +#'@param metric a character vector to select the analog specification method. Only these +#'options are valid: "rmse", "dcor" (i.e., distance correlation) or "cor" (i.e., Spearman's +#'.correlation) The default metric is "rmse". #'@param region a numeric vector indicating the borders of the region defined in exp. #'It consists of four elements in this order: lonmin, lonmax, latmin, latmax. lonmin refers #'to the left border, while lonmax refers to the right border. latmin indicates the lower @@ -68,7 +83,7 @@ #'of the window. It is recommended to be set to TRUE. Default to TRUE. #'@param ncores an integer indicating the number of cores to use in parallel computation. #'The default value is NULL. -#'@return An 's2dv' object. The element 'data' contains the dowscaled field, 'lat' the +#'@return An 's2dv_cube' object. The element 'data' contains the dowscaled field, 'lat' the #'downscaled latitudes, and 'lon' the downscaled longitudes. If fun_analog is set to NULL #'(default), the output array in 'data' also contains the dimension 'analog' with the best #'analog days. @@ -85,27 +100,44 @@ #'obs <- s2dv_cube(data = obs, lat = obs_lats, lon = obs_lons) #'downscaled_field <- CST_Analogs(exp = exp, obs = obs, grid_exp = 'r360x180') #'@export -CST_Analogs <- function(exp, obs, grid_exp, obs2 = NULL, nanalogs = 3, fun_analog = NULL, lat_dim = "lat", - lon_dim = "lon", sdate_dim = "sdate", time_dim = "time", member_dim = "member", - region = NULL, return_indices = FALSE, loocv_window = TRUE, ncores = NULL) { - - # input exp and obs must be s2dv_cube objects - if (!inherits(exp,'s2dv_cube')) { - stop("Parameter 'exp' must be of the class 's2dv_cube'") - } +CST_Analogs <- function(exp = NULL, obs, expL = NULL, obsL = NULL, grid_exp, nanalogs = 3, + fun_analog = NULL, lat_dim = "lat", lon_dim = "lon", sdate_dim = "sdate", + time_dim = "time", member_dim = "member", metric = "rmse", region = NULL, + regionL = NULL, return_indices = FALSE, loocv_window = TRUE, + ncores = NULL) { # input exp and obs must be s2dv_cube objects if (!inherits(obs,'s2dv_cube')) { stop("Parameter 'obs' must be of the class 's2dv_cube'") } - res <- Analogs(exp = exp$data, obs = obs$data, exp_lats = exp$coords[[lat_dim]], - exp_lons = exp$coords[[lon_dim]], obs_lats = obs$coords[[lat_dim]], - obs_lons = obs$coords[[lon_dim]], grid_exp = grid_exp, nanalogs = nanalogs, - fun_analog = fun_analog, lat_dim = lat_dim, lon_dim = lon_dim, - sdate_dim = sdate_dim, time_dim = time_dim, member_dim = member_dim, - region = region, return_indices = return_indices, loocv_window = loocv_window, - ncores = ncores) + if (!is.null(obsL) | !is.null(expL)) { + # input exp and obs must be s2dv_cube objects + if (!inherits(expL,'s2dv_cube')) { + stop("Parameter 'expL' must be of the class 's2dv_cube'") + } + + # input exp and obs must be s2dv_cube objects + if (!inherits(obsL,'s2dv_cube')) { + stop("Parameter 'obsL' must be of the class 's2dv_cube'") + } + } else { + # input exp and obs must be s2dv_cube objects + if (!inherits(exp,'s2dv_cube')) { + stop("Parameter 'exp' must be of the class 's2dv_cube'") + } + } + + res <- Analogs(exp = exp$data, obs = obs$data, expL = expL$data, obsL = obsL$data, + exp_lats = exp$coords[[lat_dim]], exp_lons = exp$coords[[lon_dim]], + obs_lats = obs$coords[[lat_dim]], obs_lons = obs$coords[[lon_dim]], + expL_lats = expL$coords[[lat_dim]], expL_lons = expL$coords[[lon_dim]], + obsL_lats = obsL$coords[[lat_dim]], obsL_lons = obsL$coords[[lon_dim]], + grid_exp = grid_exp, nanalogs = nanalogs, fun_analog = fun_analog, + lat_dim = lat_dim, lon_dim = lon_dim, sdate_dim = sdate_dim, + time_dim = time_dim, member_dim = member_dim, metric = metric, + region = region, regionL = regionL, return_indices = return_indices, + loocv_window = loocv_window, ncores = ncores) # Modify data, lat and lon in the origina s2dv_cube, adding the downscaled data exp$data <- res$data @@ -129,13 +161,21 @@ CST_Analogs <- function(exp, obs, grid_exp, obs2 = NULL, nanalogs = 3, fun_analo #'@author Ll. Lledó, \email{llorenc.lledo@ecmwf.int} #' #'@description This function performs a downscaling using Analogs. To compute -#'the analogs given a coarse-scale field, the function looks for days with -#'similar conditions in the historical observations. The coarse scale and -#'observation data can be either global or regional. In the latter case, the -#'region is defined by the user. In principle, the coarse and observation data -#'should be of the same variable, although different variables can also be admitted. -#'The analogs function will find the N best analogs based in Minimum Euclidean -#'distance. +#'the analogs given a coarse-scale field, the function looks for days with similar conditions +#'in the historical observations. The analogs function determines the N best analogs based +#'on RMSE, distance correlation, or Spearman's correlation metrics. To downscale +#'a local-scale variable, either the variable itself or another large-scale variable +#'can be utilized as the predictor. In the first scenario, analogs are examined between +#'the observation and model data of the same local-scale variable. In the latter scenario, +#'the function identifies the day in the observation data that closely resembles +#'the large-scale pattern of interest in the model. When it identifies the date of +#'the best analog, the function extracts the corresponding local-scale variable for that day +#'from the observation of the local scale variable. The used local-scale and large-scale +#'variables can be retrieved from independent regions. The input data for the first case must +#'include 'exp' and 'obs,' while in the second case, 'obs,' 'obsL,' and 'expL' are the +#'required input fields. Users can perform the downscaling process over the subregions +#'that can be identified through the 'region' and 'regionL' arguments, instead of focusing +#'on the entire area of the loaded data. #' #'The search of analogs must be done in the longest dataset posible, but might #'require high-memory computational resources. This is important since it is @@ -167,17 +207,29 @@ CST_Analogs <- function(exp, obs, grid_exp, obs2 = NULL, nanalogs = 3, fun_analo #'range from -90 to 90. #'@param obs_lons a numeric vector containing the longitude values in 'obs'. Longitudes #'can range from -180 to 180 or from 0 to 360. -#'@param grid_exp a character vector with a path to an example file of the exp data. +#'@param expL an 's2dv_cube' object with named dimensions containing the experimental +#'field of the large-scale variable on the coarse scale. The object must have, at least, +#'the dimensions latitude, longitude, start date and time. The object is expected to be +#'already subset for the desired region. Data can be in one or two integrated regions, +#'e.g., crossing the Greenwich meridian. To get the correct results in the latter case, +#'the borders of the region should be specified in the parameter regionL'. See +#''parameter regionL'. +#'@param obsL an 's2dv_cube' object with named dimensions containing the observational +#'field of the large-scale variable.The object must have, at least, the dimensions latitude, +#'longitude and start date. The object is expected to be already subset for the desired region +#'@param expL_lats a numeric vector containing the latitude values in 'expL'. Latitudes must +#'range from -90 to 90. +#'@param expL_lons a numeric vector containing the longitude values in 'expL'. Longitudes +#'can range from -180 to 180 or from 0 to 360. +#'@param obsL_lats a numeric vector containing the latitude values in 'obsL'. Latitudes must +#'range from -90 to 90. +#'@param obsL_lons a numeric vector containing the longitude values in 'obsL'. Longitudes +#'can range from -180 to 180 or from 0 to 360. +#'@param grid_exp a character vector with a path to an example file of the exp (if the +#'predictor is the local scale variable) or expL (if the predictor is a large scale variable) data. #'It can be either a path to another NetCDF file which to read the target grid from #'(a single grid must be defined in such file) or a character vector indicating the #'coarse grid to be passed to CDO, and it must be a grid recognised by CDO. -#'@param obs2 an 's2dv' object with named dimensions containing a different observational -#'field to that in 'obs'. If provided, these observations will be used in the training, -#'i.e. the searching for analogs, so that they should be in a coarser grid to those in -#''obs'. Training with observations on a grid with a spatial resolution closer to that -#'in 'exp', will in principle ensure better results. The object must have, at least, the -#'dimensions latitude, longitude and start date. The object is expected to be already -#'subset for the desired region. #'@param nanalogs an integer indicating the number of analogs to be searched. #'@param fun_analog a function to be applied over the found analogs. Only these options #'are valid: "mean", "wmean", "max", "min", "median" or NULL. If set to NULL (default), @@ -192,11 +244,19 @@ CST_Analogs <- function(exp, obs, grid_exp, obs2 = NULL, nanalogs = 3, fun_analo #''data' in exp and obs. Default set to "time". #'@param member_dim a character vector indicating the member dimension name in the element #''data' in exp and obs. Default set to "member". +#'@param metric a character vector to select the analog specification method. Only these +#'options are valid: "rmse", "dcor" (i.e., distance correlation) or "cor" (i.e., Spearman's +#'.correlation) The default metric is "rmse". #'@param region a numeric vector indicating the borders of the region defined in exp. #'It consists of four elements in this order: lonmin, lonmax, latmin, latmax. lonmin refers #'to the left border, while lonmax refers to the right border. latmin indicates the lower #'border, whereas latmax indicates the upper border. If set to NULL (default), the function #'takes the first and last elements of the latitudes and longitudes. +#'@param regionL a numeric vector indicating the borders of the region defined in expL. +#'It consists of four elements in this order: lonmin, lonmax, latmin, latmax. lonmin refers +#'to the left border, while lonmax refers to the right border. latmin indicates the lower +#'border, whereas latmax indicates the upper border. If set to NULL (default), the function +#'takes the first and last elements of the latitudes and longitudes of the large-scale region. #'@param return_indices a logical vector indicating whether to return the indices of the #'analogs together with the downscaled fields. The indices refer to the position of the #'element in the vector time * start_date. If 'obs' contain the dimension 'window', it will @@ -209,7 +269,7 @@ CST_Analogs <- function(exp, obs, grid_exp, obs2 = NULL, nanalogs = 3, fun_analo #'@import multiApply #'@import CSTools #'@importFrom s2dv InsertDim CDORemap -#'@importFrom FNN get.knnx +#'@importFrom energy dcor #' #'@seealso \code{\link[s2dverification]{CDORemap}} #' @@ -229,13 +289,12 @@ CST_Analogs <- function(exp, obs, grid_exp, obs2 = NULL, nanalogs = 3, fun_analo #'downscaled_field <- Analogs(exp = exp, obs = obs, exp_lats = exp_lats, exp_lons = exp_lons, #'obs_lats = obs_lats, obs_lons = obs_lons, grid_exp = 'r360x180') #'@export -Analogs <- function(exp = NULL, obs, exp_lats, exp_lons, obs_lats, obs_lons, grid_exp, obs2 = NULL, - obs2_lats = NULL, obs2_lons = NULL, expL = NULL, obsL = NULL, - expL_lats = NULL, expL_lons = NULL, obsL_lats = NULL, obsL_lons = NULL, - nanalogs = 3, fun_analog = NULL, lat_dim = "lat", lon_dim = "lon", - sdate_dim = "sdate", time_dim = "time", member_dim = "member", - metric = "rmse", region = NULL, regionL = NULL, return_indices = FALSE, - loocv_window = TRUE, ncores = NULL) { +Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, obs_lons, + grid_exp, expL = NULL, obsL = NULL, expL_lats = NULL, expL_lons = NULL, + obsL_lats = NULL, obsL_lons = NULL, nanalogs = 3, fun_analog = NULL, + lat_dim = "lat", lon_dim = "lon", sdate_dim = "sdate", time_dim = "time", + member_dim = "member", metric = "rmse", region = NULL, regionL = NULL, + return_indices = FALSE, loocv_window = TRUE, ncores = NULL) { #----------------------------------- # Checkings #----------------------------------- @@ -309,41 +368,12 @@ Analogs <- function(exp = NULL, obs, exp_lats, exp_lons, obs_lats, obs_lons, gri } } - if (!is.null(obs2)) { - # the code is not yet prepared to handle members in the observations - if (member_dim %in% names(dim(obs2))) { - if (identical(as.numeric(dim(obs2)[member_dim]), 1)) { - obs2 <- ClimProjDiags::Subset(x = obs2, along = member_dim, indices = 1, drop = 'selected') - } else { - stop("Not implemented for observations with members ('obs2' can have 'member_dim', ", - "but it should be of length = 1).") - } - } - - if (is.null(obs2_lats) | is.null(obs2_lons)) { - stop("Missing latitudes and/or longitudes for the provided training observations. Please ", - "provide them with the parametres 'obs2_lats' and 'obs2_lons'") - } - - if (is.na(match(lon_dim, names(dim(obs2))))) { - stop("Missing longitude dimension in 'obs2', or does not match the parameter 'lon_dim'") - } - - if (is.na(match(lat_dim, names(dim(obs2))))) { - stop("Missing latitude dimension in 'obs2', or does not match the parameter 'lat_dim'") - } - - if (is.na(match(sdate_dim, names(dim(obs2))))) { - stop("Missing start date dimension in 'obs2', or does not match the parameter 'sdate_dim'") - } - - if (is.na(match(time_dim, names(dim(obs2))))) { - stop("Missing time dimension in 'obs2', or does not match the parameter 'time_dim'") - } - } - if (!is.null(obsL) | !is.null(expL)) { + if (is.null(obsL) | is.null(expL)) { + stop("obsL and/or expL is missing. Both must be provided for the large scale variable.") + } + # the code is not yet prepared to handle members in the observations if (member_dim %in% names(dim(obsL))) { if (identical(as.numeric(dim(obsL)[member_dim]), 1)) { @@ -354,32 +384,32 @@ Analogs <- function(exp = NULL, obs, exp_lats, exp_lons, obs_lats, obs_lons, gri } } - if (is.null(obsL) | is.null(expL)) { - stop("obsL and/or expL is missing. Both must be provided for the large scale variable.") - } - if (is.null(obsL_lats) | is.null(obsL_lons) | is.null(expL_lats) | is.null(obsL_lats)) { stop("Missing latitudes and/or longitudes for the provided large scale observations and/or ", "experimental fields Please provide them with the parametres 'obsL_lats, '", "'obsL_lons', 'expL_lats' and/or 'expL_lons'.") } - if (is.na(match(lon_dim, names(dim(obsL)))) | is.na(match(lon_dim, names(dim(expL))))) { + if (is.na(match(lon_dim, names(dim(obsL)))) | is.na(match(lon_dim, names(dim(expL)))) | + is.na(match(lon_dim, names(dim(obs))))) { stop("Missing longitude dimension in 'obsL' and/or 'expL', or does not match ", "the parameter 'lon_dim'") } - if (is.na(match(lat_dim, names(dim(obsL)))) | is.na(match(lat_dim, names(dim(expL))))) { + if (is.na(match(lat_dim, names(dim(obsL)))) | is.na(match(lat_dim, names(dim(expL)))) | + is.na(match(lat_dim, names(dim(obs))))) { stop("Missing latitude dimension in 'obsL' and/or 'expL', or does not match ", "the parameter 'lat_dim'") } - if (is.na(match(sdate_dim, names(dim(obsL)))) | is.na(match(sdate_dim, names(dim(expL))))) { + if (is.na(match(sdate_dim, names(dim(obsL)))) | is.na(match(sdate_dim, names(dim(expL)))) | + is.na(match(sdate_dim, names(dim(obs))))) { stop("Missing start date dimension in 'obsL' and/or 'expL', or does not match ", "the parameter 'sdate_dim'") } - if (is.na(match(time_dim, names(dim(obsL)))) | is.na(match(time_dim, names(dim(expL))))) { + if (is.na(match(time_dim, names(dim(obsL)))) | is.na(match(time_dim, names(dim(expL)))) | + is.na(match(time_dim, names(dim(obs))))) { stop("Missing time dimension in 'obsL' and/or 'expL', or does not match ", "the parameter 'time_dim'") } @@ -398,24 +428,17 @@ Analogs <- function(exp = NULL, obs, exp_lats, exp_lons, obs_lats, obs_lons, gri stopifnot(fun_analog %in% c("mean", "wmean", "max", "min", "median")) } - - if (is.null(obsL)) { - if (!is.null(obs2)) { - obs_train <- obs2 - obs_train_lats <- obs2_lats - obs_train_lons <- obs2_lons - } else { - obs_train <- obs - obs_train_lats <- obs_lats - obs_train_lons <- obs_lons - } + if (!is.null(obsL)) { + obs_train <- obsL + obs_train_lats <- obsL_lats + obs_train_lons <- obsL_lons + exp <- expL + exp_lats <- expL_lats + exp_lons <- expL_lons } else { - obs_train <- obsL - obs_train_lats <- obsL_lats - obs_train_lons <- obsL_lons - exp <- expL - exp_lats <- expL_lats - exp_lons <- expL_lons + obs_train <- obs + obs_train_lats <- obs_lats + obs_train_lons <- obs_lons } # Correct indices later if cross-validation @@ -424,6 +447,17 @@ Analogs <- function(exp = NULL, obs, exp_lats, exp_lons, obs_lats, obs_lons, gri loocv_correction <- TRUE } + # crop downscaling region, if the argument region is provided. + if (!is.null(region)) { + # if a border is equally distant from two different grids, the map will be cropped from the grid having smaller coordinate + a <- which.min(abs((region[1]-obs_lons))) + b <- which.min(abs((region[2]-obs_lons))) + c <- which.min(abs((region[3]-obs_lats))) + d <- which.min(abs((region[4]-obs_lats))) + obs <- ClimProjDiags::Subset(x = obs, along = list(lon_dim,lat_dim), + indices = list(a:b,c:d), drop = 'selected') + } + if (is.null(obsL) | is.null(expL)) { #----------------------------------- # Interpolate high-res observations to the coarse grid @@ -432,19 +466,26 @@ Analogs <- function(exp = NULL, obs, exp_lats, exp_lons, obs_lats, obs_lons, gri warning("The borders of the downscaling region have not been provided. ", "Assuming the four borders of the downscaling region are defined by the ", "first and last elements of the parameters 'exp_lats' and 'exp_lons'.") - region <- region_train <- c(exp_lons[1], exp_lons[length(exp_lons)], exp_lats[1], + region_train <- region <- c(exp_lons[1], exp_lons[length(exp_lons)], exp_lats[1], exp_lats[length(exp_lats)]) + } else { + region_train <- region } } else { - if (is.null(region)) { + if (is.null(regionL)) { warning("The borders of the downscaling region have not been provided. ", "Assuming the four borders of the downscaling region are defined by the ", "first and last elements of the parameters 'obs_lats' and 'obs_lons'.") - # For large-scale case, downscaling region is the region of local scale variable obs - region <- c(obs_lons[1], obs_lons[length(obs_lons)], obs_lats[1], obs_lats[length(obs_lats)]) # For large-scale case, training region is the region of large scale variable exp region_train <- c(expL_lons[1], expL_lons[length(expL_lons)], expL_lats[1], expL_lats[length(expL_lats)]) + } else { + region_train <- regionL + } + + if (is.null(region)) { + # For large-scale case, downscaling region is the region of local scale variable obs + region <- c(obs_lons[1], obs_lons[length(obs_lons)], obs_lats[1], obs_lats[length(obs_lats)]) } } @@ -452,6 +493,7 @@ Analogs <- function(exp = NULL, obs, exp_lats, exp_lons, obs_lats, obs_lons, gri target_grid = grid_exp, lat_dim = lat_dim, lon_dim = lon_dim, method_remap = "conservative", region = region_train, ncores = ncores) + # If after interpolating 'obs' data the coordinates do not match, the exp data is interpolated to # the same grid to force the matching if (!.check_coords(lat1 = as.numeric(obs_interpolated$lat), @@ -527,9 +569,7 @@ Analogs <- function(exp = NULL, obs, exp_lats, exp_lons, obs_lats, obs_lons, gri # For each element in test, find the indices of the k nearest neigbhors in train .analogs <- function(train, test, obs_hres, k, fun_analog, metric = NULL, return_indices = FALSE) { - - # require(FNN) - + # train and obs_hres dim: 3 dimensions window, lat and lon (in this order) # test dim: 2 dimensions lat and lon (in this order) # Number of lats/lons of the high-resolution data @@ -541,7 +581,8 @@ Analogs <- function(exp = NULL, obs, exp_lats, exp_lons, obs_lats, obs_lons, gri obs_hres <- apply(obs_hres, 1, as.vector); names(dim(obs_hres))[1] <- "space" # Identify and remove NA's - dum<-which(!apply(train,2,function (x) all(is.na(x))))[1] ## the column in which NA in space will be investigated. it shouldn't be all-NA time-step + dum<-which(!apply(train,2,function (x) all(is.na(x))))[1] ## the column in which NA in space + #will be investigated. it shouldn't be all-NA time-step idx_na_tr <- is.na(train[ , dum]) # NA in space idy_na_tr <- is.na(train[1, ]) # NA in time idx_na_te <- is.na(test) @@ -551,33 +592,30 @@ Analogs <- function(exp = NULL, obs, exp_lats, exp_lons, obs_lats, obs_lons, gri te_wo_na <- InsertDim(data = te_wo_na, posdim = 1, lendim = 1, name = "time") if (all(is.na(test))) { - res <- array(NA,space_dims_hres) + res <- array(NA, space_dims_hres) } else { if (metric == "rmse") { rmse_all <- sqrt(rowMeans((sweep(tr_wo_na, 2, te_wo_na[1,]))^2)) # rmse best_vals <- head(sort(rmse_all), k) idx <- match(best_vals, rmse_all) -# } else if (metric == "dist") { -# knn.ind <- get.knnx(tr_wo_na, te_wo_na, k) # dist -# best_vals <- knn.ind$nn.dist -# idx <- knn.ind$nn.index } else if (metric == "cor") { cor_all <- apply(tr_wo_na, 1, function (x) cor(x,te_wo_na[1, ], method = "spearman")) # cor best_vals <- head(sort(cor_all, decreasing = TRUE), k) idx <- match(best_vals, cor_all) } else if (metric == "dcor") { - dcor_all <- apply(tr_wo_na, 1, function (x) .dcor(x,te_wo_na[1, ])) # dcor + require(energy) + dcor_all <- apply(tr_wo_na, 1, function (x) dcor(x,te_wo_na[1, ])) # dcor best_vals <- head(sort(dcor_all, decreasing = TRUE), k,) idx <- match(best_vals, dcor_all) - } else if (metric == "lm") { - r2_all <- apply(tr_wo_na, 1, function (x) { - df = data.frame (x, te_wo_na[1, ]) - names(df) <- c("x", "y") - model <- lm (y~x, df) - summary(model)$r.squared - }) - best_vals <- head(sort(r2_all, decreasing = TRUE), k) - idx <- match(best_vals, r2_all) +# } else if (metric == "lm") { + # r2_all <- apply(tr_wo_na, 1, function (x) { + # df = data.frame (x, te_wo_na[1, ]) + # names(df) <- c("x", "y") + # model <- lm (y~x, df) + # summary(model)$adj.r.squared + # }) +# best_vals <- head(sort(r2_all, decreasing = TRUE), k) + # idx <- match(best_vals, r2_all) } if (isTRUE(any(idy_na_tr))) { dum <-(1:length(idy_na_tr))[!idy_na_tr] @@ -660,49 +698,3 @@ Analogs <- function(exp = NULL, obs, exp_lats, exp_lons, obs_lats, obs_lons, gri return(obj_window) } -# Euclidean distances function -.euclidean_distances <- function(a) { - if (!(is.matrix(a) | is.array(a))) { - a <- array(a) - } - n <- nrow(a) - H <- diag(n) - 1/n - dist_m <- outer(a, a, "-") - d_centered <- H %*% dist_m %*% H - return(d_centered) -} - -# Distance correlation function -.dcor <- function(x, y) { - - if (!(is.numeric(x) & is.numeric(y))) { - stop("x and y should be numeric.") - } - if (!(is.matrix(x) | is.array(x))) { - x <- as.matrix(x) - } - if (!(is.matrix(y) | is.array(y))) { - y <- as.matrix(y) - } - - # Euclidean distances - dist_x <- .euclidean_distances(x) - dist_y <- .euclidean_distances(y) - - # Calculate the product of mean-centered distance matrices - B <- dist_x %*% t(dist_y) - C <- dist_x %*% t(dist_x) - D <- dist_y %*% t(dist_y) - - # Calculate the distance covariance - dcov_xy <- sqrt(sum(diag(B))) - - # Calculate the variances - cov_xx <- sqrt(sum(diag(C))) - cov_yy <- sqrt(sum(diag(D))) - - # Calculate the distance correlation - dcor_xy <- dcov_xy / sqrt(cov_xx * cov_yy) - - return(dcor_xy) -} -- GitLab From b9676087e538496d5702d2458094b582a3798881 Mon Sep 17 00:00:00 2001 From: eduzenli Date: Tue, 14 Nov 2023 16:27:43 +0100 Subject: [PATCH 4/6] dtw_integrated-first-version --- R/Analogs.R | 272 +++++++++++++++++++--------------------------------- 1 file changed, 101 insertions(+), 171 deletions(-) diff --git a/R/Analogs.R b/R/Analogs.R index 3070e54..45c0eb4 100644 --- a/R/Analogs.R +++ b/R/Analogs.R @@ -100,9 +100,9 @@ #'obs <- s2dv_cube(data = obs, lat = obs_lats, lon = obs_lons) #'downscaled_field <- CST_Analogs(exp = exp, obs = obs, grid_exp = 'r360x180') #'@export -CST_Analogs <- function(exp = NULL, obs, expL = NULL, obsL = NULL, grid_exp, nanalogs = 3, +CST_Analogs <- function(exp = NULL, obs, obsL = NULL, grid_exp, nanalogs = 3, fun_analog = NULL, lat_dim = "lat", lon_dim = "lon", sdate_dim = "sdate", - time_dim = "time", member_dim = "member", metric = "rmse", region = NULL, + time_dim = "time", member_dim = "member", region = NULL, regionL = NULL, return_indices = FALSE, loocv_window = TRUE, ncores = NULL) { @@ -111,31 +111,24 @@ CST_Analogs <- function(exp = NULL, obs, expL = NULL, obsL = NULL, grid_exp, nan stop("Parameter 'obs' must be of the class 's2dv_cube'") } - if (!is.null(obsL) | !is.null(expL)) { - # input exp and obs must be s2dv_cube objects - if (!inherits(expL,'s2dv_cube')) { - stop("Parameter 'expL' must be of the class 's2dv_cube'") - } - - # input exp and obs must be s2dv_cube objects + if (!is.null(obsL)) { + # input obs must be s2dv_cube objects if (!inherits(obsL,'s2dv_cube')) { stop("Parameter 'obsL' must be of the class 's2dv_cube'") } - } else { + } # input exp and obs must be s2dv_cube objects - if (!inherits(exp,'s2dv_cube')) { - stop("Parameter 'exp' must be of the class 's2dv_cube'") - } + if (!inherits(exp,'s2dv_cube')) { + stop("Parameter 'exp' must be of the class 's2dv_cube'") } - res <- Analogs(exp = exp$data, obs = obs$data, expL = expL$data, obsL = obsL$data, + res <- Analogs(exp = exp$data, obs = obs$data, obsL = obsL$data, exp_lats = exp$coords[[lat_dim]], exp_lons = exp$coords[[lon_dim]], obs_lats = obs$coords[[lat_dim]], obs_lons = obs$coords[[lon_dim]], - expL_lats = expL$coords[[lat_dim]], expL_lons = expL$coords[[lon_dim]], obsL_lats = obsL$coords[[lat_dim]], obsL_lons = obsL$coords[[lon_dim]], grid_exp = grid_exp, nanalogs = nanalogs, fun_analog = fun_analog, lat_dim = lat_dim, lon_dim = lon_dim, sdate_dim = sdate_dim, - time_dim = time_dim, member_dim = member_dim, metric = metric, + time_dim = time_dim, member_dim = member_dim, region = region, regionL = regionL, return_indices = return_indices, loocv_window = loocv_window, ncores = ncores) @@ -290,11 +283,10 @@ CST_Analogs <- function(exp = NULL, obs, expL = NULL, obsL = NULL, grid_exp, nan #'obs_lats = obs_lats, obs_lons = obs_lons, grid_exp = 'r360x180') #'@export Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, obs_lons, - grid_exp, expL = NULL, obsL = NULL, expL_lats = NULL, expL_lons = NULL, - obsL_lats = NULL, obsL_lons = NULL, nanalogs = 3, fun_analog = NULL, - lat_dim = "lat", lon_dim = "lon", sdate_dim = "sdate", time_dim = "time", - member_dim = "member", metric = "rmse", region = NULL, regionL = NULL, - return_indices = FALSE, loocv_window = TRUE, ncores = NULL) { + grid_exp, obsL = NULL, obsL_lats = NULL, obsL_lons = NULL, nanalogs = 3, + fun_analog = NULL, lat_dim = "lat", lon_dim = "lon", sdate_dim = "sdate", + time_dim = "time", member_dim = "member", region = NULL, + regionL = NULL, return_indices = FALSE, loocv_window = TRUE, ncores = NULL) { #----------------------------------- # Checkings #----------------------------------- @@ -325,27 +317,25 @@ Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, stop("Parameter 'time_dim' must be of the class 'character'") } - if (is.null(obsL) | is.null(expL)) { # Do not allow synonims for lat (latitude), lon (longitude) and time (sdate) dimension names - if (is.na(match(lon_dim, names(dim(exp)))) | is.na(match(lon_dim, names(dim(obs))))) { - stop("Missing longitude dimension in 'exp' and/or 'obs', or does not match the parameter ", - "'lon_dim'") - } - - if (is.na(match(lat_dim, names(dim(exp)))) | is.na(match(lat_dim, names(dim(obs))))) { - stop("Missing latitude dimension in 'exp' and/or 'obs', or does not match the parameter ", - "'lat_dim'") - } + if (is.na(match(lon_dim, names(dim(exp)))) | is.na(match(lon_dim, names(dim(obs))))) { + stop("Missing longitude dimension in 'exp' and/or 'obs', or does not match the parameter ", + "'lon_dim'") + } - if (is.na(match(sdate_dim, names(dim(exp)))) | is.na(match(sdate_dim, names(dim(obs))))) { - stop("Missing start date dimension in 'exp' and/or 'obs', or does not match the parameter ", - "'sdate_dim'") - } + if (is.na(match(lat_dim, names(dim(exp)))) | is.na(match(lat_dim, names(dim(obs))))) { + stop("Missing latitude dimension in 'exp' and/or 'obs', or does not match the parameter ", + "'lat_dim'") + } - if (is.na(match(time_dim, names(dim(exp)))) | is.na(match(time_dim, names(dim(obs))))) { - stop("Missing time dimension in 'exp' and/or 'obs', or does not match the parameter ", - "'time_dim'") - } + if (is.na(match(sdate_dim, names(dim(exp)))) | is.na(match(sdate_dim, names(dim(obs))))) { + stop("Missing start date dimension in 'exp' and/or 'obs', or does not match the parameter ", + "'sdate_dim'") + } + + if (is.na(match(time_dim, names(dim(exp)))) | is.na(match(time_dim, names(dim(obs))))) { + stop("Missing time dimension in 'exp' and/or 'obs', or does not match the parameter ", + "'time_dim'") } # Ensure we have enough data to interpolate from high-res to coarse grid @@ -368,11 +358,7 @@ Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, } } - if (!is.null(obsL) | !is.null(expL)) { - - if (is.null(obsL) | is.null(expL)) { - stop("obsL and/or expL is missing. Both must be provided for the large scale variable.") - } + if (!is.null(obsL) ) { # the code is not yet prepared to handle members in the observations if (member_dim %in% names(dim(obsL))) { @@ -384,35 +370,27 @@ Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, } } - if (is.null(obsL_lats) | is.null(obsL_lons) | is.null(expL_lats) | is.null(obsL_lats)) { - stop("Missing latitudes and/or longitudes for the provided large scale observations and/or ", - "experimental fields Please provide them with the parametres 'obsL_lats, '", - "'obsL_lons', 'expL_lats' and/or 'expL_lons'.") + if (is.null(obsL_lats) | is.null(obsl_lons)) { + stop("Missing latitudes and/or longitudes for the provided training observations. Please ", + "provide them with the parametres 'obsL_lats' and 'obsL_lons'") } - - if (is.na(match(lon_dim, names(dim(obsL)))) | is.na(match(lon_dim, names(dim(expL)))) | - is.na(match(lon_dim, names(dim(obs))))) { - stop("Missing longitude dimension in 'obsL' and/or 'expL', or does not match ", - "the parameter 'lon_dim'") + + if (is.na(match(lon_dim, names(dim(obs2))))) { + stop("Missing longitude dimension in 'obsL', or does not match the parameter 'lon_dim'") } - if (is.na(match(lat_dim, names(dim(obsL)))) | is.na(match(lat_dim, names(dim(expL)))) | - is.na(match(lat_dim, names(dim(obs))))) { - stop("Missing latitude dimension in 'obsL' and/or 'expL', or does not match ", - "the parameter 'lat_dim'") + if (is.na(match(lat_dim, names(dim(obs2))))) { + stop("Missing latitude dimension in 'obsL', or does not match the parameter 'lat_dim'") } - if (is.na(match(sdate_dim, names(dim(obsL)))) | is.na(match(sdate_dim, names(dim(expL)))) | - is.na(match(sdate_dim, names(dim(obs))))) { - stop("Missing start date dimension in 'obsL' and/or 'expL', or does not match ", - "the parameter 'sdate_dim'") + if (is.na(match(sdate_dim, names(dim(obs2))))) { + stop("Missing start date dimension in 'obsL', or does not match the parameter 'sdate_dim'") } - if (is.na(match(time_dim, names(dim(obsL)))) | is.na(match(time_dim, names(dim(expL)))) | - is.na(match(time_dim, names(dim(obs))))) { - stop("Missing time dimension in 'obsL' and/or 'expL', or does not match ", - "the parameter 'time_dim'") + if (is.na(match(time_dim, names(dim(obs2))))) { + stop("Missing time dimension in 'obsL', or does not match the parameter 'time_dim'") } + } ## ncores @@ -432,9 +410,6 @@ Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, obs_train <- obsL obs_train_lats <- obsL_lats obs_train_lons <- obsL_lons - exp <- expL - exp_lats <- expL_lats - exp_lons <- expL_lons } else { obs_train <- obs obs_train_lats <- obs_lats @@ -458,83 +433,54 @@ Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, indices = list(a:b,c:d), drop = 'selected') } - if (is.null(obsL) | is.null(expL)) { - #----------------------------------- - # Interpolate high-res observations to the coarse grid - #----------------------------------- - if (is.null(region)) { - warning("The borders of the downscaling region have not been provided. ", - "Assuming the four borders of the downscaling region are defined by the ", - "first and last elements of the parameters 'exp_lats' and 'exp_lons'.") - region_train <- region <- c(exp_lons[1], exp_lons[length(exp_lons)], exp_lats[1], - exp_lats[length(exp_lats)]) - } else { - region_train <- region - } - } else { - if (is.null(regionL)) { - warning("The borders of the downscaling region have not been provided. ", - "Assuming the four borders of the downscaling region are defined by the ", - "first and last elements of the parameters 'obs_lats' and 'obs_lons'.") - # For large-scale case, training region is the region of large scale variable exp - region_train <- c(expL_lons[1], expL_lons[length(expL_lons)], expL_lats[1], - expL_lats[length(expL_lats)]) - } else { - region_train <- regionL - } - - if (is.null(region)) { - # For large-scale case, downscaling region is the region of local scale variable obs - region <- c(obs_lons[1], obs_lons[length(obs_lons)], obs_lats[1], obs_lats[length(obs_lats)]) - } - } - - obs_interpolated <- Interpolation(exp = obs_train, lats = obs_train_lats, lons = obs_train_lons, - target_grid = grid_exp, lat_dim = lat_dim, lon_dim = lon_dim, - method_remap = "conservative", region = region_train, - ncores = ncores) - - # If after interpolating 'obs' data the coordinates do not match, the exp data is interpolated to - # the same grid to force the matching - if (!.check_coords(lat1 = as.numeric(obs_interpolated$lat), - lat2 = exp_lats, - lon1 = as.numeric(obs_interpolated$lon), - lon2 = exp_lons)) { - exp_interpolated <- Interpolation(exp = exp, lats = exp_lats, lons = exp_lons, - target_grid = grid_exp, lat_dim = lat_dim, - lon_dim = lon_dim, method_remap = "conservative", - region = region_train, ncores = ncores)$data - } else { - exp_interpolated <- exp + if (is.null(region)) { + warning("The borders of the downscaling region have not been provided. ", + "Assuming the four borders of the downscaling region are defined by the ", + "first and last elements of the parameters 'exp_lats' and 'exp_lons'.") + region <- c(exp_lons[1], exp_lons[length(exp_lons)], exp_lats[1], + exp_lats[length(exp_lats)]) } # Create window if user does not have it in the training observations - if ( !("window" %in% names(dim(obs_interpolated$data))) ) { - obs_train_interpolated <- .generate_window(obj = obs_interpolated$data, sdate_dim = sdate_dim, - time_dim = time_dim, loocv = loocv_window, - ncores = ncores) - obs_hres <- .generate_window(obj = obs, sdate_dim = sdate_dim, time_dim = time_dim, - loocv = loocv_window, ncores = ncores) + if ( !("window" %in% names(dim(obs_train))) ) { + obs_train <- .generate_window(obj = obs_train, sdate_dim = sdate_dim, + time_dim = time_dim, loocv = loocv_window, + ncores = ncores) + if (!is.null(obsL)) { + obs_rs <- .generate_window(obj = obs, sdate_dim = sdate_dim, + time_dim = time_dim, + loocv = loocv_window, ncores = ncores) ## regional scale obs + } } else { - obs_train_interpolated <- obs_interpolated$data - dim(obs_train_interpolated) <- dim(ClimProjDiags::Subset(x = obs_train_interpolated, - along = time_dim, indices = 1, drop = 'selected')) - obs_hres <- obs - dim(obs_hres) <- dim(ClimProjDiags::Subset(x = obs_hres, - along = time_dim, indices = 1, drop = 'selected')) + dim(obs_train) <- dim(ClimProjDiags::Subset(x = obs_train, + along = time_dim, indices = 1, drop = 'selected')) + if (!is.null(obsL)) { + if ( !("window" %in% names(dim(obs))) ) { + stop("Either both obs and obsL should include 'window' dimension or none.") + } + obs_rs <- obs + dim(obs_rs) <- dim(obs_rs)[-which(names(dim(obs_rs)) == "time")] + } } #----------------------------------- # Reshape train and test #----------------------------------- - - RES <- Apply(list(obs_train_interpolated, exp_interpolated, obs_hres), + if (is.null(obsL)) { + RES <- Apply(list(obs_train, exp), + target_dims = list(c("window", lat_dim, lon_dim), c(lat_dim, lon_dim)), + fun = function(tr, te) .analogs(train = tr, test = te, + k = nanalogs, fun_analog = fun_analog), + ncores = ncores) ## output1 -> data, output2 -> index, output3 -> metric + } else { + RES <- Apply(list(obs_train, exp, obs_rs), target_dims = list(c("window", lat_dim, lon_dim), c(lat_dim, lon_dim), c("window", lat_dim, lon_dim)), - fun = function(tr, te, ob) .analogs(train = tr, test = te, obs_hres = ob, - k = nanalogs, metric = metric, fun_analog = fun_analog), + fun = function(tr, te, ob) .analogs(train = tr, test = te, obs_rs = ob, + k = nanalogs, fun_analog = fun_analog), ncores = ncores) ## output1 -> data, output2 -> index, output3 -> metric + } res.data <- RES$output1 # Return the indices of the best analogs @@ -568,17 +514,23 @@ Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, } # For each element in test, find the indices of the k nearest neigbhors in train -.analogs <- function(train, test, obs_hres, k, fun_analog, metric = NULL, return_indices = FALSE) { +.analogs <- function(train, test, obs_rs = NULL, k, fun_analog, + return_indices = FALSE) { + require(dtwclust) - # train and obs_hres dim: 3 dimensions window, lat and lon (in this order) + # train and obs_rs dim: 3 dimensions window, lat and lon (in this order) # test dim: 2 dimensions lat and lon (in this order) # Number of lats/lons of the high-resolution data - space_dims_hres <- dim(obs_hres)[c(2,3)] + if (!is.null(obs_rs)) { + space_dims <- dim(obs_rs)[c(2,3)] + obs_rs <- apply(obs_rs, 1, as.vector); names(dim(obs_rs))[1] <- "space" + } else { + space_dims <- dim(train)[c(2,3)] + } # Reformat train and test as an array with (time, points) train <- apply(train, 1, as.vector); names(dim(train))[1] <- "space" test <- as.vector(test) - obs_hres <- apply(obs_hres, 1, as.vector); names(dim(obs_hres))[1] <- "space" # Identify and remove NA's dum<-which(!apply(train,2,function (x) all(is.na(x))))[1] ## the column in which NA in space @@ -586,37 +538,16 @@ Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, idx_na_tr <- is.na(train[ , dum]) # NA in space idy_na_tr <- is.na(train[1, ]) # NA in time idx_na_te <- is.na(test) - idx_na <- idx_na_tr | idx_na_te - tr_wo_na <- t(train[!idx_na , !idy_na_tr ]) - te_wo_na <- test[!idx_na] + tr_wo_na <- t(train[!idx_na_tr , !idy_na_tr ]) + te_wo_na <- test[!idx_na_te] te_wo_na <- InsertDim(data = te_wo_na, posdim = 1, lendim = 1, name = "time") if (all(is.na(test))) { - res <- array(NA, space_dims_hres) + res <- array(NA, space_dims) } else { - if (metric == "rmse") { - rmse_all <- sqrt(rowMeans((sweep(tr_wo_na, 2, te_wo_na[1,]))^2)) # rmse - best_vals <- head(sort(rmse_all), k) - idx <- match(best_vals, rmse_all) - } else if (metric == "cor") { - cor_all <- apply(tr_wo_na, 1, function (x) cor(x,te_wo_na[1, ], method = "spearman")) # cor - best_vals <- head(sort(cor_all, decreasing = TRUE), k) - idx <- match(best_vals, cor_all) - } else if (metric == "dcor") { - require(energy) - dcor_all <- apply(tr_wo_na, 1, function (x) dcor(x,te_wo_na[1, ])) # dcor - best_vals <- head(sort(dcor_all, decreasing = TRUE), k,) - idx <- match(best_vals, dcor_all) -# } else if (metric == "lm") { - # r2_all <- apply(tr_wo_na, 1, function (x) { - # df = data.frame (x, te_wo_na[1, ]) - # names(df) <- c("x", "y") - # model <- lm (y~x, df) - # summary(model)$adj.r.squared - # }) -# best_vals <- head(sort(r2_all, decreasing = TRUE), k) - # idx <- match(best_vals, r2_all) - } + dtw_all <- apply(tr_wo_na, 1, function (x) dtw_basic(x, te_wo_na[1, ])) # dtw + best_vals <- head(sort(dtw_all), k) + idx <- match(best_vals, dtw_all) if (isTRUE(any(idy_na_tr))) { dum <-(1:length(idy_na_tr))[!idy_na_tr] idx <- dum[idx] @@ -625,16 +556,15 @@ Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, names(dim(res_ind)) <- c("index") res_metric <- array(best_vals, c(k)) names(dim(res_metric)) <- c("metric") - res <- obs_hres[ , idx] - dim(res) <- c(space_dims_hres, analogs = k) - + if (!is.null(obs_rs)) { + res <- obs_rs[ , idx] + } else { + res <- train[ , idx] + } + dim(res) <- c(space_dims, analogs = k) if (!is.null(fun_analog)) { if (fun_analog == "wmean") { - if (metric == "dist" | metric == "rmse") { - weight <- 1 / best_vals - } else { - weight <- best_vals - } + weight <- 1 / best_vals res <- apply(res, c(1,2), function(x) weighted.mean(x, weight)) } else if (fun_analog == "min") { res<-res[,,which.min(dist)] -- GitLab From df74b811ca4bd1561e04f0b28f152ac736eb2555 Mon Sep 17 00:00:00 2001 From: eduzenli Date: Sun, 7 Jan 2024 19:51:07 +0100 Subject: [PATCH 5/6] large scale predictor can be used to downscale local scale variable --- R/Analogs.R | 143 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 84 insertions(+), 59 deletions(-) diff --git a/R/Analogs.R b/R/Analogs.R index 45c0eb4..b141b76 100644 --- a/R/Analogs.R +++ b/R/Analogs.R @@ -102,9 +102,8 @@ #'@export CST_Analogs <- function(exp = NULL, obs, obsL = NULL, grid_exp, nanalogs = 3, fun_analog = NULL, lat_dim = "lat", lon_dim = "lon", sdate_dim = "sdate", - time_dim = "time", member_dim = "member", region = NULL, - regionL = NULL, return_indices = FALSE, loocv_window = TRUE, - ncores = NULL) { + time_dim = "time", member_dim = "member", metric = "rmse", region = NULL, + return_indices = FALSE, loocv_window = TRUE, ncores = NULL) { # input exp and obs must be s2dv_cube objects if (!inherits(obs,'s2dv_cube')) { @@ -128,8 +127,8 @@ CST_Analogs <- function(exp = NULL, obs, obsL = NULL, grid_exp, nanalogs = 3, obsL_lats = obsL$coords[[lat_dim]], obsL_lons = obsL$coords[[lon_dim]], grid_exp = grid_exp, nanalogs = nanalogs, fun_analog = fun_analog, lat_dim = lat_dim, lon_dim = lon_dim, sdate_dim = sdate_dim, - time_dim = time_dim, member_dim = member_dim, - region = region, regionL = regionL, return_indices = return_indices, + time_dim = time_dim, member_dim = member_dim, metric = "rmse", + region = region, return_indices = return_indices, loocv_window = loocv_window, ncores = ncores) # Modify data, lat and lon in the origina s2dv_cube, adding the downscaled data @@ -285,8 +284,8 @@ CST_Analogs <- function(exp = NULL, obs, obsL = NULL, grid_exp, nanalogs = 3, Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, obs_lons, grid_exp, obsL = NULL, obsL_lats = NULL, obsL_lons = NULL, nanalogs = 3, fun_analog = NULL, lat_dim = "lat", lon_dim = "lon", sdate_dim = "sdate", - time_dim = "time", member_dim = "member", region = NULL, - regionL = NULL, return_indices = FALSE, loocv_window = TRUE, ncores = NULL) { + time_dim = "time", member_dim = "member", metric = "rmse", region = NULL, + return_indices = FALSE, loocv_window = TRUE, ncores = NULL) { #----------------------------------- # Checkings #----------------------------------- @@ -370,24 +369,24 @@ Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, } } - if (is.null(obsL_lats) | is.null(obsl_lons)) { + if (is.null(obsL_lats) | is.null(obsL_lons)) { stop("Missing latitudes and/or longitudes for the provided training observations. Please ", "provide them with the parametres 'obsL_lats' and 'obsL_lons'") } - if (is.na(match(lon_dim, names(dim(obs2))))) { + if (is.na(match(lon_dim, names(dim(obsL))))) { stop("Missing longitude dimension in 'obsL', or does not match the parameter 'lon_dim'") } - if (is.na(match(lat_dim, names(dim(obs2))))) { + if (is.na(match(lat_dim, names(dim(obsL))))) { stop("Missing latitude dimension in 'obsL', or does not match the parameter 'lat_dim'") } - if (is.na(match(sdate_dim, names(dim(obs2))))) { + if (is.na(match(sdate_dim, names(dim(obsL))))) { stop("Missing start date dimension in 'obsL', or does not match the parameter 'sdate_dim'") } - if (is.na(match(time_dim, names(dim(obs2))))) { + if (is.na(match(time_dim, names(dim(obsL))))) { stop("Missing time dimension in 'obsL', or does not match the parameter 'time_dim'") } @@ -441,46 +440,65 @@ Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, exp_lats[length(exp_lats)]) } + obs_interpolated <- Interpolation(exp = obs_train, lats = obs_train_lats, lons = obs_train_lons, + target_grid = grid_exp, lat_dim = lat_dim, lon_dim = lon_dim, + method_remap = "conservative", region = region, + ncores = ncores) + + # If after interpolating 'obs' data the coordinates do not match, the exp data is interpolated to + # the same grid to force the matching + if (!.check_coords(lat1 = as.numeric(obs_interpolated$lat), + lat2 = exp_lats, + lon1 = as.numeric(obs_interpolated$lon), + lon2 = exp_lons)) { + exp_interpolated <- Interpolation(exp = exp, lats = exp_lats, lons = exp_lons, + target_grid = grid_exp, lat_dim = lat_dim, + lon_dim = lon_dim, method_remap = "conservative", + region = region, ncores = ncores)$data + } else { + exp_interpolated <- exp + } + # Create window if user does not have it in the training observations - if ( !("window" %in% names(dim(obs_train))) ) { - obs_train <- .generate_window(obj = obs_train, sdate_dim = sdate_dim, - time_dim = time_dim, loocv = loocv_window, - ncores = ncores) + if ( !("window" %in% names(dim(obs_interpolated$data))) ) { + obs_train_interpolated <- .generate_window(obj = obs_interpolated$data, sdate_dim = sdate_dim, + time_dim = time_dim, loocv = loocv_window, + ncores = ncores) if (!is.null(obsL)) { - obs_rs <- .generate_window(obj = obs, sdate_dim = sdate_dim, - time_dim = time_dim, - loocv = loocv_window, ncores = ncores) ## regional scale obs + if ( ("window" %in% names(dim(obs))) ) { + stop("Either both obs and obsL should include 'window' dimension or none.") + } } + obs_hres <- .generate_window(obj = obs, sdate_dim = sdate_dim, time_dim = time_dim, + loocv = loocv_window, ncores = ncores) + } else { - dim(obs_train) <- dim(ClimProjDiags::Subset(x = obs_train, - along = time_dim, indices = 1, drop = 'selected')) + obs_train_interpolated <- obs_interpolated$data + dim(obs_train_interpolated) <- dim(ClimProjDiags::Subset(x = obs_train_interpolated, + along = time_dim, indices = 1, drop = 'selected')) + if (!is.null(obsL)) { if ( !("window" %in% names(dim(obs))) ) { stop("Either both obs and obsL should include 'window' dimension or none.") } - obs_rs <- obs - dim(obs_rs) <- dim(obs_rs)[-which(names(dim(obs_rs)) == "time")] } + obs_hres <- obs + dim(obs_hres) <- dim(ClimProjDiags::Subset(x = obs_hres, + along = time_dim, indices = 1, drop = 'selected')) + } #----------------------------------- # Reshape train and test #----------------------------------- - if (is.null(obsL)) { - RES <- Apply(list(obs_train, exp), - target_dims = list(c("window", lat_dim, lon_dim), c(lat_dim, lon_dim)), - fun = function(tr, te) .analogs(train = tr, test = te, - k = nanalogs, fun_analog = fun_analog), - ncores = ncores) ## output1 -> data, output2 -> index, output3 -> metric - } else { - RES <- Apply(list(obs_train, exp, obs_rs), - target_dims = list(c("window", lat_dim, lon_dim), c(lat_dim, lon_dim), - c("window", lat_dim, lon_dim)), - fun = function(tr, te, ob) .analogs(train = tr, test = te, obs_rs = ob, - k = nanalogs, fun_analog = fun_analog), + RES <- Apply(list(obs_train_interpolated, exp_interpolated, obs_hres), + target_dims = list(c("window", lat_dim, lon_dim), c(lat_dim, lon_dim), + c("window", lat_dim, lon_dim)), + fun = function(tr, te, ob) .analogs(train = tr, test = te, obs_hres = ob, + k = nanalogs, metric = metric, fun_analog = fun_analog), ncores = ncores) ## output1 -> data, output2 -> index, output3 -> metric - } + res.data <- RES$output1 # Return the indices of the best analogs @@ -514,23 +532,17 @@ Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, } # For each element in test, find the indices of the k nearest neigbhors in train -.analogs <- function(train, test, obs_rs = NULL, k, fun_analog, - return_indices = FALSE) { - require(dtwclust) +.analogs <- function(train, test, obs_hres, k, fun_analog, metric = NULL, return_indices = FALSE) { # train and obs_rs dim: 3 dimensions window, lat and lon (in this order) # test dim: 2 dimensions lat and lon (in this order) # Number of lats/lons of the high-resolution data + space_dims_hres <- dim(obs_hres)[c(2,3)] - if (!is.null(obs_rs)) { - space_dims <- dim(obs_rs)[c(2,3)] - obs_rs <- apply(obs_rs, 1, as.vector); names(dim(obs_rs))[1] <- "space" - } else { - space_dims <- dim(train)[c(2,3)] - } # Reformat train and test as an array with (time, points) train <- apply(train, 1, as.vector); names(dim(train))[1] <- "space" test <- as.vector(test) + obs_hres <- apply(obs_hres, 1, as.vector); names(dim(obs_hres))[1] <- "space" # Identify and remove NA's dum<-which(!apply(train,2,function (x) all(is.na(x))))[1] ## the column in which NA in space @@ -538,16 +550,28 @@ Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, idx_na_tr <- is.na(train[ , dum]) # NA in space idy_na_tr <- is.na(train[1, ]) # NA in time idx_na_te <- is.na(test) - tr_wo_na <- t(train[!idx_na_tr , !idy_na_tr ]) - te_wo_na <- test[!idx_na_te] + idx_na <- idx_na_tr | idx_na_te + tr_wo_na <- t(train[!idx_na , !idy_na_tr ]) + te_wo_na <- test[!idx_na] te_wo_na <- InsertDim(data = te_wo_na, posdim = 1, lendim = 1, name = "time") if (all(is.na(test))) { - res <- array(NA, space_dims) + res <- array(NA, space_dims_hres) } else { - dtw_all <- apply(tr_wo_na, 1, function (x) dtw_basic(x, te_wo_na[1, ])) # dtw - best_vals <- head(sort(dtw_all), k) - idx <- match(best_vals, dtw_all) + if (metric == "rmse") { + rmse_all <- sqrt(rowMeans((sweep(tr_wo_na, 2, te_wo_na[1,]))^2)) # rmse + best_vals <- head(sort(rmse_all), k) + idx <- match(best_vals, rmse_all) + } else if (metric == "cor") { + cor_all <- apply(tr_wo_na, 1, function (x) cor(x,te_wo_na[1, ], method = "spearman")) # cor + best_vals <- head(sort(cor_all, decreasing = TRUE), k) + idx <- match(best_vals, cor_all) + } else if (metric == "dcor") { + require(energy) + dcor_all <- apply(tr_wo_na, 1, function (x) dcor(x,te_wo_na[1, ])) # dcor + best_vals <- head(sort(dcor_all, decreasing = TRUE), k,) + idx <- match(best_vals, dcor_all) + } if (isTRUE(any(idy_na_tr))) { dum <-(1:length(idy_na_tr))[!idy_na_tr] idx <- dum[idx] @@ -555,16 +579,17 @@ Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, res_ind <- array(idx, k) names(dim(res_ind)) <- c("index") res_metric <- array(best_vals, c(k)) - names(dim(res_metric)) <- c("metric") - if (!is.null(obs_rs)) { - res <- obs_rs[ , idx] - } else { - res <- train[ , idx] - } - dim(res) <- c(space_dims, analogs = k) + names(dim(res_metric)) <- c("metric") + res <- obs_hres[ , idx] + dim(res) <- c(space_dims_hres, analogs = k) + if (!is.null(fun_analog)) { if (fun_analog == "wmean") { - weight <- 1 / best_vals + if (metric == "dist" | metric == "rmse") { + weight <- 1 / best_vals + } else { + weight <- best_vals + } res <- apply(res, c(1,2), function(x) weighted.mean(x, weight)) } else if (fun_analog == "min") { res<-res[,,which.min(dist)] -- GitLab From 1455abbe5643fd819478d6adcb8e8621b0b983e9 Mon Sep 17 00:00:00 2001 From: eduzenli Date: Wed, 10 Jan 2024 15:50:46 +0100 Subject: [PATCH 6/6] dist instead of rmse, documentation arragement and fine tuning --- R/Analogs.R | 169 +++++++++++++++++++++++++++++----------------------- 1 file changed, 96 insertions(+), 73 deletions(-) diff --git a/R/Analogs.R b/R/Analogs.R index b141b76..a5ce1ce 100644 --- a/R/Analogs.R +++ b/R/Analogs.R @@ -6,7 +6,7 @@ #'@description This function performs a downscaling using Analogs. To compute #'the analogs given a coarse-scale field, the function looks for days with similar conditions #'in the historical observations. The analogs function determines the N best analogs based -#'on RMSE, distance correlation, or Spearman's correlation metrics. To downscale +#'on Euclidian distance, distance correlation, or Spearman's correlation metrics. To downscale #'a local-scale variable, either the variable itself or another large-scale variable #'can be utilized as the predictor. In the first scenario, analogs are examined between #'the observation and model data of the same local-scale variable. In the latter scenario, @@ -15,9 +15,9 @@ #'the best analog, the function extracts the corresponding local-scale variable for that day #'from the observation of the local scale variable. The used local-scale and large-scale #'variables can be retrieved from independent regions. The input data for the first case must -#'include 'exp' and 'obs,' while in the second case, 'obs,' 'obsL,' and 'expL' are the +#'include 'exp' and 'obs,' while in the second case, 'obs,' 'obsL,' and 'exp' are the #'required input fields. Users can perform the downscaling process over the subregions -#'that can be identified through the 'region' and 'regionL' arguments, instead of focusing +#'that can be identified through the 'region' argument, instead of focusing #'on the entire area of the loaded data. #' #'The search of analogs must be done in the longest dataset posible, but might @@ -29,25 +29,20 @@ #'is intended to downscale climate prediction data (i.e., sub-seasonal, seasonal #'and decadal predictions) but can admit climate projections or reanalyses. It does #'not have constrains of specific region or variables to downscale. -#'@param exp an 's2dv_cube' object with named dimensions containing the experimental field on -#'the coarse scale for which the downscaling is aimed. The object must have, at least, -#'the dimensions latitude, longitude, start date and time. The object is expected to be -#'already subset for the desired region. Data can be in one or two integrated regions, e.g., -#'crossing the Greenwich meridian. To get the correct results in the latter case, -#'the borders of the region should be specified in the parameter 'region'. See parameter -#''region'. -#'@param obs an 's2dv_cube' object with named dimensions containing the observational field. -#'The object must have, at least, the dimensions latitude, longitude and start date. -#'The object is expected to be already subset for the desired region. -#'@param expL an 's2dv_cube' object with named dimensions containing the experimental -#'field of the large-scale variable on the coarse scale. The object must have, at least, -#'the dimensions latitude, longitude, start date and time. The object is expected to be -#'already subset for the desired region. Data can be in one or two integrated regions, -#'e.g., crossing the Greenwich meridian. To get the correct results in the latter case, -#'the borders of the region should be specified in the parameter regionL'. See -#''parameter regionL'. +#'@param exp an 's2dv_cube' object with named dimensions containing the experimental field +#'on the coarse scale for the variable targeted for downscaling (in case obsL is not provided) +#'or for the large-scale variable used as the predictor (if obsL is provided). +#'The object must have, at least, the dimensions latitude, longitude, start date and time. +#'The object is expected to be already subset for the desired region. Data can be in one +#'or two integrated regions, e.g., crossing the Greenwich meridian. To get the correct +#'results in the latter case, the borders of the region should be specified in the parameter +#''region'. See parameter 'region'. +#'@param obs an 's2dv_cube' object with named dimensions containing the observational field +#'for the variable targeted for downscaling. The object must have, at least, the dimensions +#'latitude, longitude and start date. The object is expected to be already subset for the +#'desired region. #'@param obsL an 's2dv_cube' object with named dimensions containing the observational -#'field of the large-scale variable.The object must have, at least, the dimensions latitude, +#'field of the large-scale variable. The object must have, at least, the dimensions latitude, #'longitude and start date. The object is expected to be already subset for the desired region. #'@param grid_exp a character vector with a path to an example file of the exp (if the #'predictor is the local scale variable) or expL (if the predictor is a large scale variable) @@ -69,8 +64,8 @@ #'@param member_dim a character vector indicating the member dimension name in the element #''data' in exp and obs. Default set to "member". #'@param metric a character vector to select the analog specification method. Only these -#'options are valid: "rmse", "dcor" (i.e., distance correlation) or "cor" (i.e., Spearman's -#'.correlation) The default metric is "rmse". +#'options are valid: "dist" (i.e., Euclidian distance), "dcor" (i.e., distance correlation) +#'or "cor" (i.e., Spearman's .correlation). The default metric is "dist". #'@param region a numeric vector indicating the borders of the region defined in exp. #'It consists of four elements in this order: lonmin, lonmax, latmin, latmax. lonmin refers #'to the left border, while lonmax refers to the right border. latmin indicates the lower @@ -96,13 +91,13 @@ #'dim(obs) <- c(lat = 12, lon = 15, sdate = 5, time = 30) #'obs_lons <- seq(0,6, 6/14) #'obs_lats <- seq(0,6, 6/11) -#'exp <- s2dv_cube(data = exp, lat = exp_lats, lon = exp_lons) -#'obs <- s2dv_cube(data = obs, lat = obs_lats, lon = obs_lons) +#'exp <- s2dv_cube(data = exp, coords = list(lat = exp_lats, lon = exp_lons)) +#'obs <- s2dv_cube(data = obs, coords = list(lat = obs_lats, lon = obs_lons)) #'downscaled_field <- CST_Analogs(exp = exp, obs = obs, grid_exp = 'r360x180') #'@export -CST_Analogs <- function(exp = NULL, obs, obsL = NULL, grid_exp, nanalogs = 3, +CST_Analogs <- function(exp, obs, obsL = NULL, grid_exp, nanalogs = 3, fun_analog = NULL, lat_dim = "lat", lon_dim = "lon", sdate_dim = "sdate", - time_dim = "time", member_dim = "member", metric = "rmse", region = NULL, + time_dim = "time", member_dim = "member", metric = "dist", region = NULL, return_indices = FALSE, loocv_window = TRUE, ncores = NULL) { # input exp and obs must be s2dv_cube objects @@ -127,7 +122,7 @@ CST_Analogs <- function(exp = NULL, obs, obsL = NULL, grid_exp, nanalogs = 3, obsL_lats = obsL$coords[[lat_dim]], obsL_lons = obsL$coords[[lon_dim]], grid_exp = grid_exp, nanalogs = nanalogs, fun_analog = fun_analog, lat_dim = lat_dim, lon_dim = lon_dim, sdate_dim = sdate_dim, - time_dim = time_dim, member_dim = member_dim, metric = "rmse", + time_dim = time_dim, member_dim = member_dim, metric = metric, region = region, return_indices = return_indices, loocv_window = loocv_window, ncores = ncores) @@ -178,19 +173,20 @@ CST_Analogs <- function(exp = NULL, obs, obsL = NULL, grid_exp, nanalogs = 3, #'is intended to downscale climate prediction data (i.e., sub-seasonal, seasonal #'and decadal predictions) but can admit climate projections or reanalyses. It does #'not have constrains of specific region or variables to downscale. -#'@param exp an array with named dimensions containing the experimental field on the -#'coarse scale for which the downscaling is aimed. The object must have, at least, -#'the dimensions latitude, longitude, start date and time. The object is expected to be -#'already subset for the desired region. Data can be in one or two integrated regions, e.g., -#'crossing the Greenwich meridian. To get the correct results in the latter case, -#'the borders of the region should be specified in the parameter 'region'. See parameter -#''region'. -#'@param obs an array with named dimensions containing the observational field. The object -#'must have, at least, the dimensions latitude, longitude, start date and time. The object -#'is expected to be already subset for the desired region. Optionally, 'obs' can have the -#'dimension 'window', containing the sampled fields into which the function will look for -#'the analogs. See function 'generate_window()'. Otherwise, the function will look for -#'analogs using all the possible fields contained in obs. +#'@param exp an array with named dimensions containing the experimental field +#'on the coarse scale for the variable targeted for downscaling (in case obsL is not provided) +#'or for the large-scale variable used as the predictor (if obsL is provided). +#'The object must have, at least, the dimensions latitude, longitude, start date and time. +#'The object is expected to be already subset for the desired region. Data can be in one +#'or two integrated regions, e.g., crossing the Greenwich meridian. To get the correct +#'results in the latter case, the borders of the region should be specified in the parameter +#''region'. See parameter 'region'. +#'@param obs an array with named dimensions containing the observational field for the variable +#'targeted for downscaling. The object must have, at least, the dimensions latitude, longitude, +#'start date and time. The object is expected to be already subset for the desired region. +#'Optionally, 'obs' can have the dimension 'window', containing the sampled fields into which +#'the function will look for the analogs. See function 'generate_window()'. Otherwise, +#'the function will look for analogs using all the possible fields contained in obs. #'@param exp_lats a numeric vector containing the latitude values in 'exp'. Latitudes must #'range from -90 to 90. #'@param exp_lons a numeric vector containing the longitude values in 'exp'. Longitudes @@ -199,20 +195,12 @@ CST_Analogs <- function(exp = NULL, obs, obsL = NULL, grid_exp, nanalogs = 3, #'range from -90 to 90. #'@param obs_lons a numeric vector containing the longitude values in 'obs'. Longitudes #'can range from -180 to 180 or from 0 to 360. -#'@param expL an 's2dv_cube' object with named dimensions containing the experimental -#'field of the large-scale variable on the coarse scale. The object must have, at least, -#'the dimensions latitude, longitude, start date and time. The object is expected to be -#'already subset for the desired region. Data can be in one or two integrated regions, -#'e.g., crossing the Greenwich meridian. To get the correct results in the latter case, -#'the borders of the region should be specified in the parameter regionL'. See -#''parameter regionL'. #'@param obsL an 's2dv_cube' object with named dimensions containing the observational #'field of the large-scale variable.The object must have, at least, the dimensions latitude, -#'longitude and start date. The object is expected to be already subset for the desired region -#'@param expL_lats a numeric vector containing the latitude values in 'expL'. Latitudes must -#'range from -90 to 90. -#'@param expL_lons a numeric vector containing the longitude values in 'expL'. Longitudes -#'can range from -180 to 180 or from 0 to 360. +#'longitude and start date. The object is expected to be already subset for the desired region. +#'Optionally, 'obsL' can have the dimension 'window', containing the sampled fields into which +#'the function will look for the analogs. See function 'generate_window()'. Otherwise, +#'the function will look for analogs using all the possible fields contained in obs. #'@param obsL_lats a numeric vector containing the latitude values in 'obsL'. Latitudes must #'range from -90 to 90. #'@param obsL_lons a numeric vector containing the longitude values in 'obsL'. Longitudes @@ -237,18 +225,13 @@ CST_Analogs <- function(exp = NULL, obs, obsL = NULL, grid_exp, nanalogs = 3, #'@param member_dim a character vector indicating the member dimension name in the element #''data' in exp and obs. Default set to "member". #'@param metric a character vector to select the analog specification method. Only these -#'options are valid: "rmse", "dcor" (i.e., distance correlation) or "cor" (i.e., Spearman's -#'.correlation) The default metric is "rmse". +#'options are valid: "dist" (i.e., Euclidian distance), "dcor" (i.e., distance correlation) +#'or "cor" (i.e., Spearman's .correlation). The default metric is "dist". #'@param region a numeric vector indicating the borders of the region defined in exp. #'It consists of four elements in this order: lonmin, lonmax, latmin, latmax. lonmin refers #'to the left border, while lonmax refers to the right border. latmin indicates the lower #'border, whereas latmax indicates the upper border. If set to NULL (default), the function #'takes the first and last elements of the latitudes and longitudes. -#'@param regionL a numeric vector indicating the borders of the region defined in expL. -#'It consists of four elements in this order: lonmin, lonmax, latmin, latmax. lonmin refers -#'to the left border, while lonmax refers to the right border. latmin indicates the lower -#'border, whereas latmax indicates the upper border. If set to NULL (default), the function -#'takes the first and last elements of the latitudes and longitudes of the large-scale region. #'@param return_indices a logical vector indicating whether to return the indices of the #'analogs together with the downscaled fields. The indices refer to the position of the #'element in the vector time * start_date. If 'obs' contain the dimension 'window', it will @@ -281,10 +264,10 @@ CST_Analogs <- function(exp = NULL, obs, obsL = NULL, grid_exp, nanalogs = 3, #'downscaled_field <- Analogs(exp = exp, obs = obs, exp_lats = exp_lats, exp_lons = exp_lons, #'obs_lats = obs_lats, obs_lons = obs_lons, grid_exp = 'r360x180') #'@export -Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, obs_lons, +Analogs <- function(exp, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, obs_lons, grid_exp, obsL = NULL, obsL_lats = NULL, obsL_lons = NULL, nanalogs = 3, fun_analog = NULL, lat_dim = "lat", lon_dim = "lon", sdate_dim = "sdate", - time_dim = "time", member_dim = "member", metric = "rmse", region = NULL, + time_dim = "time", member_dim = "member", metric = "dist", region = NULL, return_indices = FALSE, loocv_window = TRUE, ncores = NULL) { #----------------------------------- # Checkings @@ -405,6 +388,11 @@ Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, stopifnot(fun_analog %in% c("mean", "wmean", "max", "min", "median")) } + # metric method to be used to specify the analogs + stopifnot(metric %in% c("cor", "dcor", "dist")) + + + if (!is.null(obsL)) { obs_train <- obsL obs_train_lats <- obsL_lats @@ -422,14 +410,15 @@ Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, } # crop downscaling region, if the argument region is provided. - if (!is.null(region)) { + if (!is.null(region) & is.null(obsL)) { # if a border is equally distant from two different grids, the map will be cropped from the grid having smaller coordinate + a <- which.min(abs((region[1]-obs_lons))) b <- which.min(abs((region[2]-obs_lons))) c <- which.min(abs((region[3]-obs_lats))) d <- which.min(abs((region[4]-obs_lats))) obs <- ClimProjDiags::Subset(x = obs, along = list(lon_dim,lat_dim), - indices = list(a:b,c:d), drop = 'selected') + indices = list(a:b,c:d), drop = 'selected') } if (is.null(region)) { @@ -558,17 +547,17 @@ Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, if (all(is.na(test))) { res <- array(NA, space_dims_hres) } else { - if (metric == "rmse") { - rmse_all <- sqrt(rowMeans((sweep(tr_wo_na, 2, te_wo_na[1,]))^2)) # rmse - best_vals <- head(sort(rmse_all), k) - idx <- match(best_vals, rmse_all) + if (metric == "dist") { + dist_all <- sqrt(rowSums((sweep(tr_wo_na, 2, te_wo_na[1,]))^2)) # euc dist + best_vals <- head(sort(dist_all), k) + idx <- match(best_vals, dist_all) } else if (metric == "cor") { cor_all <- apply(tr_wo_na, 1, function (x) cor(x,te_wo_na[1, ], method = "spearman")) # cor best_vals <- head(sort(cor_all, decreasing = TRUE), k) idx <- match(best_vals, cor_all) } else if (metric == "dcor") { - require(energy) - dcor_all <- apply(tr_wo_na, 1, function (x) dcor(x,te_wo_na[1, ])) # dcor +# require(energy) + dcor_all <- apply(tr_wo_na, 1, function (x) .dcor(x,te_wo_na[1, ])) # dcor best_vals <- head(sort(dcor_all, decreasing = TRUE), k,) idx <- match(best_vals, dcor_all) } @@ -585,16 +574,16 @@ Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, if (!is.null(fun_analog)) { if (fun_analog == "wmean") { - if (metric == "dist" | metric == "rmse") { + if (metric == "dist") { weight <- 1 / best_vals } else { weight <- best_vals } res <- apply(res, c(1,2), function(x) weighted.mean(x, weight)) } else if (fun_analog == "min") { - res<-res[,,which.min(dist)] + res <- res[, , which.min(best_vals)] } else if (fun_analog == "max") { - res<-res[,,which.max(dist)] + res <- res[, , which.max(best_vals)] } else { res <- apply(res, c(1,2), fun_analog) } @@ -653,3 +642,37 @@ Analogs <- function(exp = NULL, obs, exp_lats = NULL, exp_lons = NULL, obs_lats, return(obj_window) } +# Distance correlation function +.dcor <- function(x, y) { + n <- length(x) + + # Calculate Euclidean distances for x + dist_x <- as.matrix(dist(matrix(x))) + # Calculate Euclidean distances for y + dist_y <- as.matrix(dist(matrix(y))) + + # Centering matrices + H <- diag(n) - 1/n + + # Centered distance matrices + dist_centered_x <- H %*% dist_x %*% H + dist_centered_y <- H %*% dist_y %*% H + + # Calculate the product of mean-centered distance matrices + B <- dist_centered_x %*% t(dist_centered_y) + C <- dist_centered_x %*% t(dist_centered_x) + D <- dist_centered_y %*% t(dist_centered_y) + + # Calculate the distance covariance + dcov_xy <- sqrt(sum(diag(B))) + + # Calculate the variances + cov_xx <- sqrt(sum(diag(C))) + cov_yy <- sqrt(sum(diag(D))) + + # Calculate the distance correlation + dcor_xy <- dcov_xy / sqrt(cov_xx * cov_yy) + + return(dcor_xy) +} + -- GitLab