From 917a58d05e0a54a9904295d0cf8fac8bf1807edb Mon Sep 17 00:00:00 2001 From: eduzenli Date: Tue, 9 Apr 2024 17:19:11 +0200 Subject: [PATCH 1/4] daily data downscaling bug is fixed for linear and logistic regression. Also, find.nn function was fixed --- R/Intlr.R | 93 ++++++++++++++++++++++++++++++++--------- R/LogisticReg.R | 109 +++++++++++++++++++++++++++++++++++++----------- 2 files changed, 157 insertions(+), 45 deletions(-) diff --git a/R/Intlr.R b/R/Intlr.R index 953031f..32490f1 100644 --- a/R/Intlr.R +++ b/R/Intlr.R @@ -610,19 +610,33 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo } # Apply the linear regressions - # case hindcast - forecast + ## case hindcast - forecast if (!is.null(exp_cor)) { + if ( time_dim %in% names(dim(predictor)) ) { + if (dim(predictor)[time_dim] > 1) { + target_dims_predictor <- c(target_dims_predictor, time_dim) + target_dims_predictand <- c(target_dims_predictand, time_dim) + target_dims_forecast <- c(target_dims_forecast, time_dim) + } + } res <- Apply(list(predictor, predictand, forecast), target_dims = list(target_dims_predictor, target_dims_predictand, target_dims_forecast), - fun = .intlr, loocv = loocv, aux_dim = aux_dim, ncores = ncores)$output1 + fun = .intlr, loocv = loocv, aux_dim = aux_dim, ncores = ncores, + sdate_dim = sdate_dim, time_dim = time_dim)$output1 } - # case hindcast only + ## case hindcast only else { + if ( time_dim %in% names(dim(predictor)) ) { + if (dim(predictor)[time_dim] > 1) { + target_dims_predictor <- c(target_dims_predictor, time_dim) + target_dims_predictand <- c(target_dims_predictand, time_dim) + } + } res <- Apply(list(predictor, predictand), target_dims = list(target_dims_predictor, target_dims_predictand), - fun = .intlr, loocv = loocv, ncores = ncores)$output1 - names(dim(res))[1] <- sdate_dim + fun = .intlr, loocv = loocv, ncores = ncores, sdate_dim = sdate_dim, + time_dim = time_dim)$output1 } # restore ensemble dimension in observations if it existed originally @@ -652,19 +666,39 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo #----------------------------------- # Atomic function to generate and apply the linear regressions #----------------------------------- -.intlr <- function(x, y, f = NULL, loocv, aux_dim = NULL) { +.intlr <- function(x, y, f = NULL, loocv, aux_dim = NULL, sdate_dim = "sdate", time_dim = "time") { require (plyr) + k_out <- 1 ## leave ONE out + daily <- FALSE + if (any(names(dim(x)) == time_dim)) { ## IF DAILY DATA IS PROVIDED + daily <- TRUE ## in order to arrange the res dimension, when loocv is not TRUE + k_out <- as.numeric (dim(x)[time_dim]) ## for daily, to omit the data of the corresponding year. + x <- MergeDims (x, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) + y <- MergeDims (y, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) + if (!is.null(f)) { + f <- MergeDims (f, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) + } + } + if (!is.null(f)) { if (!is.null(aux_dim)) { + if (length(dim(x)) > 1 & daily) { + x <- aperm (x, c(3,1,2)) ## if 9nn or large scale, dimensions: sdate, nn, member + f <- aperm (f, c(3,1,2)) ## if 9nn or large scale, dimensions: sdate, nn, member + } tmp_df <- data.frame(x = adply(x,.margins = 3, .id = NULL), y = y) } else { tmp_df <- data.frame(x = as.vector(x), y = y) } } else { + if (length(dim(x)) > 1 & daily) { + x <- aperm (x, c(2,1)) ## if 9nn or large scale, dimensions: sdate, nn + } tmp_df <- data.frame(x = x, y = y) } + # if the data is all NA, force return return NA if (all(is.na(tmp_df)) | (sum(apply(tmp_df, 2, function(x) !all(is.na(x)))) == 1)) { if (is.null(f)) { @@ -677,7 +711,6 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo } } } else { - # training ## apply principle components if method is 9nn if (any(names(dim(x)) == "nn")) { @@ -696,11 +729,27 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo colnames(tmp_df) <- c(paste0("x.",1:(ncol(tmp_df)-1)), "y") } - lm1 <- .train_lm(df = tmp_df, loocv = loocv) + # training + lm1 <- .train_lm(df = tmp_df, loocv = loocv, k_out = k_out) # prediction - res <- .pred_lm(lm1 = lm1, df = tmp_df, f = f, loocv = loocv, aux_dim = aux_dim) + res <- .pred_lm(lm1 = lm1, df = tmp_df, f = f, loocv = loocv, aux_dim = aux_dim, k_out = k_out) } + + ## dimension naming + if (daily & is.null(f)) { + if (!loocv) { + dim (res) <- c(k_out, length(res)/k_out) + } + names(dim(res)) <- c(time_dim, sdate_dim) + } else if (daily & !is.null (f)) { + sdate_num <- as.numeric (dim(res)[sdate_dim])/k_out + res <- SplitDim (res, indices = rep(1:k_out, sdate_num), + new_dim_name = time_dim, split_dim = sdate_dim) + } else if (!daily & is.null(f)) { + dim(res) <- length(res) + names(dim(res)) <- sdate_dim + } ## if !daily & !is.null(f), the function already correctly assign the dimension names return(res) } @@ -709,18 +758,20 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo # Function to generate the linear regressions. # Returns a list #----------------------------------- -.train_lm <- function(df, loocv) { +.train_lm <- function(df, loocv, k_out = 1) { # Remove predictor columns containing only NA's - df <- df[ ,apply(as.matrix(df[,colnames(df) != 'y'],nrow(df),ncol(df)-1), 2, function(x) !all(is.na(x)))] + df <- df[ ,apply(as.matrix(df[,colnames(df) != 'y'],nrow(df),ncol(df)-1), 2, + function(x) !all(is.na(x)))] if (loocv) { - - lm1 <- lapply(1:nrow(df), function(j) { - if (all(is.na(df[-j,]$y))) { + + lm1 <- lapply(1:(nrow(df)/k_out), function(j) { + window <- ((j-1)*k_out+1):((j-1)*k_out + k_out) # omit the data of the year including corresponding day + if (all(is.na(df[-window ,]$y))) { return(NA) } else { - return(lm(df[-j,], formula = y ~ .)) + return(lm(df[-window,], formula = y ~ .)) }}) } else { @@ -733,16 +784,18 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo #----------------------------------- # Function to apply the linear regressions. #----------------------------------- -.pred_lm <- function(df, lm1, f, loocv, aux_dim) { +.pred_lm <- function(df, lm1, f, loocv, aux_dim, k_out = 1) { require (plyr) + if (loocv) { - pred_vals <- sapply(1:nrow(df), function(j) { + pred_vals <- sapply(1:length(lm1), function(j) { if (all(is.na(lm1[[j]]))) { - return(NA) + return(rep(NA, k_out)) } else { - return(predict(lm1[[j]], df[j,])) + window <- ((j-1)*k_out+1):((j-1)*k_out + k_out) # test the daily data of the corresponding year + return(predict(lm1[[j]], df[window,])) }}) } else { if (!is.na(lm1)) { @@ -799,7 +852,7 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo names(dim(dist_id)) <- c("nn", lon_dim) idh <- SplitDim(dist_id, split_dim = lon_dim, - indices = rep(1:(length(lons_hres)),length(lats_hres)), + indices = rep(1:(length(lats_hres)),length(lons_hres)), new_dim_name = lat_dim) idh <- aperm (idh, c(1,3,2)) # dimension: nn, lat_dim, lon_dim diff --git a/R/LogisticReg.R b/R/LogisticReg.R index f74f2ee..91fde87 100644 --- a/R/LogisticReg.R +++ b/R/LogisticReg.R @@ -56,7 +56,9 @@ #'@param sdate_dim a character vector indicating the start date dimension name in the element #''data' in exp and obs. Default set to "sdate". #'@param member_dim a character vector indicating the member dimension name in the element -#''data' in exp and obs. Default set to "member". +#''data' in exp and obs. Default set to "member". +#'@param time_dim a character vector indicating the time dimension name in the element +#''data' in exp and obs. Default set to "time". #'@param region a numeric vector indicating the borders of the region defined in obs. #'It consists of four elements in this order: lonmin, lonmax, latmin, latmax. lonmin refers #'to the left border, while lonmax refers to the right border. latmin indicates the lower @@ -94,8 +96,8 @@ CST_LogisticReg <- function(exp, obs, exp_cor = NULL, target_grid, int_method = log_reg_method = "ens_mean", probs_cat = c(1/3,2/3), return_most_likely_cat = FALSE, points = NULL, method_point_interp = NULL, lat_dim = "lat", lon_dim = "lon", - sdate_dim = "sdate", member_dim = "member", region = NULL, - loocv = TRUE, ncores = NULL) { + sdate_dim = "sdate", member_dim = "member", time_dim = "time", + region = NULL, loocv = TRUE, ncores = NULL) { if (!inherits(exp,'s2dv_cube')) { stop("Parameter 'exp' must be of the class 's2dv_cube'") @@ -105,14 +107,16 @@ CST_LogisticReg <- function(exp, obs, exp_cor = NULL, target_grid, int_method = stop("Parameter 'obs' must be of the class 's2dv_cube'") } - res <- LogisticReg(exp = exp$data, obs = obs$data, exp_cor = exp_cor$data, exp_lats = exp$coords[[lat_dim]], - exp_lons = exp$coords[[lon_dim]], obs_lats = obs$coords[[lat_dim]], - obs_lons = obs$coords[[lon_dim]], target_grid = target_grid, - probs_cat = probs_cat, return_most_likely_cat = return_most_likely_cat, + res <- LogisticReg(exp = exp$data, obs = obs$data, exp_cor = exp_cor$data, + exp_lats = exp$coords[[lat_dim]], exp_lons = exp$coords[[lon_dim]], + obs_lats = obs$coords[[lat_dim]], obs_lons = obs$coords[[lon_dim]], + target_grid = target_grid, probs_cat = probs_cat, + return_most_likely_cat = return_most_likely_cat, int_method = int_method, log_reg_method = log_reg_method, points = points, method_point_interp = method_point_interp, lat_dim = lat_dim, lon_dim = lon_dim, sdate_dim = sdate_dim, member_dim = member_dim, - source_file_exp = exp$attrs$source_files[1], source_file_obs = obs$attrs$source_files[1], + time_dim = time_dim, source_file_exp = exp$attrs$source_files[1], + source_file_obs = obs$attrs$source_files[1], region = region, loocv = loocv, ncores = ncores) # Modify data, lat and lon in the origina s2dv_cube, adding the downscaled data @@ -206,7 +210,9 @@ CST_LogisticReg <- function(exp, obs, exp_cor = NULL, target_grid, int_method = #'@param sdate_dim a character vector indicating the start date dimension name in the element #''data' in exp and obs. Default set to "sdate". #'@param member_dim a character vector indicating the member dimension name in the element -#''data' in exp and obs. Default set to "member". +#''data' in exp and obs. Default set to "member". +#'@param time_dim a character vector indicating the time dimension name in the element +#''data' in exp and obs. Default set to "time". #'@param source_file_exp a character vector with a path to an example file of the exp data. #'Only needed if the downscaling is to a point location. #'@param source_file_obs a character vector with a path to an example file of the obs data. @@ -247,7 +253,7 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, target_grid, int_method = NULL, log_reg_method = "ens_mean", probs_cat = c(1/3,2/3), return_most_likely_cat = FALSE, points = NULL, method_point_interp = NULL, lat_dim = "lat", lon_dim = "lon", - sdate_dim = "sdate", member_dim = "member", + sdate_dim = "sdate", member_dim = "member", time_dim = "time", source_file_exp = NULL, source_file_obs = NULL, region = NULL, loocv = TRUE, ncores = NULL) { @@ -488,21 +494,54 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, as.integer(apply(terc, 1, function(r) which (r == 1)))}}, output_dims = sdate_dim, ncores = ncores)$output1 + target_dims_predictand <- sdate_dim + # Apply the logistic regressions + ## case hindcast only if (is.null(exp_cor)) { - res <- Apply(list(predictor, obs_cat), target_dims = list(target_dims_predictor, sdate_dim), + if ( time_dim %in% names(dim(predictor)) ) { + if (dim(predictor)[time_dim] > 1) { + target_dims_predictor <- c(target_dims_predictor, time_dim) + target_dims_predictand <- c(target_dims_predictand, time_dim) + } + } + res <- Apply(list(predictor, obs_cat), + target_dims = list(target_dims_predictor, target_dims_predictand), fun = function(x, y) .log_reg(x = x, y = y, loocv = loocv,probs_cat = probs_cat), output_dims = c(sdate_dim, "category"), ncores = ncores)$output1 + if ( time_dim %in% names(dim(predictor)) ) { + if (dim(predictor)[time_dim] > 1) { + res <- SplitDim(res, split_dim = sdate_dim, new_dim_name = time_dim, + indices = rep(1:(dim(predictor)[time_dim]), dim(predictor)[sdate_dim])) + } + } + if (return_most_likely_cat) { res <- Apply(res, target_dims = c(sdate_dim, "category"), .most_likely_category, output_dims = sdate_dim, ncores = ncores)$output1 } - } else { + } + ## case hindcast - forecast + else { + if ( time_dim %in% names(dim(predictor)) ) { + if (dim(predictor)[time_dim] > 1) { + target_dims_predictor <- c(target_dims_predictor, time_dim) + target_dims_predictand <- c(target_dims_predictand, time_dim) + target_dims_forecast <- c(target_dims_forecast, time_dim) + } + } res <- Apply(list(predictor, obs_cat, forecast), - target_dims = list(target_dims_predictor, sdate_dim, target_dims_forecast), + target_dims = list(target_dims_predictor, target_dims_predictand, + target_dims_forecast), fun = function(x, y, f) .log_reg(x = x, y = y, f = f, loocv = loocv, probs_cat = probs_cat), output_dims = c(sdate_dim, "category"), ncores = ncores)$output1 + if ( time_dim %in% names(dim(predictor)) ) { + if (dim(predictor)[time_dim] > 1) { + res <- SplitDim(res, split_dim = sdate_dim, new_dim_name = time_dim, + indices = rep(1:(dim(predictor)[time_dim]), dim(forecast)[sdate_dim])) + } + } } # restore ensemble dimension in observations if it existed originally @@ -561,7 +600,22 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, # atomic functions for logistic regressions .log_reg <- function(x, y, f = NULL, loocv, probs_cat) { - + + k_out <- 1 ## leave ONE out if the data is not daily. + daily <- FALSE + if (any(names(dim(x)) == time_dim)) { ## IF DAILY DATA IS PROVIDED + daily <- TRUE ## in order to arrange the res dimension, when loocv is not TRUE + k_out <- as.numeric (dim(x)[time_dim]) ## for daily, to omit the data of the corresponding year. + x <- MergeDims (x, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) + y <- MergeDims (y, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) + if (length(dim(x)) > 1) { + x <- aperm (x, c(2,1)) ## if ens_mean_sd or sorted_members, dimensions: sdate, pred or member + } + if (!is.null(f)) { + f <- MergeDims (f, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) + } + } + tmp_df <- data.frame(x = x, y = y) # if the data is all NA, force return return NA @@ -581,10 +635,11 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, } else { # training - lm1 <- .train_lr(df = tmp_df, loocv = loocv) + lm1 <- .train_lr(df = tmp_df, loocv = loocv, k_out) # prediction - res <- .pred_lr(lm1 = lm1, df = tmp_df, f = f, loocv = loocv, probs_cat = probs_cat) + res <- .pred_lr(lm1 = lm1, df = tmp_df, f = f, loocv = loocv, + probs_cat = probs_cat, k_out = k_out) if ( !(is.null(f)) ) { ## if forecast is provided, and syear of forecast is 1. if ( nrow(f) == 1 ) { @@ -599,7 +654,7 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, #----------------------------------- # Function to train the logistic regressions. #----------------------------------- -.train_lr <- function(df, loocv) { +.train_lr <- function(df, loocv, k_out = 1) { require(nnet) @@ -608,7 +663,9 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, if (loocv) { - lm1 <- lapply(1:nrow(df), function(j) multinom(y ~ ., data = df[ -j, ], trace = FALSE)) + lm1 <- lapply(1:(nrow(df)/k_out), function(j) { + window <- ((j-1)*k_out+1):((j-1)*k_out + k_out) # omit the data of the year including corresponding day + multinom(y ~ ., data = df[ -window, ], trace = FALSE)}) } else { @@ -622,7 +679,7 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, #----------------------------------- # Function to apply the logistic regressions. #----------------------------------- -.pred_lr <- function(df, lm1, f, loocv, probs_cat) { +.pred_lr <- function(df, lm1, f, loocv, probs_cat, k_out = 1) { require(plyr) @@ -631,16 +688,18 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, # The error: "Error: Results must have the same dimensions." can # appear when the number of sdates is insufficient - pred_vals_ls <-list() - for (j in 1:nrow(df)) { - if (all(is.na(df[j, ]))) { - pred_vals_ls[[j]] <- rep(NA, length(probs_cat) + 1) + pred_vals_ls <- list() + for (j in 1:length(lm1)) { + window <- ((j-1)*k_out+1):((j-1)*k_out + k_out) # test the daily data of the corresponding year + if (all(is.na(df[window, ]))) { + pred_vals_ls[[j]] <- array(rep(NA, length(window) * (length(probs_cat) + 1)), + dim = c(length(window), length(probs_cat) + 1)) } else { - pred_vals_ls[[j]] <- predict(lm1[[j]], df[j,], type = "probs") + pred_vals_ls[[j]] <- predict(lm1[[j]], df[window,], type = "probs") } } - pred_vals <- laply(pred_vals_ls, .fun = as.array) + pred_vals <- do.call(rbind, pred_vals_ls) if( length(probs_cat) + 1 == 2) { pred_vals_dum<-array(NA, dim=c(nrow(df), 2)) -- GitLab From ba142910236f3b868ecaed4039692159850d6fb8 Mon Sep 17 00:00:00 2001 From: eduzenli Date: Fri, 12 Apr 2024 18:36:26 +0200 Subject: [PATCH 2/4] daily dwn version 2 --- R/Intlr.R | 61 +++++++++++++++++++++++-------------------------------- 1 file changed, 25 insertions(+), 36 deletions(-) diff --git a/R/Intlr.R b/R/Intlr.R index 32490f1..c7705a3 100644 --- a/R/Intlr.R +++ b/R/Intlr.R @@ -608,16 +608,14 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo stop(paste0(lr_method, " method is not implemented yet")) } - + # Apply the linear regressions ## case hindcast - forecast if (!is.null(exp_cor)) { if ( time_dim %in% names(dim(predictor)) ) { - if (dim(predictor)[time_dim] > 1) { - target_dims_predictor <- c(target_dims_predictor, time_dim) - target_dims_predictand <- c(target_dims_predictand, time_dim) - target_dims_forecast <- c(target_dims_forecast, time_dim) - } + target_dims_predictor <- c(target_dims_predictor, time_dim) + target_dims_predictand <- c(target_dims_predictand, time_dim) + target_dims_forecast <- c(target_dims_forecast, time_dim) } res <- Apply(list(predictor, predictand, forecast), target_dims = list(target_dims_predictor, target_dims_predictand, @@ -628,15 +626,15 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo ## case hindcast only else { if ( time_dim %in% names(dim(predictor)) ) { - if (dim(predictor)[time_dim] > 1) { - target_dims_predictor <- c(target_dims_predictor, time_dim) - target_dims_predictand <- c(target_dims_predictand, time_dim) - } + output_dims <- c(time_dim, sdate_dim) + target_dims_predictor <- c(target_dims_predictor, time_dim) + target_dims_predictand <- c(target_dims_predictand, time_dim) } res <- Apply(list(predictor, predictand), target_dims = list(target_dims_predictor, target_dims_predictand), fun = .intlr, loocv = loocv, ncores = ncores, sdate_dim = sdate_dim, - time_dim = time_dim)$output1 + time_dim = time_dim, + output_dims = output_dims)$output1 } # restore ensemble dimension in observations if it existed originally @@ -670,11 +668,11 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo require (plyr) - k_out <- 1 ## leave ONE out + k_out <- 1 ## number of daily data in a year. It is useful in constructing window for loocv = TRUE case. k_out is also used to order the dimensions of pred.lm output for both loocv = TRUE and loocv = FALSE cases. daily <- FALSE if (any(names(dim(x)) == time_dim)) { ## IF DAILY DATA IS PROVIDED - daily <- TRUE ## in order to arrange the res dimension, when loocv is not TRUE - k_out <- as.numeric (dim(x)[time_dim]) ## for daily, to omit the data of the corresponding year. + daily <- TRUE ## in order to arrange the res dimension of x and f after MergeDims + k_out <- as.numeric (dim(x)[time_dim]) x <- MergeDims (x, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) y <- MergeDims (y, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) if (!is.null(f)) { @@ -702,7 +700,7 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo # if the data is all NA, force return return NA if (all(is.na(tmp_df)) | (sum(apply(tmp_df, 2, function(x) !all(is.na(x)))) == 1)) { if (is.null(f)) { - res <- rep(NA, nrow(tmp_df)) + res <- array(rep(NA, nrow(tmp_df)), c(k_out, nrow(tmp_df)/k_out)) } else { if (!is.null(aux_dim)) { res <- array(NA, dim(f)[names(dim(f)) != aux_dim]) @@ -736,22 +734,12 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo res <- .pred_lm(lm1 = lm1, df = tmp_df, f = f, loocv = loocv, aux_dim = aux_dim, k_out = k_out) } - ## dimension naming - if (daily & is.null(f)) { - if (!loocv) { - dim (res) <- c(k_out, length(res)/k_out) + if (!is.null(f) & daily) { + res <- SplitDim(data = res, split_dim = sdate_dim, new_dim_name = time_dim, + indices = rep (1:k_out, dim(res)[sdate_dim] / k_out)) } - names(dim(res)) <- c(time_dim, sdate_dim) - } else if (daily & !is.null (f)) { - sdate_num <- as.numeric (dim(res)[sdate_dim])/k_out - res <- SplitDim (res, indices = rep(1:k_out, sdate_num), - new_dim_name = time_dim, split_dim = sdate_dim) - } else if (!daily & is.null(f)) { - dim(res) <- length(res) - names(dim(res)) <- sdate_dim - } ## if !daily & !is.null(f), the function already correctly assign the dimension names - - return(res) + + return(res) } #----------------------------------- @@ -792,17 +780,18 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo if (loocv) { pred_vals <- sapply(1:length(lm1), function(j) { if (all(is.na(lm1[[j]]))) { - return(rep(NA, k_out)) + return(array(rep(NA, k_out), c(k_out, nrow(df) / k_out))) } else { window <- ((j-1)*k_out+1):((j-1)*k_out + k_out) # test the daily data of the corresponding year return(predict(lm1[[j]], df[window,])) }}) + pred_vals <- array (pred_vals, c(k_out, nrow(df) / k_out)) } else { if (!is.na(lm1)) { # case to downscale hindcasts if (is.null(f)) { - pred_vals_ls <- lapply(lm1, predict, data = df) - pred_vals <- unlist(pred_vals_ls) + pred_vals_ls <- lapply(lm1, predict, newdata = df) + pred_vals <- array (unlist(pred_vals_ls), c(k_out, nrow(df) / k_out)) # case to downscale forecasts } else { if (!is.null(aux_dim)) { @@ -816,17 +805,17 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo # basic pred_vals_ls <- lapply(lm1, predict, newdata = data.frame(x = as.vector(f))) pred_vals <- unlist(pred_vals_ls) - dim(pred_vals) <- dim(f ) + dim(pred_vals) <- dim(f) } } } else { if (is.null(f)) { - pred_vals <- rep(NA, nrow(df)) + pred_vals <- array(rep(NA, k_out), c(k_out, nrow(df) / k_out)) } else { if (!is.null(aux_dim)) { pred_vals <- array(NA, dim(f)[names(dim(f)) != aux_dim]) } else { - pred_vals <- array(NA, dim(f)) + pred_vals <- array(NA, dim(f)) } } } -- GitLab From c7b8c32f7e07e5261e350c8d0e7cd2106afe1aee Mon Sep 17 00:00:00 2001 From: eduzenli Date: Sun, 21 Apr 2024 11:59:46 +0200 Subject: [PATCH 3/4] obs_cat is arranged wrt daily data --- R/LogisticReg.R | 68 +++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 36 deletions(-) diff --git a/R/LogisticReg.R b/R/LogisticReg.R index 91fde87..d42c39e 100644 --- a/R/LogisticReg.R +++ b/R/LogisticReg.R @@ -486,6 +486,20 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, # convert observations to categorical predictands + k_out <- 1 ## in case loocv = TRUE and the data is NOT daily, leave one data out. + daily <- FALSE + if ( time_dim %in% names(dim(obs_ref)) & dim(obs_ref)[time_dim] > 1) { + daily <- TRUE + sdate_num <- as.numeric (dim(obs_ref)[sdate_dim]) + k_out <- as.numeric (dim(obs_ref)[time_dim]) ## in case loocv = TRUE and the data is daily, leave one-year data out + obs_ref <- MergeDims (obs_ref, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) + predictor <- MergeDims (predictor, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) + if (!is.null(exp_cor)) { + sdate_num_fr <- as.numeric (dim(forecast)[sdate_dim]) ## sdate_num of forecast + forecast <- MergeDims (forecast, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) + } + } + obs_cat <- Apply(obs_ref, target_dims = sdate_dim, function(x) { if (!any(!is.na(x))) { rep(NA,length(x)) @@ -493,28 +507,23 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, terc <- convert2prob(as.vector(x), prob = probs_cat) as.integer(apply(terc, 1, function(r) which (r == 1)))}}, output_dims = sdate_dim, ncores = ncores)$output1 - + target_dims_predictand <- sdate_dim # Apply the logistic regressions ## case hindcast only if (is.null(exp_cor)) { - if ( time_dim %in% names(dim(predictor)) ) { - if (dim(predictor)[time_dim] > 1) { - target_dims_predictor <- c(target_dims_predictor, time_dim) - target_dims_predictand <- c(target_dims_predictand, time_dim) - } - } res <- Apply(list(predictor, obs_cat), target_dims = list(target_dims_predictor, target_dims_predictand), - fun = function(x, y) .log_reg(x = x, y = y, loocv = loocv,probs_cat = probs_cat), + fun = function(x, y) .log_reg(x = x, y = y, loocv = loocv, probs_cat = probs_cat, + sdate_dim = sdate_dim, k_out = k_out), output_dims = c(sdate_dim, "category"), ncores = ncores)$output1 - if ( time_dim %in% names(dim(predictor)) ) { - if (dim(predictor)[time_dim] > 1) { - res <- SplitDim(res, split_dim = sdate_dim, new_dim_name = time_dim, - indices = rep(1:(dim(predictor)[time_dim]), dim(predictor)[sdate_dim])) - } - } + if ( daily ) { + res <- SplitDim(res, split_dim = sdate_dim, new_dim_name = time_dim, + indices = rep(1:k_out, sdate_num)) + obs_ref <- SplitDim(obs_ref, split_dim = sdate_dim, new_dim_name = time_dim, + indices = rep(1:k_out, sdate_num)) + } if (return_most_likely_cat) { res <- Apply(res, target_dims = c(sdate_dim, "category"), .most_likely_category, @@ -534,13 +543,15 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, target_dims = list(target_dims_predictor, target_dims_predictand, target_dims_forecast), fun = function(x, y, f) .log_reg(x = x, y = y, f = f, loocv = loocv, - probs_cat = probs_cat), + probs_cat = probs_cat, sdate_dim = sdate_dim, + k_out = k_out), output_dims = c(sdate_dim, "category"), ncores = ncores)$output1 - if ( time_dim %in% names(dim(predictor)) ) { - if (dim(predictor)[time_dim] > 1) { - res <- SplitDim(res, split_dim = sdate_dim, new_dim_name = time_dim, - indices = rep(1:(dim(predictor)[time_dim]), dim(forecast)[sdate_dim])) - } + + if ( daily ) { + res <- SplitDim(res, split_dim = sdate_dim, new_dim_name = time_dim, + indices = rep(1:k_out, sdate_num_fr)) + obs_ref <- SplitDim(obs_ref, split_dim = sdate_dim, new_dim_name = time_dim, + indices = rep(1:k_out, sdate_num)) } } @@ -599,22 +610,7 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, } # atomic functions for logistic regressions -.log_reg <- function(x, y, f = NULL, loocv, probs_cat) { - - k_out <- 1 ## leave ONE out if the data is not daily. - daily <- FALSE - if (any(names(dim(x)) == time_dim)) { ## IF DAILY DATA IS PROVIDED - daily <- TRUE ## in order to arrange the res dimension, when loocv is not TRUE - k_out <- as.numeric (dim(x)[time_dim]) ## for daily, to omit the data of the corresponding year. - x <- MergeDims (x, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) - y <- MergeDims (y, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) - if (length(dim(x)) > 1) { - x <- aperm (x, c(2,1)) ## if ens_mean_sd or sorted_members, dimensions: sdate, pred or member - } - if (!is.null(f)) { - f <- MergeDims (f, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) - } - } +.log_reg <- function(x, y, f = NULL, loocv, probs_cat, sdate_dim, k_out = 1) { tmp_df <- data.frame(x = x, y = y) -- GitLab From f3a71cfc45dc805950a3c15261e3bc83a2e9b94e Mon Sep 17 00:00:00 2001 From: eduzenli Date: Sun, 21 Apr 2024 16:28:34 +0200 Subject: [PATCH 4/4] Processes with the MergeDims function are moved before the wrapper --- R/Intlr.R | 63 +++++++++++++++++++++---------------------------------- 1 file changed, 24 insertions(+), 39 deletions(-) diff --git a/R/Intlr.R b/R/Intlr.R index c7705a3..6f7e124 100644 --- a/R/Intlr.R +++ b/R/Intlr.R @@ -608,32 +608,41 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo stop(paste0(lr_method, " method is not implemented yet")) } - + + k_out <- 1 ## ## It is used in constructing window for loocv = TRUE case. k_out is also used to order the dimensions of pred.lm output for both loocv = TRUE and loocv = FALSE cases. In case the data is NOT daily, k_out is 1. + daily <- FALSE + if ( time_dim %in% names(dim(predictor)) ) { + daily <- TRUE + k_out <- as.numeric (dim(predictor)[time_dim]) ## in case loocv = TRUE and the data is daily, leave one-year data out + predictor <- MergeDims (predictor, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) + predictand <- MergeDims (predictand, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) + output_dims <- c(time_dim, sdate_dim) # for hindcast dwn + if (!is.null(exp_cor)) { + sdate_num_fr <- as.numeric (dim(forecast)[sdate_dim]) ## sdate_num of forecast + forecast <- MergeDims (forecast, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) + } + } + # Apply the linear regressions ## case hindcast - forecast if (!is.null(exp_cor)) { - if ( time_dim %in% names(dim(predictor)) ) { - target_dims_predictor <- c(target_dims_predictor, time_dim) - target_dims_predictand <- c(target_dims_predictand, time_dim) - target_dims_forecast <- c(target_dims_forecast, time_dim) - } res <- Apply(list(predictor, predictand, forecast), target_dims = list(target_dims_predictor, target_dims_predictand, target_dims_forecast), fun = .intlr, loocv = loocv, aux_dim = aux_dim, ncores = ncores, - sdate_dim = sdate_dim, time_dim = time_dim)$output1 + sdate_dim = sdate_dim, k_out = k_out)$output1 + + if (daily) { + res <- SplitDim(data = res, split_dim = sdate_dim, new_dim_name = time_dim, + indices = rep (1:k_out, sdate_num_fr)) + } } ## case hindcast only else { - if ( time_dim %in% names(dim(predictor)) ) { - output_dims <- c(time_dim, sdate_dim) - target_dims_predictor <- c(target_dims_predictor, time_dim) - target_dims_predictand <- c(target_dims_predictand, time_dim) - } res <- Apply(list(predictor, predictand), target_dims = list(target_dims_predictor, target_dims_predictand), fun = .intlr, loocv = loocv, ncores = ncores, sdate_dim = sdate_dim, - time_dim = time_dim, + k_out = k_out, output_dims = output_dims)$output1 } @@ -660,40 +669,20 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo return(res) } - #----------------------------------- # Atomic function to generate and apply the linear regressions #----------------------------------- -.intlr <- function(x, y, f = NULL, loocv, aux_dim = NULL, sdate_dim = "sdate", time_dim = "time") { +.intlr <- function(x, y, f = NULL, loocv, aux_dim = NULL, sdate_dim = "sdate", k_out = 1) { require (plyr) - k_out <- 1 ## number of daily data in a year. It is useful in constructing window for loocv = TRUE case. k_out is also used to order the dimensions of pred.lm output for both loocv = TRUE and loocv = FALSE cases. - daily <- FALSE - if (any(names(dim(x)) == time_dim)) { ## IF DAILY DATA IS PROVIDED - daily <- TRUE ## in order to arrange the res dimension of x and f after MergeDims - k_out <- as.numeric (dim(x)[time_dim]) - x <- MergeDims (x, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) - y <- MergeDims (y, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) - if (!is.null(f)) { - f <- MergeDims (f, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) - } - } - if (!is.null(f)) { if (!is.null(aux_dim)) { - if (length(dim(x)) > 1 & daily) { - x <- aperm (x, c(3,1,2)) ## if 9nn or large scale, dimensions: sdate, nn, member - f <- aperm (f, c(3,1,2)) ## if 9nn or large scale, dimensions: sdate, nn, member - } tmp_df <- data.frame(x = adply(x,.margins = 3, .id = NULL), y = y) } else { tmp_df <- data.frame(x = as.vector(x), y = y) } } else { - if (length(dim(x)) > 1 & daily) { - x <- aperm (x, c(2,1)) ## if 9nn or large scale, dimensions: sdate, nn - } tmp_df <- data.frame(x = x, y = y) } @@ -711,6 +700,7 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo } else { ## apply principle components if method is 9nn + ## TODO 9NN MIGHT GIVE ERROR FOR FORECAST CASE. REVIEW AFTER DEVELOPING FORECAST STRATEGY!! if (any(names(dim(x)) == "nn")) { PR <- prcomp(~ x.1 + x.2 + x.3 + x.4 + x.5 + x.6 + x.7 + x.8 + x.9, data = tmp_df[, -ncol(tmp_df)], @@ -734,11 +724,6 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo res <- .pred_lm(lm1 = lm1, df = tmp_df, f = f, loocv = loocv, aux_dim = aux_dim, k_out = k_out) } - if (!is.null(f) & daily) { - res <- SplitDim(data = res, split_dim = sdate_dim, new_dim_name = time_dim, - indices = rep (1:k_out, dim(res)[sdate_dim] / k_out)) - } - return(res) } -- GitLab