From 47167187ea9f9b306edc89f0ddd402bad3473166 Mon Sep 17 00:00:00 2001 From: eduzenli Date: Wed, 20 Nov 2024 17:08:33 +0100 Subject: [PATCH 1/2] output_dims for for the arrays without time dimension. PCA for 9nn case. BOth updated --- R/Intlr.R | 151 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 99 insertions(+), 52 deletions(-) diff --git a/R/Intlr.R b/R/Intlr.R index c0f3b9c..a1e3764 100644 --- a/R/Intlr.R +++ b/R/Intlr.R @@ -611,13 +611,13 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo k_out <- 1 ## ## It is used in constructing window for loocv = TRUE case. k_out is also used to order the dimensions of pred.lm output for both loocv = TRUE and loocv = FALSE cases. In case the data is NOT daily, k_out is 1. daily <- FALSE + output_dims <- c(time_dim, sdate_dim) # for hindcast dwn if ( time_dim %in% names(dim(predictor)) ) { daily <- TRUE k_out <- as.numeric (dim(predictor)[time_dim]) ## in case loocv = TRUE and the data is daily, leave one-year data out sdate_num <- as.numeric (dim(predictand)[sdate_dim]) ## sdate_num of hindcast predictor <- MergeDims (predictor, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) predictand <- MergeDims (predictand, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) - output_dims <- c(time_dim, sdate_dim) # for hindcast dwn if (!is.null(exp_cor)) { sdate_num_fr <- as.numeric (dim(forecast)[sdate_dim]) ## sdate_num of forecast forecast <- MergeDims (forecast, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) @@ -634,8 +634,8 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo sdate_dim = sdate_dim, k_out = k_out)$output1 if (daily) { - res <- SplitDim(data = res, split_dim = sdate_dim, new_dim_name = time_dim, - indices = rep (1:k_out, sdate_num_fr)) + res <- CSTools::SplitDim(data = res, split_dim = sdate_dim, new_dim_name = time_dim, + indices = rep (1:k_out, sdate_num_fr)) } } ## case hindcast only @@ -645,11 +645,15 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo fun = .intlr, loocv = loocv, ncores = ncores, sdate_dim = sdate_dim, k_out = k_out, output_dims = output_dims)$output1 + if (!daily) { + res <- ClimProjDiags::Subset(x = res, along = time_dim, indices = 1, drop = 'selected') + } } if (daily) { - predictand <- SplitDim(data = predictand, split_dim = sdate_dim, new_dim_name = time_dim, - indices = rep (1:k_out, sdate_num)) + predictand <- CSTools::SplitDim(data = predictand, split_dim = sdate_dim, + new_dim_name = time_dim, + indices = rep (1:k_out, sdate_num)) } # restore ensemble dimension in observations if it existed originally @@ -704,32 +708,21 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo } } } else { - - ## apply principle components if method is 9nn - ## TODO 9NN MIGHT GIVE ERROR FOR FORECAST CASE. REVIEW AFTER DEVELOPING FORECAST STRATEGY!! - if (any(names(dim(x)) == "nn")) { - PR <- prcomp(~ x.1 + x.2 + x.3 + x.4 + x.5 + x.6 + x.7 + x.8 + x.9, - data = tmp_df[, -ncol(tmp_df)], - na.action = na.omit, scale = FALSE) ## prcomp between predictors - ## 95% of variablity should be explained by the principle components - N_predictors <- which (summary(PR)$importance[ "Cumulative Proportion",] > .95)[1] - PR_mat <- PR$x[, 1:N_predictors, drop = FALSE] - if (!is.null(PR$na.action)) { ## if there are NAs, relocate them to the correct order - dum <- array(NA, dim = c(nrow(tmp_df), ncol(PR_mat) )) - dum [as.numeric(rownames(PR_mat)), ] <- PR_mat - PR_mat <- dum - } - tmp_df <- data.frame(x = PR_mat, y = y) - colnames(tmp_df) <- c(paste0("x.",1:(ncol(tmp_df)-1)), "y") - } + + # for now, PCA is TRUE only for nn. in future, it might be true for multiple var options and ls. + if (any(names(dim(x)) == "nn")) { + pca <- TRUE + } else { + pca <- FALSE + } # training - lm1 <- .train_lm(df = tmp_df, loocv = loocv, k_out = k_out) + lm1 <- .train_lm(df = tmp_df, loocv = loocv, k_out = k_out, pca = pca) # prediction - res <- .pred_lm(lm1 = lm1, df = tmp_df, f = f, loocv = loocv, aux_dim = aux_dim, k_out = k_out) + res <- .pred_lm(lm1 = lm1, df = tmp_df, f = f, loocv = loocv, aux_dim = aux_dim, + k_out = k_out, pca = pca) } - return(res) } @@ -737,60 +730,99 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo # Function to generate the linear regressions. # Returns a list #----------------------------------- -.train_lm <- function(df, loocv, k_out = 1) { +.train_lm <- function(df, loocv, k_out = 1, pca = FALSE) { # Remove predictor columns containing only NA's - df <- df[ ,apply(as.matrix(df[,colnames(df) != 'y'],nrow(df),ncol(df)-1), 2, - function(x) !all(is.na(x)))] - - if (loocv) { + df_all<- df[ ,apply(as.matrix(df[,colnames(df) != 'y'],nrow(df),ncol(df)-1), 2, + function(x) !all(is.na(x)))] - lm1 <- lapply(1:(nrow(df)/k_out), function(j) { - window <- ((j-1)*k_out+1):((j-1)*k_out + k_out) # omit the data of the year including corresponding day - if (all(is.na(df[-window ,]$y))) { + if (loocv) { + lm1 <- lapply(1:(nrow(df_all)/k_out), function(j) { + window <- ((j-1)*k_out+1):((j-1)*k_out + k_out) # omit the data of the year including corresponding day + df <- df_all[-window,] + ## apply principle components if method is 9nn + ## TODO 9NN MIGHT GIVE ERROR FOR FORECAST CASE. REVIEW AFTER DEVELOPING FORECAST STRATEGY!! + if (pca) { + PCA <- .pca(dx = df[, -ncol(df)], exp.var = 0.95, center = TRUE, scale = FALSE) + df <- data.frame(x = PCA$PR_mat, y = df[,ncol(df)]) + colnames(df) <- c(paste0("x.",1:(ncol(df)-1)), "y") + colnames(PCA$rotation) <- c(paste0("x.",1:(ncol(df)-1))) + } else { + PCA <- NULL + } + if (all(is.na(df$y))) { return(NA) } else { - return(lm(df[-window,], formula = y ~ .)) + return(list(lm = lm(df, formula = y ~ .), N_predictors = PCA$N_predictors, + rotation = PCA$rotation)) }}) } else { - - lm1 <- ifelse(all(is.na(df$y)), NA, list(lm(data = df, formula = y ~ .))) + if (pca) { + PCA <- .pca(dx = df_all[,-ncol(df_all)], exp.var = 0.95, center = TRUE, scale = FALSE) + df_all <- data.frame(x = PCA$PR_mat, y = df_all[,ncol(df_all)]) + colnames(df_all) <- c(paste0("x.",1:(ncol(df_all)-1)), "y") + colnames(PCA$rotation) <- c(paste0("x.",1:(ncol(df_all)-1))) + } else { + PCA <- NULL + } + if(all(is.na(df_all$y))) { + lm1 <- NA + } else { + lm1<- list(lm = lm(data = df_all, formula = y ~ .), N_predictors = PCA$N_predictors, + rotation = PCA$rotation) + } } - return(lm1) } #----------------------------------- # Function to apply the linear regressions. #----------------------------------- -.pred_lm <- function(df, lm1, f, loocv, aux_dim, k_out = 1) { +.pred_lm <- function(df, lm1, f, loocv, aux_dim, k_out = 1, pca = FALSE) { require (plyr) - if (loocv) { pred_vals <- sapply(1:length(lm1), function(j) { - if (all(is.na(lm1[[j]]))) { + if (all(is.na(lm1[[j]]$lm))) { return(array(rep(NA, k_out), c(k_out, nrow(df) / k_out))) } else { window <- ((j-1)*k_out+1):((j-1)*k_out + k_out) # test the daily data of the corresponding year - return(predict(lm1[[j]], df[window,])) + if (pca) { + ## transfer test data to PCA space. + pred_data <- as.matrix(df[window,-ncol(df)])%*%lm1[[j]]$rotation + pred_data <- as.data.frame (pred_data) + } else { + pred_data <- df[window,] + } + return(predict(lm1[[j]]$lm, pred_data)) }}) pred_vals <- array (pred_vals, c(k_out, nrow(df) / k_out)) } else { - if (!is.na(lm1)) { + # if lm function exist, in other words not NA + if (any(!is.na(lm1))) { # case to downscale hindcasts - if (is.null(f)) { - pred_vals_ls <- lapply(lm1, predict, newdata = df) - pred_vals <- array (unlist(pred_vals_ls), c(k_out, nrow(df) / k_out)) + if (is.null(f)) { + if (pca) { + pred_data <- as.matrix(df[,-ncol(df)])%*%lm1$rotation + pred_data <- as.data.frame(pred_data) + } else { + pred_data <- df + } + pred_vals_ls <- predict(lm1$lm, newdata = pred_data) + pred_vals <- array (pred_vals_ls, c(k_out, nrow(df) / k_out)) # case to downscale forecasts } else { if (!is.null(aux_dim)) { # 9nn & large-scale fcst_df <- adply(f,.margins = 3, .id = NULL) + if (pca) { + ## transfer test data to PCA space. + fcst_df <- as.matrix(fcst_df)%*%lm1$rotation + fcst_df <- as.data.frame (fcst_df) + } colnames(fcst_df) <- paste0('x.',1:ncol(fcst_df)) - pred_vals_ls <- lapply(lm1, predict, newdata = fcst_df) - pred_vals <- unlist(pred_vals_ls) + pred_vals <- predict(lm1$lm, newdata = fcst_df) dim(pred_vals) <- dim(f)[names(dim(f)) != aux_dim] } else { # basic @@ -811,7 +843,6 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo } } } - return(pred_vals) } @@ -831,9 +862,9 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo names(dim(dist_id)) <- c("nn", lon_dim) - idh <- SplitDim(dist_id, split_dim = lon_dim, - indices = rep(1:(length(lats_hres)),length(lons_hres)), - new_dim_name = lat_dim) + idh <- CSTools::SplitDim(dist_id, split_dim = lon_dim, + indices = rep(1:(length(lats_hres)),length(lons_hres)), + new_dim_name = lat_dim) idh <- aperm (idh, c(1,3,2)) # dimension: nn, lat_dim, lon_dim @@ -870,3 +901,19 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo return(nearest) } + +.pca <- function (dx, exp.var = 0.95, scale = FALSE, center = TRUE, ...) { + PR <- suppressWarnings(prcomp (~., dx, center = center, + na.action = na.omit, + scale = scale, ...)) ## prcomp between predictors + + N_predictors <- which (summary(PR)$importance[ "Cumulative Proportion",] > exp.var)[1] + PR_mat <- PR$x[, 1:N_predictors, drop = FALSE] + if (!is.null(PR$na.action)) { ## if there are NAs, relocate them to the correct order + dum <- array(NA, dim = c(nrow(dx), ncol(PR_mat) )) + dum [as.numeric(rownames(PR_mat)), ] <- PR_mat + PR_mat <- dum + } + return (list(PR_mat = PR_mat, N_predictors = N_predictors, + rotation = PR$rotation[,1:N_predictors])) +} -- GitLab From f8396d18d45b9ba71d9fe9fb410003fa63dff9f1 Mon Sep 17 00:00:00 2001 From: eduzenli Date: Fri, 22 Nov 2024 13:42:18 +0100 Subject: [PATCH 2/2] forecast dwn erors are corrected for large scale and 9nn --- R/Intlr.R | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/R/Intlr.R b/R/Intlr.R index a1e3764..a1a8f76 100644 --- a/R/Intlr.R +++ b/R/Intlr.R @@ -688,7 +688,7 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo if (!is.null(f)) { if (!is.null(aux_dim)) { - tmp_df <- data.frame(x = adply(x,.margins = 3, .id = NULL), y = y) + tmp_df <- data.frame(x = adply(x,.margins = 3, .id = NULL, .fun = as.matrix), y = y) } else { tmp_df <- data.frame(x = as.vector(x), y = y) } @@ -696,6 +696,8 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo tmp_df <- data.frame(x = x, y = y) } + colnames(tmp_df) <- c(paste0("x.",1:(ncol(tmp_df)-1)), "y") + # if the data is all NA, force return return NA if (all(is.na(tmp_df)) | (sum(apply(tmp_df, 2, function(x) !all(is.na(x)))) == 1)) { if (is.null(f)) { @@ -746,6 +748,7 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo PCA <- .pca(dx = df[, -ncol(df)], exp.var = 0.95, center = TRUE, scale = FALSE) df <- data.frame(x = PCA$PR_mat, y = df[,ncol(df)]) colnames(df) <- c(paste0("x.",1:(ncol(df)-1)), "y") + PCA$rotation <- as.matrix(PCA$rotation) colnames(PCA$rotation) <- c(paste0("x.",1:(ncol(df)-1))) } else { PCA <- NULL @@ -761,6 +764,7 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo PCA <- .pca(dx = df_all[,-ncol(df_all)], exp.var = 0.95, center = TRUE, scale = FALSE) df_all <- data.frame(x = PCA$PR_mat, y = df_all[,ncol(df_all)]) colnames(df_all) <- c(paste0("x.",1:(ncol(df_all)-1)), "y") + PCA$rotation <- as.matrix(PCA$rotation) colnames(PCA$rotation) <- c(paste0("x.",1:(ncol(df_all)-1))) } else { PCA <- NULL @@ -815,7 +819,9 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo } else { if (!is.null(aux_dim)) { # 9nn & large-scale - fcst_df <- adply(f,.margins = 3, .id = NULL) + # concatenate members + fcst_df <- as.data.frame(matrix(aperm(f, c(1, 3, 2)), + nrow = dim(f)[1] * dim(f)[3], ncol = dim(f)[2])) if (pca) { ## transfer test data to PCA space. fcst_df <- as.matrix(fcst_df)%*%lm1$rotation @@ -826,8 +832,7 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo dim(pred_vals) <- dim(f)[names(dim(f)) != aux_dim] } else { # basic - pred_vals_ls <- lapply(lm1, predict, newdata = data.frame(x = as.vector(f))) - pred_vals <- unlist(pred_vals_ls) + pred_vals <- predict(lm1$lm, newdata = data.frame(x.1 = as.vector(f))) dim(pred_vals) <- dim(f) } } -- GitLab