diff --git a/R/Intlr.R b/R/Intlr.R index 953031fe5dcbcfedc79ea3200f2c921ee92fe960..6f7e1249a54d829455b30474c630dae62ed396b6 100644 --- a/R/Intlr.R +++ b/R/Intlr.R @@ -608,21 +608,42 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo stop(paste0(lr_method, " method is not implemented yet")) } - + + k_out <- 1 ## ## It is used in constructing window for loocv = TRUE case. k_out is also used to order the dimensions of pred.lm output for both loocv = TRUE and loocv = FALSE cases. In case the data is NOT daily, k_out is 1. + daily <- FALSE + if ( time_dim %in% names(dim(predictor)) ) { + daily <- TRUE + k_out <- as.numeric (dim(predictor)[time_dim]) ## in case loocv = TRUE and the data is daily, leave one-year data out + predictor <- MergeDims (predictor, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) + predictand <- MergeDims (predictand, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) + output_dims <- c(time_dim, sdate_dim) # for hindcast dwn + if (!is.null(exp_cor)) { + sdate_num_fr <- as.numeric (dim(forecast)[sdate_dim]) ## sdate_num of forecast + forecast <- MergeDims (forecast, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) + } + } + # Apply the linear regressions - # case hindcast - forecast + ## case hindcast - forecast if (!is.null(exp_cor)) { res <- Apply(list(predictor, predictand, forecast), target_dims = list(target_dims_predictor, target_dims_predictand, target_dims_forecast), - fun = .intlr, loocv = loocv, aux_dim = aux_dim, ncores = ncores)$output1 + fun = .intlr, loocv = loocv, aux_dim = aux_dim, ncores = ncores, + sdate_dim = sdate_dim, k_out = k_out)$output1 + + if (daily) { + res <- SplitDim(data = res, split_dim = sdate_dim, new_dim_name = time_dim, + indices = rep (1:k_out, sdate_num_fr)) + } } - # case hindcast only + ## case hindcast only else { res <- Apply(list(predictor, predictand), target_dims = list(target_dims_predictor, target_dims_predictand), - fun = .intlr, loocv = loocv, ncores = ncores)$output1 - names(dim(res))[1] <- sdate_dim + fun = .intlr, loocv = loocv, ncores = ncores, sdate_dim = sdate_dim, + k_out = k_out, + output_dims = output_dims)$output1 } # restore ensemble dimension in observations if it existed originally @@ -648,11 +669,10 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo return(res) } - #----------------------------------- # Atomic function to generate and apply the linear regressions #----------------------------------- -.intlr <- function(x, y, f = NULL, loocv, aux_dim = NULL) { +.intlr <- function(x, y, f = NULL, loocv, aux_dim = NULL, sdate_dim = "sdate", k_out = 1) { require (plyr) @@ -665,10 +685,11 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo } else { tmp_df <- data.frame(x = x, y = y) } + # if the data is all NA, force return return NA if (all(is.na(tmp_df)) | (sum(apply(tmp_df, 2, function(x) !all(is.na(x)))) == 1)) { if (is.null(f)) { - res <- rep(NA, nrow(tmp_df)) + res <- array(rep(NA, nrow(tmp_df)), c(k_out, nrow(tmp_df)/k_out)) } else { if (!is.null(aux_dim)) { res <- array(NA, dim(f)[names(dim(f)) != aux_dim]) @@ -677,9 +698,9 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo } } } else { - # training ## apply principle components if method is 9nn + ## TODO 9NN MIGHT GIVE ERROR FOR FORECAST CASE. REVIEW AFTER DEVELOPING FORECAST STRATEGY!! if (any(names(dim(x)) == "nn")) { PR <- prcomp(~ x.1 + x.2 + x.3 + x.4 + x.5 + x.6 + x.7 + x.8 + x.9, data = tmp_df[, -ncol(tmp_df)], @@ -696,31 +717,34 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo colnames(tmp_df) <- c(paste0("x.",1:(ncol(tmp_df)-1)), "y") } - lm1 <- .train_lm(df = tmp_df, loocv = loocv) + # training + lm1 <- .train_lm(df = tmp_df, loocv = loocv, k_out = k_out) # prediction - res <- .pred_lm(lm1 = lm1, df = tmp_df, f = f, loocv = loocv, aux_dim = aux_dim) + res <- .pred_lm(lm1 = lm1, df = tmp_df, f = f, loocv = loocv, aux_dim = aux_dim, k_out = k_out) } - - return(res) + + return(res) } #----------------------------------- # Function to generate the linear regressions. # Returns a list #----------------------------------- -.train_lm <- function(df, loocv) { +.train_lm <- function(df, loocv, k_out = 1) { # Remove predictor columns containing only NA's - df <- df[ ,apply(as.matrix(df[,colnames(df) != 'y'],nrow(df),ncol(df)-1), 2, function(x) !all(is.na(x)))] + df <- df[ ,apply(as.matrix(df[,colnames(df) != 'y'],nrow(df),ncol(df)-1), 2, + function(x) !all(is.na(x)))] if (loocv) { - - lm1 <- lapply(1:nrow(df), function(j) { - if (all(is.na(df[-j,]$y))) { + + lm1 <- lapply(1:(nrow(df)/k_out), function(j) { + window <- ((j-1)*k_out+1):((j-1)*k_out + k_out) # omit the data of the year including corresponding day + if (all(is.na(df[-window ,]$y))) { return(NA) } else { - return(lm(df[-j,], formula = y ~ .)) + return(lm(df[-window,], formula = y ~ .)) }}) } else { @@ -733,23 +757,26 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo #----------------------------------- # Function to apply the linear regressions. #----------------------------------- -.pred_lm <- function(df, lm1, f, loocv, aux_dim) { +.pred_lm <- function(df, lm1, f, loocv, aux_dim, k_out = 1) { require (plyr) + if (loocv) { - pred_vals <- sapply(1:nrow(df), function(j) { + pred_vals <- sapply(1:length(lm1), function(j) { if (all(is.na(lm1[[j]]))) { - return(NA) + return(array(rep(NA, k_out), c(k_out, nrow(df) / k_out))) } else { - return(predict(lm1[[j]], df[j,])) + window <- ((j-1)*k_out+1):((j-1)*k_out + k_out) # test the daily data of the corresponding year + return(predict(lm1[[j]], df[window,])) }}) + pred_vals <- array (pred_vals, c(k_out, nrow(df) / k_out)) } else { if (!is.na(lm1)) { # case to downscale hindcasts if (is.null(f)) { - pred_vals_ls <- lapply(lm1, predict, data = df) - pred_vals <- unlist(pred_vals_ls) + pred_vals_ls <- lapply(lm1, predict, newdata = df) + pred_vals <- array (unlist(pred_vals_ls), c(k_out, nrow(df) / k_out)) # case to downscale forecasts } else { if (!is.null(aux_dim)) { @@ -763,17 +790,17 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo # basic pred_vals_ls <- lapply(lm1, predict, newdata = data.frame(x = as.vector(f))) pred_vals <- unlist(pred_vals_ls) - dim(pred_vals) <- dim(f ) + dim(pred_vals) <- dim(f) } } } else { if (is.null(f)) { - pred_vals <- rep(NA, nrow(df)) + pred_vals <- array(rep(NA, k_out), c(k_out, nrow(df) / k_out)) } else { if (!is.null(aux_dim)) { pred_vals <- array(NA, dim(f)[names(dim(f)) != aux_dim]) } else { - pred_vals <- array(NA, dim(f)) + pred_vals <- array(NA, dim(f)) } } } @@ -799,7 +826,7 @@ Intlr <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, obs_lo names(dim(dist_id)) <- c("nn", lon_dim) idh <- SplitDim(dist_id, split_dim = lon_dim, - indices = rep(1:(length(lons_hres)),length(lats_hres)), + indices = rep(1:(length(lats_hres)),length(lons_hres)), new_dim_name = lat_dim) idh <- aperm (idh, c(1,3,2)) # dimension: nn, lat_dim, lon_dim diff --git a/R/LogisticReg.R b/R/LogisticReg.R index f74f2ee17c181e8ebf3b6d98ad3c4f12f254d3cf..d42c39e88fa79403c9370b1f873ac463296c72b0 100644 --- a/R/LogisticReg.R +++ b/R/LogisticReg.R @@ -56,7 +56,9 @@ #'@param sdate_dim a character vector indicating the start date dimension name in the element #''data' in exp and obs. Default set to "sdate". #'@param member_dim a character vector indicating the member dimension name in the element -#''data' in exp and obs. Default set to "member". +#''data' in exp and obs. Default set to "member". +#'@param time_dim a character vector indicating the time dimension name in the element +#''data' in exp and obs. Default set to "time". #'@param region a numeric vector indicating the borders of the region defined in obs. #'It consists of four elements in this order: lonmin, lonmax, latmin, latmax. lonmin refers #'to the left border, while lonmax refers to the right border. latmin indicates the lower @@ -94,8 +96,8 @@ CST_LogisticReg <- function(exp, obs, exp_cor = NULL, target_grid, int_method = log_reg_method = "ens_mean", probs_cat = c(1/3,2/3), return_most_likely_cat = FALSE, points = NULL, method_point_interp = NULL, lat_dim = "lat", lon_dim = "lon", - sdate_dim = "sdate", member_dim = "member", region = NULL, - loocv = TRUE, ncores = NULL) { + sdate_dim = "sdate", member_dim = "member", time_dim = "time", + region = NULL, loocv = TRUE, ncores = NULL) { if (!inherits(exp,'s2dv_cube')) { stop("Parameter 'exp' must be of the class 's2dv_cube'") @@ -105,14 +107,16 @@ CST_LogisticReg <- function(exp, obs, exp_cor = NULL, target_grid, int_method = stop("Parameter 'obs' must be of the class 's2dv_cube'") } - res <- LogisticReg(exp = exp$data, obs = obs$data, exp_cor = exp_cor$data, exp_lats = exp$coords[[lat_dim]], - exp_lons = exp$coords[[lon_dim]], obs_lats = obs$coords[[lat_dim]], - obs_lons = obs$coords[[lon_dim]], target_grid = target_grid, - probs_cat = probs_cat, return_most_likely_cat = return_most_likely_cat, + res <- LogisticReg(exp = exp$data, obs = obs$data, exp_cor = exp_cor$data, + exp_lats = exp$coords[[lat_dim]], exp_lons = exp$coords[[lon_dim]], + obs_lats = obs$coords[[lat_dim]], obs_lons = obs$coords[[lon_dim]], + target_grid = target_grid, probs_cat = probs_cat, + return_most_likely_cat = return_most_likely_cat, int_method = int_method, log_reg_method = log_reg_method, points = points, method_point_interp = method_point_interp, lat_dim = lat_dim, lon_dim = lon_dim, sdate_dim = sdate_dim, member_dim = member_dim, - source_file_exp = exp$attrs$source_files[1], source_file_obs = obs$attrs$source_files[1], + time_dim = time_dim, source_file_exp = exp$attrs$source_files[1], + source_file_obs = obs$attrs$source_files[1], region = region, loocv = loocv, ncores = ncores) # Modify data, lat and lon in the origina s2dv_cube, adding the downscaled data @@ -206,7 +210,9 @@ CST_LogisticReg <- function(exp, obs, exp_cor = NULL, target_grid, int_method = #'@param sdate_dim a character vector indicating the start date dimension name in the element #''data' in exp and obs. Default set to "sdate". #'@param member_dim a character vector indicating the member dimension name in the element -#''data' in exp and obs. Default set to "member". +#''data' in exp and obs. Default set to "member". +#'@param time_dim a character vector indicating the time dimension name in the element +#''data' in exp and obs. Default set to "time". #'@param source_file_exp a character vector with a path to an example file of the exp data. #'Only needed if the downscaling is to a point location. #'@param source_file_obs a character vector with a path to an example file of the obs data. @@ -247,7 +253,7 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, target_grid, int_method = NULL, log_reg_method = "ens_mean", probs_cat = c(1/3,2/3), return_most_likely_cat = FALSE, points = NULL, method_point_interp = NULL, lat_dim = "lat", lon_dim = "lon", - sdate_dim = "sdate", member_dim = "member", + sdate_dim = "sdate", member_dim = "member", time_dim = "time", source_file_exp = NULL, source_file_obs = NULL, region = NULL, loocv = TRUE, ncores = NULL) { @@ -480,6 +486,20 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, # convert observations to categorical predictands + k_out <- 1 ## in case loocv = TRUE and the data is NOT daily, leave one data out. + daily <- FALSE + if ( time_dim %in% names(dim(obs_ref)) & dim(obs_ref)[time_dim] > 1) { + daily <- TRUE + sdate_num <- as.numeric (dim(obs_ref)[sdate_dim]) + k_out <- as.numeric (dim(obs_ref)[time_dim]) ## in case loocv = TRUE and the data is daily, leave one-year data out + obs_ref <- MergeDims (obs_ref, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) + predictor <- MergeDims (predictor, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) + if (!is.null(exp_cor)) { + sdate_num_fr <- as.numeric (dim(forecast)[sdate_dim]) ## sdate_num of forecast + forecast <- MergeDims (forecast, merge_dims = c(time_dim, sdate_dim), rename_dim = sdate_dim) + } + } + obs_cat <- Apply(obs_ref, target_dims = sdate_dim, function(x) { if (!any(!is.na(x))) { rep(NA,length(x)) @@ -487,22 +507,52 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, terc <- convert2prob(as.vector(x), prob = probs_cat) as.integer(apply(terc, 1, function(r) which (r == 1)))}}, output_dims = sdate_dim, ncores = ncores)$output1 - + + target_dims_predictand <- sdate_dim + # Apply the logistic regressions + ## case hindcast only if (is.null(exp_cor)) { - res <- Apply(list(predictor, obs_cat), target_dims = list(target_dims_predictor, sdate_dim), - fun = function(x, y) .log_reg(x = x, y = y, loocv = loocv,probs_cat = probs_cat), + res <- Apply(list(predictor, obs_cat), + target_dims = list(target_dims_predictor, target_dims_predictand), + fun = function(x, y) .log_reg(x = x, y = y, loocv = loocv, probs_cat = probs_cat, + sdate_dim = sdate_dim, k_out = k_out), output_dims = c(sdate_dim, "category"), ncores = ncores)$output1 + if ( daily ) { + res <- SplitDim(res, split_dim = sdate_dim, new_dim_name = time_dim, + indices = rep(1:k_out, sdate_num)) + obs_ref <- SplitDim(obs_ref, split_dim = sdate_dim, new_dim_name = time_dim, + indices = rep(1:k_out, sdate_num)) + } + if (return_most_likely_cat) { res <- Apply(res, target_dims = c(sdate_dim, "category"), .most_likely_category, output_dims = sdate_dim, ncores = ncores)$output1 } - } else { + } + ## case hindcast - forecast + else { + if ( time_dim %in% names(dim(predictor)) ) { + if (dim(predictor)[time_dim] > 1) { + target_dims_predictor <- c(target_dims_predictor, time_dim) + target_dims_predictand <- c(target_dims_predictand, time_dim) + target_dims_forecast <- c(target_dims_forecast, time_dim) + } + } res <- Apply(list(predictor, obs_cat, forecast), - target_dims = list(target_dims_predictor, sdate_dim, target_dims_forecast), + target_dims = list(target_dims_predictor, target_dims_predictand, + target_dims_forecast), fun = function(x, y, f) .log_reg(x = x, y = y, f = f, loocv = loocv, - probs_cat = probs_cat), + probs_cat = probs_cat, sdate_dim = sdate_dim, + k_out = k_out), output_dims = c(sdate_dim, "category"), ncores = ncores)$output1 + + if ( daily ) { + res <- SplitDim(res, split_dim = sdate_dim, new_dim_name = time_dim, + indices = rep(1:k_out, sdate_num_fr)) + obs_ref <- SplitDim(obs_ref, split_dim = sdate_dim, new_dim_name = time_dim, + indices = rep(1:k_out, sdate_num)) + } } # restore ensemble dimension in observations if it existed originally @@ -560,8 +610,8 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, } # atomic functions for logistic regressions -.log_reg <- function(x, y, f = NULL, loocv, probs_cat) { - +.log_reg <- function(x, y, f = NULL, loocv, probs_cat, sdate_dim, k_out = 1) { + tmp_df <- data.frame(x = x, y = y) # if the data is all NA, force return return NA @@ -581,10 +631,11 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, } else { # training - lm1 <- .train_lr(df = tmp_df, loocv = loocv) + lm1 <- .train_lr(df = tmp_df, loocv = loocv, k_out) # prediction - res <- .pred_lr(lm1 = lm1, df = tmp_df, f = f, loocv = loocv, probs_cat = probs_cat) + res <- .pred_lr(lm1 = lm1, df = tmp_df, f = f, loocv = loocv, + probs_cat = probs_cat, k_out = k_out) if ( !(is.null(f)) ) { ## if forecast is provided, and syear of forecast is 1. if ( nrow(f) == 1 ) { @@ -599,7 +650,7 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, #----------------------------------- # Function to train the logistic regressions. #----------------------------------- -.train_lr <- function(df, loocv) { +.train_lr <- function(df, loocv, k_out = 1) { require(nnet) @@ -608,7 +659,9 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, if (loocv) { - lm1 <- lapply(1:nrow(df), function(j) multinom(y ~ ., data = df[ -j, ], trace = FALSE)) + lm1 <- lapply(1:(nrow(df)/k_out), function(j) { + window <- ((j-1)*k_out+1):((j-1)*k_out + k_out) # omit the data of the year including corresponding day + multinom(y ~ ., data = df[ -window, ], trace = FALSE)}) } else { @@ -622,7 +675,7 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, #----------------------------------- # Function to apply the logistic regressions. #----------------------------------- -.pred_lr <- function(df, lm1, f, loocv, probs_cat) { +.pred_lr <- function(df, lm1, f, loocv, probs_cat, k_out = 1) { require(plyr) @@ -631,16 +684,18 @@ LogisticReg <- function(exp, obs, exp_cor = NULL, exp_lats, exp_lons, obs_lats, # The error: "Error: Results must have the same dimensions." can # appear when the number of sdates is insufficient - pred_vals_ls <-list() - for (j in 1:nrow(df)) { - if (all(is.na(df[j, ]))) { - pred_vals_ls[[j]] <- rep(NA, length(probs_cat) + 1) + pred_vals_ls <- list() + for (j in 1:length(lm1)) { + window <- ((j-1)*k_out+1):((j-1)*k_out + k_out) # test the daily data of the corresponding year + if (all(is.na(df[window, ]))) { + pred_vals_ls[[j]] <- array(rep(NA, length(window) * (length(probs_cat) + 1)), + dim = c(length(window), length(probs_cat) + 1)) } else { - pred_vals_ls[[j]] <- predict(lm1[[j]], df[j,], type = "probs") + pred_vals_ls[[j]] <- predict(lm1[[j]], df[window,], type = "probs") } } - pred_vals <- laply(pred_vals_ls, .fun = as.array) + pred_vals <- do.call(rbind, pred_vals_ls) if( length(probs_cat) + 1 == 2) { pred_vals_dum<-array(NA, dim=c(nrow(df), 2))