diff --git a/.Rbuildignore b/.Rbuildignore
index 834c4fa124507946a4eb8131288fe93d2ed7cf00..5493f6df42b306fd00555b902681e895fd4c2925 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -1,5 +1,10 @@
-.git
-.gitignore
-.tar.gz
-.pdf
-./.nc
+.*\.git$
+.*\.gitignore$
+.*\.tar\.gz$
+.*\.pdf$
+^(?!inst).*\.nc$
+sample_data
+README\.Rmd$
+README\.md$
+\..*\.RData$
+vignettes
diff --git a/.gitignore b/.gitignore
index 591c822411801ca32f1fd399c37977ec724b13ad..26a2200ce258f2658c2328e321b49be5d8935268 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,5 +12,7 @@
 checkout_output.txt
 merge_output.txt
 master_pull.txt
 *.eps
+README.Rmd
 *.ps
 Rplots.pdf
+.nfs*
diff --git a/DESCRIPTION b/DESCRIPTION
index bf75372b7692fe229b72764520e5d7c50c77d0d9..f77ce597e30b6bd72d90a0934d1b2e88a221a59f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: s2dverification
 Title: Set of Common Tools for Forecast Verification
-Version: 2.8.3
+Version: 2.8.5
 Authors@R: c(
     person("BSC-CNS", role = c("aut", "cph")),
     person("Virginie", "Guemas", , "virginie.guemas@bsc.es", role = "aut"),
@@ -50,3 +50,4 @@
 URL: https://earth.bsc.es/gitlab/es/s2dverification/wikis/home
 BugReports: https://earth.bsc.es/gitlab/es/s2dverification/issues
 LazyData: true
 SystemRequirements: cdo
+Encoding: UTF-8
diff --git a/R/CDORemap.R b/R/CDORemap.R
index 55f12904cc86274ee2ba8b81424543d636233c2e..88c2cd42187cf84d6b1d99bd58ffd6b10554ddc3 100644
--- a/R/CDORemap.R
+++ b/R/CDORemap.R
@@ -60,7 +60,7 @@ CDORemap <- function(data_array = NULL, lons, lats, grid, method,
       }
     }
     names(array_dims) <- c(new_lat_dim_name, new_lon_dim_name)
-    data_array <- array(NA, array_dims)
+    data_array <- array(as.numeric(NA), array_dims)
   }
   if (!(is.logical(data_array) || is.numeric(data_array)) || !is.array(data_array)) {
     stop("Parameter 'data_array' must be a numeric array.")
@@ -398,6 +398,7 @@ CDORemap <- function(data_array = NULL, lons, lats, grid, method,
   cdo_version <- as.numeric_version(
     strsplit(suppressWarnings(system2("cdo", args = '-V', stderr = TRUE))[[1]], ' ')[[1]][5]
   )
+  warning("CDORemap: Using CDO version ", cdo_version, ".")
   if ((cdo_version >= as.numeric_version('1.7.0')) && (method == 'con')) {
     method <- 'ycon'
   }
@@ -433,12 +434,13 @@ CDORemap <- function(data_array = NULL, lons, lats, grid, method,
   if (length(other_dims) > 1 || (length(other_dims) > 0 && (is_irregular))) {
     if (!(length(dim(data_array)) %in% other_dims)) {
       if (avoid_writes || is_irregular) {
-        dim_to_move <- max(other_dims)
+        dims_mod <- dim(data_array)
+        dims_mod[which(names(dim(data_array)) %in%
+                       c(lon_dim, lat_dim))] <- 0
+        dim_to_move <- which.max(dims_mod)
         permutation <- (1:length(dim(data_array)))[-dim_to_move]
         permutation <- c(permutation, dim_to_move)
-        permutation_back <- 1:length(dim(data_array))
-        permutation_back[dim_to_move] <- length(dim(data_array))
-        permutation_back[length(dim(data_array))] <- dim_to_move
+        permutation_back <- sort(permutation, index.return = TRUE)$ix
         dim_backup <- dim(data_array)
         data_array <- aperm(data_array, permutation)
         dim(data_array) <- dim_backup[permutation]
@@ -460,6 +462,7 @@ CDORemap <- function(data_array = NULL, lons, lats, grid, method,
     }
     if ((other_dims_per_chunk > 1) || (other_dims_per_chunk > 0 && is_irregular)) {
       unlimited_dim <- tail(sort(tail(other_dims_ordered_by_size, other_dims_per_chunk)), 1)
+      #unlimited_dim <- tail(other_dims)
     }
   }
@@ -498,6 +501,12 @@ CDORemap <- function(data_array = NULL, lons, lats, grid, method,
     }
     names(attr(lons, 'variables')) <- lon_var_name
     names(attr(lats, 'variables')) <- lat_var_name
+    if
(!is.null(attr(lons, 'variables')[[1]][['dim']])) { + attr(lons, 'variables')[[1]][['dim']] <- NULL + } + if (!is.null(attr(lats, 'variables')[[1]][['dim']])) { + attr(lats, 'variables')[[1]][['dim']] <- NULL + } lons_lats_taken <- FALSE for (i in 1:total_slices) { tmp_file <- tempfile('R_CDORemap_', write_dir, fileext = '.nc') @@ -533,8 +542,33 @@ CDORemap <- function(data_array = NULL, lons, lats, grid, method, found_lat_dim <- found_dim_names[which(found_dim_names %in% .KnownLatNames())[1]] found_lon_dim_size <- length(ncdf_remapped$dim[[found_lon_dim]]$vals) found_lat_dim_size <- length(ncdf_remapped$dim[[found_lat_dim]]$vals) - found_lons <- ncvar_get(ncdf_remapped, 'lon', collapse_degen = FALSE) - found_lats <- ncvar_get(ncdf_remapped, 'lat', collapse_degen = FALSE) + found_var_names <- names(ncdf_remapped$var) + found_lon_var_name <- which(found_var_names %in% .KnownLonNames()) + found_lat_var_name <- which(found_var_names %in% .KnownLatNames()) + if (length(found_lon_var_name) > 0) { + found_lon_var_name <- found_var_names[found_lon_var_name[1]] + } else { + found_lon_var_name <- NULL + } + if (length(found_lat_var_name) > 0) { + found_lat_var_name <- found_var_names[found_lat_var_name[1]] + } else { + found_lat_var_name <- NULL + } + if (length(found_lon_var_name) > 0) { + found_lons <- ncvar_get(ncdf_remapped, found_lon_var_name, + collapse_degen = FALSE) + } else { + found_lons <- ncdf_remapped$dim[[found_lon_dim]]$vals + dim(found_lons) <- found_lon_dim_size + } + if (length(found_lat_var_name) > 0) { + found_lats <- ncvar_get(ncdf_remapped, found_lat_var_name, + collapse_degen = FALSE) + } else { + found_lats <- ncdf_remapped$dim[[found_lat_dim]]$vals + dim(found_lats) <- found_lat_dim_size + } if (length(dim(lons)) == length(dim(found_lons))) { new_lon_name <- lon_dim } else { diff --git a/R/Clim.R b/R/Clim.R index 615f845a898c00a72e1a7c6ab2c7191327818b73..4fdfa038f9ef4b5396870173f9d35346a914a9eb 100644 --- a/R/Clim.R +++ b/R/Clim.R @@ -47,8 +47,8 @@ Clim <- function(var_exp, var_obs, memb = TRUE, kharin = FALSE, NDV = FALSE) { trend_obs <- array(dim = dim(var_exp)) trend_exp <- array(dim = dim(var_exp)) for (jdate in 1:dimexp[3]) { - trend_exp[, , jdate, , , , ] <- tmp_exp$trend[, , 4, , , , ] - + jdate * tmp_exp$trend[, , 2, , , , ] + trend_exp[, , jdate, , , , ] <- tmp_exp$trend[, , 4, , , , ] + + jdate * tmp_exp$trend[, , 2, , , , ] tmp_obs2 <- MeanListDim(tmp_obs$trend,c(2, 1)) trend_obs[, , jdate, , , , ] <- InsertDim(InsertDim(tmp_obs2[4, , , , ] + jdate * tmp_obs2[2, , , , ], 1, dimexp[1]), @@ -66,13 +66,13 @@ Clim <- function(var_exp, var_obs, memb = TRUE, kharin = FALSE, NDV = FALSE) { reg_obs <- array(dim = dim(var_exp)) reg_exp <- array(dim = dim(var_exp)) for (jdate in 1:dimexp[3]) { - reg_exp[, , jdate, , , , ] <- tmp_exp$regression[, , 4, , , , ] - + iniexp[, , jdate, , , , ] * + reg_exp[, , jdate, , , , ] <- tmp_exp$regression[, , 4, , , , ] + + iniexp[, , jdate, , , , ] * tmp_exp$regression[, , 2, , , , ] tmp_obs2 <- MeanListDim(tmp_obs$regression,c(2, 1)) - reg_obs[, , jdate, , , , ] <- InsertDim(InsertDim(tmp_obs2[4, , , , ] - + MeanListDim(iniobs,c(2, 1))[jdate, , , , ] - * tmp_obs2[2, , , , ], 1, dimexp[1]), + reg_obs[, , jdate, , , , ] <- InsertDim(InsertDim(tmp_obs2[4, , , , ] + + MeanListDim(iniobs,c(2, 1))[jdate, , , , ] * + tmp_obs2[2, , , , ], 1, dimexp[1]), 2, dimexp[2]) } out_clim_exp <- reg_exp - reg_obs + InsertDim(InsertDim(InsertDim( diff --git a/R/ColorBar.R b/R/ColorBar.R index 
4f92a2a159e9ba504a14a955270f240e338a95be..d47c3103a25f16c8054d649e72c74b0c6109fc24 100644 --- a/R/ColorBar.R +++ b/R/ColorBar.R @@ -223,6 +223,10 @@ ColorBar <- function(brks = NULL, cols = NULL, vertical = TRUE, stop("Parameter 'subsampleg' must be numeric.") } subsampleg <- round(subsampleg) + draw_labels <- TRUE + if ((subsampleg) < 1) { + draw_labels <- FALSE + } # Check plot if (!is.logical(plot)) { @@ -433,7 +437,11 @@ ColorBar <- function(brks = NULL, cols = NULL, vertical = TRUE, labels <- c(labels, extra_labels) tick_reorder <- sort(at, index.return = TRUE) at <- tick_reorder$x - labels <- labels[tick_reorder$ix] + if (draw_labels) { + labels <- labels[tick_reorder$ix] + } else { + labels <- FALSE + } axis(d, at = at, tick = draw_ticks, labels = labels, cex.axis = cex_labels, tcl = cex_ticks) par(saved_pars) } diff --git a/R/Filter.R b/R/Filter.R index f924096881542177710e8d70e125ad6b0df9b4e6..bb5eba31f8848f1b7d566a5ca4847b12c1838f21 100644 --- a/R/Filter.R +++ b/R/Filter.R @@ -10,7 +10,7 @@ Filter <- function(xdata, freq) { for (phase in seq(0, pi, (pi / (10 * fac2)))) { xtest <- cos(phase + c(1:ndat) * jfreq * 2 * pi) test <- lm(xdata[is.na(xdata) == FALSE] ~ xtest[ - is.na(xdata) == FALSE])$fitted.value + is.na(xdata) == FALSE])$fitted.values if (sum(test ^ 2) > maxi) { endphase <- phase endfreq <- jfreq @@ -21,7 +21,7 @@ Filter <- function(xdata, freq) { xend <- cos(endphase + c(1:ndat) * endfreq * 2 * pi) xdata[is.na(xdata) == FALSE] <- xdata[is.na(xdata) == FALSE] - lm( xdata[is.na(xdata) == FALSE] ~ xend[is.na(xdata) == FALSE] - )$fitted.value + )$fitted.values xdata } diff --git a/R/Load.R b/R/Load.R index 056496618d08c063c1812c96d23cbd74d1a45ee6..5cebec71106778ac3e1205aa6f432f332e552853 100644 --- a/R/Load.R +++ b/R/Load.R @@ -18,38 +18,38 @@ Load <- function(var, exp = NULL, obs = NULL, sdates, nmember = NULL, } load_parameters <- lapply(parameter_names, get, envir = environment()) names(load_parameters) <- parameter_names - parameters_to_show <- c('var', 'exp', 'obs', 'sdates', 'grid', 'output', 'storefreq') + parameters_to_show <- c('var', 'exp', 'obs', 'sdates', 'nmember', 'leadtimemin', + 'leadtimemax', 'latmin', 'latmax', 'lonmin', 'lonmax', + 'output', 'grid', 'storefreq') load_parameters <- c(load_parameters[parameters_to_show], load_parameters[-match(parameters_to_show, names(load_parameters))]) - message(paste("* The load call you issued is:\n* Load(", + if (!silent) { + message(paste("* The load call you issued is:\n* Load(", paste(strwrap( paste(unlist(lapply(names(load_parameters[1:length(parameters_to_show)]), function(x) paste(x, '=', if (x == 'sdates' && length(load_parameters[[x]]) > 4) { paste0("c('", load_parameters[[x]][1], "', '", load_parameters[[x]][2], "', ..., '", tail(load_parameters[[x]], 1), "')") + } else if ((x %in% c('exp', 'obs')) && is.list(load_parameters[[x]])) { + paste0("list(", paste(unlist(lapply(load_parameters[[x]], + function (y) { + paste0("list(", + if ('name' %in% names(y)) { + paste0('name = "', y[['name']], '", ...') + } else { + "..." + }, ")" + ) + })), collapse = ', '), + ")") + # Print a stamp of the call the user issued. 
} else { paste(deparse(load_parameters[[x]]), collapse = '') }))), collapse = ', '), width = getOption('width') - 9, indent = 0, exdent = 8), collapse = '\n*'), ", ...)\n* See the full call in '$load_parameters' after Load() finishes.", sep = '')) - # .message("* The load call you issued is:") - # .message("* Load(") - # .message( - # strwrap( - # paste( - # unlist(lapply(names(load_parameters[1:length(parameters_to_show)]), - # function(x) paste(x, '=', - # if (x == 'sdates' && length(load_parameters[[x]]) > 4) { - # paste0("c('", load_parameters[[x]][1], "', '", load_parameters[[x]][2], - # "', ..., '", tail(load_parameters[[x]], 1), "')") - # } else { - # paste(deparse(load_parameters[[x]]), collapse = '') - # }))), - # collapse = ', '), - # width = getOption('width') - 9, indent = 0, exdent = 8)) - # .message("* , ...)") - # .message("* See the full call in '$load_parameters' after Load() finishes.") + } # Run Load() error-aware, so that it always returns something errors <- try({ @@ -379,8 +379,10 @@ Load <- function(var, exp = NULL, obs = NULL, sdates, nmember = NULL, replace_globs <- path_glob_permissive %in% c('no', 'partial') # If not all data has been provided in 'exp' and 'obs', configuration file is read. - if (length(exps_to_fetch) > 0 || length(obs_to_fetch) > 0) { - .message("Some 'path's not explicitly provided in 'exp' and 'obs', so will now proceed to open the configuration file.") + if ((length(exps_to_fetch) > 0 || length(obs_to_fetch) > 0)) { + if (!silent) { + .message("Some 'path's not explicitly provided in 'exp' and 'obs', so will now proceed to open the configuration file.") + } data_info <- ConfigFileOpen(configfile, silent, TRUE) # Check that the var, exp and obs parameters are right and keep the entries diff --git a/R/Subset.R b/R/Subset.R index a084a1c2ff3526c460bdcc87e8741cfa355d0386..c82b0c67f6804ab2a3ce9127ede530a6fbe4cbb2 100644 --- a/R/Subset.R +++ b/R/Subset.R @@ -78,11 +78,15 @@ Subset <- function(x, along, indices, drop = FALSE) { metadata[['dim']] <- metadata[['dim']][-dims_to_drop] if (is.character(dim_names)) { names(metadata[['dim']]) <- dim_names[-dims_to_drop] - metadata[['dimensions']] <- dim_names[-dims_to_drop] + if ('dimensions' %in% names(attributes(x))) { + metadata[['dimensions']] <- dim_names[-dims_to_drop] + } } } else if (is.character(dim_names)) { names(metadata[['dim']]) <- dim_names - metadata[['dimensions']] <- dim_names + if ('dimensions' %in% names(attributes(x))) { + metadata[['dimensions']] <- dim_names + } } attributes(subset) <- metadata subset diff --git a/R/Utils.R b/R/Utils.R index 603d47163f48771ae6310b7e27e13666e98939dd..4460018139d3bf55bfd8cc3f9b1b2a4f6f17f850 100644 --- a/R/Utils.R +++ b/R/Utils.R @@ -733,16 +733,16 @@ if (fnc$var[[namevar]]$hasScaleFact || fnc$var[[namevar]]$hasAddOffset) { tmp <- (tmp - add_offset) / scale_factor } - nc_close(fnc) - fnc <- nc_create(filein2, list(ncdf_var)) - ncvar_put(fnc, ncdf_var, tmp) + #nc_close(fnc) + fnc2 <- nc_create(filein2, list(ncdf_var)) + ncvar_put(fnc2, ncdf_var, tmp) if (add_offset != 0) { - ncatt_put(fnc, ncdf_var, 'add_offset', add_offset) + ncatt_put(fnc2, ncdf_var, 'add_offset', add_offset) } if (scale_factor != 1) { - ncatt_put(fnc, ncdf_var, 'scale_factor', scale_factor) + ncatt_put(fnc2, ncdf_var, 'scale_factor', scale_factor) } - nc_close(fnc) + nc_close(fnc2) system(paste0("cdo -s -sellonlatbox,", if (lonmin > lonmax) { "0,360," } else { @@ -751,27 +751,27 @@ " -remap", work_piece[['remap']], ",", common_grid_name, " ", filein2, " ", filein, " 2>/dev/null", sep 
= "")) file.remove(filein2) - fnc <- nc_open(filein) - lon <- ncvar_get(fnc, 'lon') - lat <- ncvar_get(fnc, 'lat') + fnc2 <- nc_open(filein) + sub_lon <- ncvar_get(fnc2, 'lon') + sub_lat <- ncvar_get(fnc2, 'lat') ## We read the longitudes and latitudes from the file. ## In principle cdo should put in order the longitudes ## and slice them properly unless data is across greenwich - lon[which(lon < 0)] <- lon[which(lon < 0)] + 360 - lon_indices <- 1:length(lon) + sub_lon[which(sub_lon < 0)] <- sub_lon[which(sub_lon < 0)] + 360 + sub_lon_indices <- 1:length(sub_lon) if (lonmax < lonmin) { - lon_indices <- lon_indices[which((lon <= lonmax) | (lon >= lonmin))] + sub_lon_indices <- sub_lon_indices[which((sub_lon <= lonmax) | (sub_lon >= lonmin))] } - lat_indices <- 1:length(lat) + sub_lat_indices <- 1:length(sub_lat) ## In principle cdo should put in order the latitudes - if (lat[1] < lat[length(lat)]) { - lat_indices <- length(lat):1 + if (sub_lat[1] < sub_lat[length(sub_lat)]) { + sub_lat_indices <- length(sub_lat):1 } - final_dims[c(1, 2)] <- c(length(lon_indices), length(lat_indices)) - subset_indices[[dim_matches[1]]] <- lon_indices - subset_indices[[dim_matches[2]]] <- lat_indices + final_dims[c(1, 2)] <- c(length(sub_lon_indices), length(sub_lat_indices)) + subset_indices[[dim_matches[1]]] <- sub_lon_indices + subset_indices[[dim_matches[2]]] <- sub_lat_indices - tmp <- take(ncvar_get(fnc, namevar, collapse_degen = FALSE), + tmp <- take(ncvar_get(fnc2, namevar, collapse_degen = FALSE), 1:length(subset_indices), subset_indices) if (!is.null(mask)) { @@ -802,13 +802,13 @@ mask_lon_indices <- which((mask_lons >= lonmin) | (mask_lons <= lonmax)) } mask_lat_indices <- which((mask_lats >= latmin) & (mask_lats <= latmax)) - if (lat[1] < lat[length(lat)]) { + if (sub_lat[1] < sub_lat[length(sub_lat)]) { mask_lat_indices <- mask_lat_indices[length(mask_lat_indices):1] } mask <- mask[mask_lon_indices, mask_lat_indices] } - lon <- lon[lon_indices] - lat <- lat[lat_indices] + sub_lon <- sub_lon[sub_lon_indices] + sub_lat <- sub_lat[sub_lat_indices] ### nc_close(fnc_mask) ### system(paste0("cdo -s -sellonlatbox,", if (lonmin > lonmax) { ### "0,360," @@ -834,6 +834,13 @@ dim(tmp) <- final_dims # If we are exploring the file we don't need to process and arrange # the retrieved data. We only need to keep the dimension sizes. 
+ if (is_2d_var && lonlat_subsetting_requested && remap_needed) { + final_lons <- sub_lon + final_lats <- sub_lat + } else { + final_lons <- lon + final_lats <- lat + } if (explore_dims) { if (work_piece[['is_file_per_member']]) { ## TODO: When the exp_full_path contains asterisks and is file_per_member @@ -846,7 +853,7 @@ nmemb <- length(files) } } - dims <- list(member = nmemb, ftime = nltime, lon = lon, lat = lat) + dims <- list(member = nmemb, ftime = nltime, lon = final_lons, lat = final_lats) } else { # If we are not exploring, then we have to process the retrieved data if (is_2d_var) { @@ -863,13 +870,13 @@ } if (output == 'areave' || output == 'lon') { - weights <- InsertDim(cos(lat * pi / 180), 1, length(lon)) + weights <- InsertDim(cos(final_lats * pi / 180), 1, length(final_lons)) weights[which(is.na(x))] <- NA if (output == 'areave') { weights <- weights / mean(weights, na.rm = TRUE) mean(x * weights, na.rm = TRUE) } else { - weights <- weights / InsertDim(Mean1Dim(weights, 2, narm = TRUE), 2, length(lat)) + weights <- weights / InsertDim(Mean1Dim(weights, 2, narm = TRUE), 2, length(final_lats)) Mean1Dim(x * weights, 2, narm = TRUE) } } else if (output == 'lat') { @@ -1511,11 +1518,20 @@ # Function to permute arrays of non-atomic elements (e.g. POSIXct) .aperm2 <- function(x, new_order) { - y <- array(1:length(x), dim = dim(x)) - y <- aperm(y, new_order) old_dims <- dim(x) - x <- x[as.vector(y)] + attr_bk <- attributes(x) + if ('dim' %in% names(attr_bk)) { + attr_bk[['dim']] <- NULL + } + if (is.numeric(x)) { + x <- aperm(x, new_order) + } else { + y <- array(1:length(x), dim = dim(x)) + y <- aperm(y, new_order) + x <- x[as.vector(y)] + } dim(x) <- old_dims[new_order] + attributes(x) <- c(attributes(x), attr_bk) x } @@ -1523,6 +1539,11 @@ # It expects as inputs two named numeric vectors, and it extends them # with dimensions of length 1 until an ordered common dimension # format is reached. +# The first output is dims1 extended with 1s. +# The second output is dims2 extended with 1s. +# The third output is a merged dimension vector. If dimensions with +# the same name are found in the two inputs, and they have a different +# length, the maximum is taken. 
.MergeArrayDims <- function(dims1, dims2) { new_dims1 <- c() new_dims2 <- c() @@ -1550,7 +1571,7 @@ new_dims1 <- c(new_dims1, dims_to_add) new_dims2 <- c(new_dims2, dims2) } - list(new_dims1, new_dims2) + list(new_dims1, new_dims2, pmax(new_dims1, new_dims2)) } # This function takes two named arrays and merges them, filling with @@ -1565,33 +1586,41 @@ # 'a' 'b' 'c' 'e' 'd' 'f' 'g' # 2 4 3 7 5 9 11 .MergeArrays <- function(array1, array2, along) { - if (!(identical(names(dim(array1)), names(dim(array2))) && - identical(dim(array1)[-which(names(dim(array1)) == along)], - dim(array2)[-which(names(dim(array2)) == along)]))) { - new_dims <- .MergeArrayDims(dim(array1), dim(array2)) - dim(array1) <- new_dims[[1]] - dim(array2) <- new_dims[[2]] - for (j in 1:length(dim(array1))) { - if (names(dim(array1))[j] != along) { - if (dim(array1)[j] != dim(array2)[j]) { - if (which.max(c(dim(array1)[j], dim(array2)[j])) == 1) { - na_array_dims <- dim(array2) - na_array_dims[j] <- dim(array1)[j] - dim(array2)[j] - na_array <- array(dim = na_array_dims) - array2 <- abind(array2, na_array, along = j) - names(dim(array2)) <- names(na_array_dims) - } else { - na_array_dims <- dim(array1) - na_array_dims[j] <- dim(array2)[j] - dim(array1)[j] - na_array <- array(dim = na_array_dims) - array1 <- abind(array1, na_array, along = j) - names(dim(array1)) <- names(na_array_dims) + if (!(is.null(array1) || is.null(array2))) { + if (!(identical(names(dim(array1)), names(dim(array2))) && + identical(dim(array1)[-which(names(dim(array1)) == along)], + dim(array2)[-which(names(dim(array2)) == along)]))) { + new_dims <- .MergeArrayDims(dim(array1), dim(array2)) + dim(array1) <- new_dims[[1]] + dim(array2) <- new_dims[[2]] + for (j in 1:length(dim(array1))) { + if (names(dim(array1))[j] != along) { + if (dim(array1)[j] != dim(array2)[j]) { + if (which.max(c(dim(array1)[j], dim(array2)[j])) == 1) { + na_array_dims <- dim(array2) + na_array_dims[j] <- dim(array1)[j] - dim(array2)[j] + na_array <- array(dim = na_array_dims) + array2 <- abind(array2, na_array, along = j) + names(dim(array2)) <- names(na_array_dims) + } else { + na_array_dims <- dim(array1) + na_array_dims[j] <- dim(array2)[j] - dim(array1)[j] + na_array <- array(dim = na_array_dims) + array1 <- abind(array1, na_array, along = j) + names(dim(array1)) <- names(na_array_dims) + } } } } } + if (!(along %in% names(dim(array2)))) { + stop("The dimension specified in 'along' is not present in the ", + "provided arrays.") + } + array1 <- abind(array1, array2, along = which(names(dim(array1)) == along)) + names(dim(array1)) <- names(dim(array2)) + } else if (is.null(array1)) { + array1 <- array2 } - array1 <- abind(array1, array2, along = which(names(dim(array1)) == along)) - names(dim(array1)) <- names(dim(array2)) array1 } diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a285963e0e2b8c7471201a2f0bcda284c9c8afae --- /dev/null +++ b/README.md @@ -0,0 +1,118 @@ +s2dverification +=============== + +s2dverification (seasonal to decadal verification) is an R framework +that aids in the analysis of forecasts from the data retrieval stage, +through computation of statistics and skill scores against observations, +to visualisation of data and results. While some of its components are +only targeted to verification of seasonal to decadal climate forecasts +it provides tools that can be useful for verification of forecasts +in any field. 
+
+A review of this package, including examples, has been published in the
+Environmental Modelling & Software journal.
+
+> Manubens, N., L.-P. Caron, A. Hunter, O. Bellprat, E. Exarchou, N.S. Fučkar, J. Garcia-Serrano, F. Massonnet, M. Ménégoz, V. Sicardi, L. Batté, C. Prodhomme, V. Torralba, N. Cortesi, O. Mula-Valls, K. Serradell, V. Guemas, F.J. Doblas-Reyes (2018). An R Package for Climate Forecast Verification. Environmental Modelling & Software, 103, 29-42, doi:10.1016/j.envsoft.2018.01.018
+
+Find out more in the overview below, on the wiki page at
+<https://earth.bsc.es/gitlab/es/s2dverification/wikis/home> or on the
+CRAN website at <https://CRAN.R-project.org/package=s2dverification>.
+
+You can also sign up to the s2dverification mailing list by sending a
+message with the subject 'subscribe' to
+if you want to keep abreast of internal discussions or latest development
+releases.
+
+Installation
+------------
+
+s2dverification has a system dependency, the CDO libraries, for
+interpolation of grid data and retrieval of metadata. Make sure you have
+these libraries installed in the system or download and install from
+<https://code.mpimet.mpg.de/projects/cdo>.
+
+You can then install the publicly released version of s2dverification from CRAN:
+```r
+install.packages("s2dverification")
+```
+Or the development version from the GitLab repository:
+```r
+# install.packages("devtools")
+devtools::install_git("https://earth.bsc.es/gitlab/es/s2dverification.git")
+```
+Overview
+--------
+
+The following diagram depicts the modules of s2dverification and how
+they interact:
+
+
+
+The [**Data retrieval**](vignettes/data_retrieval.md)
+module allows you to gather and homogenize NetCDF data files from forecasts,
+hindcasts or observations stored in a local or remote file system. Some simple
+previous steps are required, however, to set up some configuration parameters
+so that the module can locate the source files and recognize the variables of
+interest.
+
+Once the data has been loaded into an R object,
+[**Statistics**](vignettes/statistics.md) can be computed, such as
+drift-corrected anomalies, trend removal, frequency filtering and more.
+
+Either after computing statistics or directly from the original data, the
+functions in the
+[**Verification**](vignettes/verification.md)
+module allow you to compute deterministic and probabilistic scores and
+skill scores such as the root mean square error, time or spatial correlation or
+Brier score, with reliability indicators such as p-values and confidence
+intervals.
+
+[**Visualisation**](vignettes/visualisation.md)
+functions are also provided to plot the results obtained from any of the
+modules above.
+
+If it's your first time using s2dverification you can check an
+[**Example**](vignettes/example.md)
+of use spanning its four modules or review the
+[**Tutorial**](vignettes/tutorial.md).
+You will find more detailed examples in the documentation page of
+each module.
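+
+As a minimal sketch of how the four modules chain together (the `Load()`
+call is abridged here; `exp`, `obs` and `startDates` are placeholders to
+be defined as in `?Load` or in the Data retrieval vignette):
+```r
+library(s2dverification)
+# Data retrieval
+data <- Load('tos', list(exp), list(obs), startDates, output = 'areave')
+# Statistics: climatologies and anomalies
+clim <- Clim(data$mod, data$obs)
+ano_exp <- Ano(data$mod, clim$clim_exp)
+ano_obs <- Ano(data$obs, clim$clim_obs)
+# Verification: correlation of the ensemble mean against the observations
+corr <- Corr(Mean1Dim(ano_exp, 2), Mean1Dim(ano_obs, 2))
+# Visualisation
+PlotAno(ano_exp, ano_obs, startDates)
+```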
+ +You can also check the examples of usage of each function after attaching the +package as follows: +```r +ls('package:s2dverification') +## [1] "ACC" "Alpha" +## [3] "Ano" "Ano_CrossValid" +## [5] "Clim" "ColorBar" +## [7] "ConfigAddEntry" "ConfigApplyMatchingEntries" +## [9] "ConfigEditDefinition" "ConfigEditEntry" +## [11] "ConfigFileCreate" "ConfigFileOpen" +## [13] "ConfigFileSave" "ConfigRemoveDefinition" +## [15] "ConfigRemoveEntry" "ConfigShowDefinitions" +## [17] "ConfigShowSimilarEntries" "ConfigShowTable" +## [19] "Consist_Trend" "Corr" +## [21] "CRPS" "Enlarge" +## [23] "Eno" "EnoNew" +## [25] "Filter" "FitAcfCoef" +## [27] "FitAutocor" "GenSeries" +## [29] "Histo2Hindcast" "IniListDims" +## [31] "InsertDim" "LeapYear" +## [33] "Load" "Mean1Dim" +## [35] "MeanListDim" "Plot2VarsVsLTime" +## [37] "PlotACC" "PlotAno" +## [39] "PlotClim" "PlotEquiMap" +## [41] "PlotSection" "PlotStereoMap" +## [43] "PlotVsLTime" "ProbBins" +## [45] "RatioRMS" "RatioSDRMS" +## [47] "Regression" "RMS" +## [49] "RMSSS" "sampleDepthData" +## [51] "sampleMap" "sampleTimeSeries" +## [53] "Season" "SelIndices" +## [55] "Smoothing" "Spectrum" +## [57] "Spread" "Trend" +``` +```r +?FunctionName +``` diff --git a/inst/doc/plot_maps.R b/inst/doc/plot_maps.R index 0393f833e843a66cc13e2654c248dce663f04f40..cfe55d96e8c271efeb1b7a1723c2b9a0e0b5bef1 100755 --- a/inst/doc/plot_maps.R +++ b/inst/doc/plot_maps.R @@ -26,9 +26,9 @@ year0 <- 1960 # first start date yearf <- 2005 # last start date intsdate <- 5 # interval between start dates runmeanlen <- 12 # length of the window for running mean (in months) -members <- list(19900101 = c('r1i1p1')) +members <- list('19900101' = c('r1i1p1')) #PUT IN ORDER. NONE CAN BE MISSING -chunks <- list(19900101 = c('199001-199001', '199002-199002', '199003-199003')) +chunks <- list('19900101' = c('199001-199001', '199002-199002', '199003-199003')) diff --git a/inst/doc/plot_timeseries.R b/inst/doc/plot_timeseries.R index cf92ec0489fd6b663cc1fe24bf8a1c03ef092ca3..29f914e6943441b9b7287d8bc88795ca0dff3757 100755 --- a/inst/doc/plot_timeseries.R +++ b/inst/doc/plot_timeseries.R @@ -24,9 +24,9 @@ year0 <- 1960 # first start date yearf <- 2005 # last start date intsdate <- 5 # interval between start dates runmeanlen <- 12 # length of the window for running mean (in months) -members <- list(19900101 = c('r1i1p1')) +members <- list('19900101' = c('r1i1p1')) #PUT IN ORDER. NONE CAN BE MISSING -chunks <- list(19900101 = c('199001-199001', '199002-199002', '199003-199003')) +chunks <- list('19900101' = c('199001-199001', '199002-199002', '199003-199003')) obs <- switch(var, 'siarea' = c('hadisst_v1.1'), 'siextent' = c('hadisst_v1.1'), 'tas' = c('ncep-reanalysis', 'era40'), 'tos' = c('ersst_v3b', diff --git a/man/ColorBar.Rd b/man/ColorBar.Rd index fb7cc7e508b23f6d87773077c3313436a1f03827..51ca567029edd49ca38e0f0daac7816acee1ba29 100644 --- a/man/ColorBar.Rd +++ b/man/ColorBar.Rd @@ -33,7 +33,7 @@ TRUE/FALSE for vertical/horizontal colour bar (disregarded if plot = FALSE). } \item{subsampleg}{ The first of each subsampleg breaks will be ticked on the colorbar.\cr -Takes by default an approximation of a value that yields a readable tick arrangement (extreme breaks always ticked). See the code of the function for details or use 'extra_labels' for customized tick arrangements. +Takes by default an approximation of a value that yields a readable tick arrangement (extreme breaks always ticked). If set to 0 or lower, no labels are drawn. 
See the code of the function for details or use 'extra_labels' for customized tick arrangements.
}
 \item{bar_limits}{
Vector of two numeric values with the extremes of the range of values represented in the colour bar. If 'var_limits' go beyond this interval, the drawing of triangle extremes is triggered at the corresponding sides, painted in 'col_inf' and 'col_sup'. Either of them can be set as NA and will then take as value the corresponding extreme in 'var_limits' (hence a triangle end won't be triggered for these sides). Takes as default the extremes of 'brks' if available, else the same values as 'var_limits'.
diff --git a/man/ConfigFileOpen.Rd b/man/ConfigFileOpen.Rd
index 5e52fc0f74139ac90a2fdb200cedbc053dd8245f..c42153fbf36c2b3ff11c6cee8130ad9c47d903d7 100644
--- a/man/ConfigFileOpen.Rd
+++ b/man/ConfigFileOpen.Rd
@@ -82,10 +82,11 @@
 For example:\cr
 !!table of experiments\cr
 ecmwf, tos, /path/to/dataset/, $FILE_NAME$\cr
 There are some reserved variables that will offer information about the store frequency, the current startdate Load() is fetching, etc:\cr
-  $START_DATE$, $STORE_FREQ$, $MEMBER_NUMBER$\cr
-  for observations: $YEAR$, $MONTH$, $DAY$\cr
+  $VAR_NAME$, $START_DATE$, $STORE_FREQ$, $MEMBER_NUMBER$\cr
+  for experiments only: $EXP_NAME$\cr
+  for observations only: $OBS_NAME$, $YEAR$, $MONTH$, $DAY$\cr
 Additionally, from an element in an entry value you can access the other elements of the entry as:\cr
-  $EXP_NAME$, $VAR_NAME$, $EXP_MAIN_PATH$, $EXP_FILE_PATH$, \cr$VAR_NAME$, $SUFFIX$, $VAR_MIN$, $VAR_MAX$\cr
+  $EXP_MAIN_PATH$, $EXP_FILE_PATH$, \cr$VAR_NAME$, $SUFFIX$, $VAR_MIN$, $VAR_MAX$\cr
 The variable $SUFFIX$ is useful because it can be used to take part in the main or file path. For example: '/path/to$SUFFIX$/dataset/'.\cr
 It will be replaced by the value in the column that corresponds to the suffix unless the user specifies a different suffix via the parameter 'suffixexp' or 'suffixobs'.\cr
diff --git a/man/Eno.Rd b/man/Eno.Rd
index 8a5e0b5f9a40582fbe573e0a9f2f74965b2e6207..8aed1d5b2316317a5fd202981dac038514dbfd28 100644
--- a/man/Eno.Rd
+++ b/man/Eno.Rd
@@ -25,27 +25,22 @@ Same dimensions as var but without the posdim dimension.
\examples{
# See examples on Load() to understand the first lines in this example
\dontrun{
-configfile <- paste0(tempdir(), '/sample.conf')
-ConfigFileCreate(configfile, confirm = FALSE)
-c <- ConfigFileOpen(configfile)
-c <- ConfigEditDefinition(c, 'DEFAULT_VAR_MIN', '-1e19', confirm = FALSE)
-c <- ConfigEditDefinition(c, 'DEFAULT_VAR_MAX', '1e19', confirm = FALSE)
 data_path <- system.file('sample_data', package = 's2dverification')
-exp_data_path <- paste0(data_path, '/model/$EXP_NAME$/')
-obs_data_path <- paste0(data_path, '/$OBS_NAME$/')
-c <- ConfigAddEntry(c, 'experiments', dataset_name = 'experiment',
-     var_name = 'tos', main_path = exp_data_path,
-     file_path = '$STORE_FREQ$_mean/$VAR_NAME$_3hourly/$VAR_NAME$_$START_DATE$.nc')
-c <- ConfigAddEntry(c, 'observations', dataset_name = 'observation',
-     var_name = 'tos', main_path = obs_data_path,
-     file_path = '$STORE_FREQ$_mean/$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc')
-ConfigFileSave(c, configfile, confirm = FALSE)
-
+exp <- list(
+       name = 'experiment',
+       path = file.path(data_path, 'model/$EXP_NAME$/monthly_mean',
+              '$VAR_NAME$_3hourly/$VAR_NAME$_$START_DATE$.nc')
+     )
+obs <- list(
+       name = 'observation',
+       path = file.path(data_path, 'observation/$OBS_NAME$/monthly_mean',
+              '$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc')
+     )
# Now we are ready to use Load().
startDates <- c('19851101', '19901101', '19951101', '20001101', '20051101')
-sampleData <- Load('tos', c('experiment'), c('observation'), startDates,
-              output = 'lonlat', latmin = 27, latmax = 48, lonmin = -12,
-              lonmax = 40, configfile = configfile)
+sampleData <- Load('tos', list(exp), list(obs), startDates,
+              leadtimemin = 1, leadtimemax = 4, output = 'lonlat',
+              latmin = 27, latmax = 48, lonmin = -12, lonmax = 40)
}
\dontshow{
startDates <- c('19851101', '19901101', '19951101', '20001101', '20051101')
diff --git a/man/EnoNew.Rd b/man/EnoNew.Rd
index 0cec23b452c87389481ddca2ee422d6097313262..20ed93214ceb513402f135727903055c2fa12c78 100644
--- a/man/EnoNew.Rd
+++ b/man/EnoNew.Rd
@@ -14,28 +14,22 @@ EnoNew(xdata, detrend = FALSE, filter = FALSE)
}
\examples{
# See examples on Load() to understand the first lines in this example
\dontrun{
-configfile <- paste0(tempdir(), '/sample.conf')
-ConfigFileCreate(configfile, confirm = FALSE)
-c <- ConfigFileOpen(configfile)
-c <- ConfigEditDefinition(c, 'DEFAULT_VAR_MIN', '-1e19', confirm = FALSE)
-c <- ConfigEditDefinition(c, 'DEFAULT_VAR_MAX', '1e19', confirm = FALSE)
 data_path <- system.file('sample_data', package = 's2dverification')
-exp_data_path <- paste0(data_path, '/model/$EXP_NAME$/')
-obs_data_path <- paste0(data_path, '/$OBS_NAME$/')
-c <- ConfigAddEntry(c, 'experiments', dataset_name = 'experiment',
-     var_name = 'tos', main_path = exp_data_path,
-     file_path = '$STORE_FREQ$_mean/$VAR_NAME$_3hourly/$VAR_NAME$_$START_DATE$.nc')
-c <- ConfigAddEntry(c, 'observations', dataset_name = 'observation',
-     var_name = 'tos', main_path = obs_data_path,
-     file_path = '$STORE_FREQ$_mean/$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc')
-ConfigFileSave(c, configfile, confirm = FALSE)
-
+exp <- list(
+       name = 'experiment',
+       path = file.path(data_path, 'model/$EXP_NAME$/monthly_mean',
+              '$VAR_NAME$_3hourly/$VAR_NAME$_$START_DATE$.nc')
+     )
+obs <- list(
+       name = 'observation',
+       path = file.path(data_path, 'observation/$OBS_NAME$/monthly_mean',
+              '$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc')
+     )
# Now we are ready to use Load().
startDates <- c('19851101', '19901101', '19951101', '20001101', '20051101')
-sampleData <- Load('tos', c('experiment'), c('observation'), startDates,
+sampleData <- Load('tos', list(exp), list(obs), startDates,
               leadtimemin = 1, leadtimemax = 4, output = 'lonlat',
-              latmin = 27, latmax = 48, lonmin = -12, lonmax = 40,
-              configfile = configfile)
+              latmin = 27, latmax = 48, lonmin = -12, lonmax = 40)
}
\dontshow{
startDates <- c('19851101', '19901101', '19951101', '20001101', '20051101')
diff --git a/man/Histo2Hindcast.Rd b/man/Histo2Hindcast.Rd
index 3764f66134d5ac463840108b903c50001a56fec1..06cf92e5d6da704840f931b1db5fbca59b8df168 100644
--- a/man/Histo2Hindcast.Rd
+++ b/man/Histo2Hindcast.Rd
@@ -31,28 +31,22 @@ An array with the same number of dimensions as varin, the same dimensions 1 and
\examples{
# See examples on Load() to understand the first lines in this example
\dontrun{
-configfile <- paste0(tempdir(), '/sample.conf')
-ConfigFileCreate(configfile, confirm = FALSE)
-c <- ConfigFileOpen(configfile)
-c <- ConfigEditDefinition(c, 'DEFAULT_VAR_MIN', '-1e19', confirm = FALSE)
-c <- ConfigEditDefinition(c, 'DEFAULT_VAR_MAX', '1e19', confirm = FALSE)
 data_path <- system.file('sample_data', package = 's2dverification')
-exp_data_path <- paste0(data_path, '/model/$EXP_NAME$/')
-obs_data_path <- paste0(data_path, '/$OBS_NAME$/')
-c <- ConfigAddEntry(c, 'experiments', dataset_name = 'experiment',
-     var_name = 'tos', main_path = exp_data_path,
-     file_path = '$STORE_FREQ$_mean/$VAR_NAME$_3hourly/$VAR_NAME$_$START_DATE$.nc')
-c <- ConfigAddEntry(c, 'observations', dataset_name = 'observation',
-     var_name = 'tos', main_path = obs_data_path,
-     file_path = '$STORE_FREQ$_mean/$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc')
-ConfigFileSave(c, configfile, confirm = FALSE)
-
+exp <- list(
+       name = 'experiment',
+       path = file.path(data_path, 'model/$EXP_NAME$/monthly_mean',
+              '$VAR_NAME$_3hourly/$VAR_NAME$_$START_DATE$.nc')
+     )
+obs <- list(
+       name = 'observation',
+       path = file.path(data_path, 'observation/$OBS_NAME$/monthly_mean',
+              '$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc')
+     )
# Now we are ready to use Load().
-startDates <- c('19901101')
-sampleData <- Load('tos', c('experiment'), c('observation'), startDates,
-              leadtimemin = 1, leadtimemax = 4, output = 'areave',
-              latmin = 27, latmax = 48, lonmin = -12, lonmax = 40,
-              configfile = configfile)
+startDates <- c('19851101', '19901101', '19951101', '20001101', '20051101')
+sampleData <- Load('tos', list(exp), list(obs), startDates,
+              leadtimemin = 1, leadtimemax = 4, output = 'lonlat',
+              latmin = 27, latmax = 48, lonmin = -12, lonmax = 40)
}
\dontshow{
startDates <- c('19901101')
diff --git a/man/Load.Rd b/man/Load.Rd
index 32e02ada994f8116c5a46f52869fdd73af9d0ad4..eef60299dab3bb0e60216aee4cb8bcd63109759e 100644
--- a/man/Load.Rd
+++ b/man/Load.Rd
@@ -111,13 +111,13 @@ Load(var, exp = NULL, obs = NULL, sdates, nmember = NULL,
}
\arguments{
  \item{var}{
-Name of the variable to load.\cr
-If the variable name inside the files to load is not the same as this, adjust properly the parameters 'exp' and 'obs'.\cr
-This parameter is mandatory.\cr
-Ex: 'tas'
+Short name of the variable to load. It should coincide with the variable name inside the data files.\cr
+E.g.: \code{var = 'tos'}, \code{var = 'tas'}, \code{var = 'prlr'}.\cr
+In some cases, though, the path to the files contains the short name of the variable two or more times, but the actual name of the variable inside the data files is different.
In these cases it may be convenient to provide \code{var} with the name that appears in the file paths (see details on parameters \code{exp} and \code{obs}).
}
  \item{exp}{
-This argument can take two formats: a list of lists or a vector of character strings. Each format will trigger a different mechanism of locating the requested datasets.\cr
+Parameter to specify which experimental datasets to load data from.\cr
+It can take two formats: a list of lists or a vector of character strings. Each format will trigger a different mechanism of locating the requested datasets.\cr
The first format is adequate when loading data you'll only load once or occasionally. The second format is targeted to avoid providing repeatedly the information on a certain dataset but is more complex to use.\cr\cr
IMPORTANT: Place first the experiment with the largest number of members and, if possible, with the largest number of leadtimes. If not possible, the arguments 'nmember' and/or 'nleadtime' should be filled to not miss any member or leadtime.\cr
If 'exp' is not specified or set to NULL, observational data is loaded for each start-date as far as 'leadtimemax'. If 'leadtimemax' is not provided, \code{Load()} will retrieve data of a period of time as long as the time period between the first specified start date and the current date.\cr
@@ -272,8 +272,9 @@ Takes by default the value 'conservative'.
  \item{grid}{
A common grid can be specified through the parameter 'grid' when loading 2-dimensional data. Data is then interpolated onto this grid whichever 'output' type is specified. If the selected output type is 'areave' and a 'grid' is specified, the area averages are calculated after interpolating to the specified grid.\cr
If not specified and the selected output type is 'lon', 'lat' or 'lonlat', this parameter takes as default value the grid of the first experimental dataset, which is read automatically from the source files.\cr
-The grid must be supported by 'cdo' tools: rNXxNY or tTRgrid.\cr
-Ex: 'r96x72'\cr
+The grid must be supported by 'cdo' tools. Currently only rNXxNY and tRESgrid are supported.\cr
+Both rNXxNY and tRESgrid yield rectangular regular grids. rNXxNY yields grids that are evenly spaced in longitudes and latitudes (in degrees). tRESgrid refers to a grid generated with a series of spherical harmonics truncated at the RESth harmonic. However, these spectral grids are usually associated with a gaussian grid, the latitudes of which are spaced with a Gaussian quadrature (not evenly spaced in degrees). The pattern tRESgrid will yield a gaussian grid.\cr
+Ex: 'r96x72'
Advanced: If the output type is 'lon', 'lat' or 'lonlat' and no common grid is specified, the grid of the first experimental or observational dataset is detected and all data is then interpolated onto this grid. If the first experimental or observational dataset's data is found shifted along the longitudes (i.e., there's no value at the longitude 0 but at a longitude close to it), the data is re-interpolated to suppress the shift. This has to be done in order to make sure all the data from all the datasets is properly aligned along longitudes, as there's no option so far in \code{Load} to specify grids starting at longitudes other than 0. This issue doesn't arise when loading in 'areave' mode without a common grid; the data is not re-interpolated in that case.
}
  \item{maskmod}{
@@ -490,12 +491,46 @@ History:\cr
# The observational dataset used for verification is the 'ERSST'
# observational dataset.
#
-# The code is not run because it dispatches system calls to 'cdo' and 'nco'
-# which is not allowed as per CRAN policies. You can run it in your system
-# though. Instead, the code in 'dontshow' is run, which loads the equivalent
-# data already processed in R.
+# The next two examples are equivalent and show how to load the variable
+# 'tos' from these sample datasets, the first providing lists of lists to
+# the parameters 'exp' and 'obs' (see documentation on these parameters) and
+# the second providing vectors of character strings, hence using a
+# configuration file.
+#
+# The code is not run because it dispatches system calls to 'cdo' which is
+# not allowed in the examples as per CRAN policies. You can run it on your
+# system though.
+# Instead, the code in 'dontshow' is run, which loads the equivalent
+# already processed data in R.
+#
+# Example 1: Providing lists of lists to 'exp' and 'obs':
+#
 \dontrun{
+data_path <- system.file('sample_data', package = 's2dverification')
+exp <- list(
+       name = 'experiment',
+       path = file.path(data_path, 'model/$EXP_NAME$/monthly_mean',
+              '$VAR_NAME$_3hourly/$VAR_NAME$_$START_DATE$.nc')
+     )
+obs <- list(
+       name = 'observation',
+       path = file.path(data_path, 'observation/$OBS_NAME$/monthly_mean',
+              '$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc')
+     )
+# Now we are ready to use Load().
+startDates <- c('19851101', '19901101', '19951101', '20001101', '20051101')
+sampleData <- Load('tos', list(exp), list(obs), startDates,
+                   output = 'areave', latmin = 27, latmax = 48,
+                   lonmin = -12, lonmax = 40)
 }
+#
+# Example 2: Providing vectors of character strings to 'exp' and 'obs'
+# and using a configuration file.
+#
+# The configuration file 'sample.conf' that we will create in the example
+# has the proper entries to load these (see ?ConfigFileOpen for details on
+# writing a configuration file).
#
-# Example 1: providing lists in 'exp' and 'obs'.
\dontrun{
data_path <- system.file('sample_data', package = 's2dverification')
expA <- list(name = 'experiment', path = file.path(data_path,
diff --git a/man/PlotLayout.Rd b/man/PlotLayout.Rd
index 9e0c2cb7937db03122e02a89a83365f2cc0bb54d..34331a3776bb2eddc5179e7c06abaa3d7f85afe0 100644
--- a/man/PlotLayout.Rd
+++ b/man/PlotLayout.Rd
@@ -41,7 +41,7 @@ Multi-dimensional array with at least the dimensions expected by the specified p
Parameters to be sent to the plotting function 'fun'. If multiple arrays are provided in 'var' and multiple functions are provided in 'fun', the parameters provided through \dots will be sent to all the plot functions, as common parameters. To specify concrete arguments for each of the plot functions see parameter 'special_args'.
}
  \item{special_args}{
-List of sub-lists, each sub-list having extra arguments for each of the plot functions provided in 'fun'.
+List of sub-lists, each sub-list having specific extra arguments for each of the plot functions provided in 'fun'.
If you want to fix a different value for each plot in the layout you can do so by a) splitting your array into a list of sub-arrays (each with the data for one plot) and providing it as parameter 'var', b) providing a list of named sub-lists in 'special_args', where the names of each sub-list match the names of the parameters to be adjusted, and each value in a sub-list contains the value of the corresponding parameter.
}
  \item{nrow}{
Numeric value to force the number of rows in the automatically generated layout. If higher than required, this will yield blank cells in the layout (which can then be populated).
If lower than required, the function will stop. By default it is configured to arrange the layout in a shape as square as possible. Blank cells can be manually populated afterwards with customized plots (see SwitchTofigure).
diff --git a/man/s2dverification.Rd b/man/s2dverification.Rd
index 23d31299fe781177a445de3cbfc50ad9609d8ed8..27da42f0ba78f7833db6b10d5756e057cbb2a8cc 100644
--- a/man/s2dverification.Rd
+++ b/man/s2dverification.Rd
@@ -10,8 +10,8 @@ Set of tools to verify forecasts through the computation of typical prediction s
\tabular{ll}{
Package: \tab s2dverification\cr
Type: \tab Package\cr
-Version: \tab 2.8.3\cr
-Date: \tab 2017-05-22\cr
+Version: \tab 2.8.5\cr
+Date: \tab 2019-01-30\cr
License: \tab LGPLv3\cr
}
Check an overview of the package functionalities and its modules at \url{https://earth.bsc.es/gitlab/es/s2dverification/wikis/home}.
diff --git a/man/sampleMap.Rd b/man/sampleMap.Rd
index e1afac5b6f8d17072d489fb6e545a131841fd978..fd5b9714149030200274dc214ac7c56ac1e933fa 100644
--- a/man/sampleMap.Rd
+++ b/man/sampleMap.Rd
@@ -7,29 +7,22 @@ This data set provides data in function of longitudes and latitudes for the vari
The data is provided through a variable named 'sampleMap' and is structured as expected from the 'Load()' function in the 's2dverification' package if it was called as follows:\cr\cr
\preformatted{
-configfile <- paste0(tempdir(), '/sample.conf')
-ConfigFileCreate(configfile, confirm = FALSE)
-c <- ConfigFileOpen(configfile)
-c <- ConfigEditDefinition(c, 'DEFAULT_VAR_MIN', '-1e19', confirm = FALSE)
-c <- ConfigEditDefinition(c, 'DEFAULT_VAR_MAX', '1e19', confirm = FALSE)
 data_path <- system.file('sample_data', package = 's2dverification')
-exp_data_path <- paste0(data_path, '/model/$EXP_NAME$/')
-obs_data_path <- paste0(data_path, '/$OBS_NAME$/')
-c <- ConfigAddEntry(c, 'experiments', dataset_name = 'experiment',
-     var_name = 'tos', main_path = exp_data_path,
-     file_path = file.path('$STORE_FREQ$_mean',
-                 '$VAR_NAME$_3hourly/$VAR_NAME$_$START_DATE$.nc')
-c <- ConfigAddEntry(c, 'observations', dataset_name = 'observation',
-     var_name = 'tos', main_path = obs_data_path,
-     file_path = file.path('$STORE_FREQ$_mean',
-                 '$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc')
-ConfigFileSave(c, configfile, confirm = FALSE)
-
+exp <- list(
+       name = 'experiment',
+       path = file.path(data_path, 'model/$EXP_NAME$/monthly_mean',
+              '$VAR_NAME$_3hourly/$VAR_NAME$_$START_DATE$.nc')
+     )
+obs <- list(
+       name = 'observation',
+       path = file.path(data_path, 'observation/$OBS_NAME$/monthly_mean',
+              '$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc')
+     )
# Now we are ready to use Load().
startDates <- c('19851101', '19901101', '19951101', '20001101', '20051101')
-sampleData <- Load('tos', c('experiment'), c('observation'), startDates,
-              output = 'lonlat', latmin = 27, latmax = 48, lonmin = -12,
-              lonmax = 40, configfile = configfile)
+sampleData <- Load('tos', list(exp), list(obs), startDates,
+              leadtimemin = 1, leadtimemax = 4, output = 'lonlat',
+              latmin = 27, latmax = 48, lonmin = -12, lonmax = 40)
}
Check the documentation on 'Load()' in the package 's2dverification' for more information.
}
diff --git a/man/sampleTimeSeries.Rd b/man/sampleTimeSeries.Rd
index 669d9441c9d9ae6c27dfe39387e1513d9f0886fb..869c91e4af79597dc40bf3d2da43fc1bfca09cfd 100644
--- a/man/sampleTimeSeries.Rd
+++ b/man/sampleTimeSeries.Rd
@@ -6,29 +6,22 @@
This data set provides area-averaged data for the variable 'tos', i.e. sea surface temperature, over the Mediterranean zone from the example datasets attached to the package.
See examples on Load() for more details.\cr\cr
The data is provided through a variable named 'sampleTimeSeries' and is structured as expected from the 'Load()' function in the 's2dverification' package if it was called as follows:\cr\cr
\preformatted{
-configfile <- paste0(tempdir(), '/sample.conf')
-ConfigFileCreate(configfile, confirm = FALSE)
-c <- ConfigFileOpen(configfile)
-c <- ConfigEditDefinition(c, 'DEFAULT_VAR_MIN', '-1e19', confirm = FALSE)
-c <- ConfigEditDefinition(c, 'DEFAULT_VAR_MAX', '1e19', confirm = FALSE)
 data_path <- system.file('sample_data', package = 's2dverification')
-exp_data_path <- paste0(data_path, '/model/$EXP_NAME$/')
-obs_data_path <- paste0(data_path, '/$OBS_NAME$/')
-c <- ConfigAddEntry(c, 'experiments', dataset_name = 'experiment',
-     var_name = 'tos', main_path = exp_data_path,
-     file_path = file.path('$STORE_FREQ$_mean',
-                 '$VAR_NAME$_3hourly/$VAR_NAME$_$START_DATE$.nc')
-c <- ConfigAddEntry(c, 'observations', dataset_name = 'observation',
-     var_name = 'tos', main_path = obs_data_path,
-     file_path = file.path('$STORE_FREQ$_mean/$VAR_NAME$',
-                 '$VAR_NAME$_$YEAR$$MONTH$.nc')
-ConfigFileSave(c, configfile, confirm = FALSE)
-
+exp <- list(
+       name = 'experiment',
+       path = file.path(data_path, 'model/$EXP_NAME$/monthly_mean',
+              '$VAR_NAME$_3hourly/$VAR_NAME$_$START_DATE$.nc')
+     )
+obs <- list(
+       name = 'observation',
+       path = file.path(data_path, 'observation/$OBS_NAME$/monthly_mean',
+              '$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc')
+     )
# Now we are ready to use Load().
startDates <- c('19851101', '19901101', '19951101', '20001101', '20051101')
-sampleData <- Load('tos', c('experiment'), c('observation'), startDates,
-              output = 'areave', latmin = 27, latmax = 48, lonmin = -12,
-              lonmax = 40, configfile = configfile)
+sampleData <- Load('tos', list(exp), list(obs), startDates,
+              output = 'areave', latmin = 27, latmax = 48, lonmin = -12,
+              lonmax = 40)
}
Check the documentation on 'Load()' in the package 's2dverification' for more information.
}
diff --git a/s2dverification-manual.pdf b/s2dverification-manual.pdf
index 73866213b995894800577557a2ec44459e2bb07f..0392b213206b1f57fe404dee8b31e4b81d31694c 100644
Binary files a/s2dverification-manual.pdf and b/s2dverification-manual.pdf differ
diff --git a/vignettes/data_retrieval.md b/vignettes/data_retrieval.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f3bd03c614f936aa02de93b2a4dcb59eaf66257
--- /dev/null
+++ b/vignettes/data_retrieval.md
@@ -0,0 +1,667 @@
+---
+title: "Data Retrieval"
+author: "Nicolau Manubens"
+date: "`r Sys.Date()`"
+output: pdf_document
+vignette: >
+  %\VignetteEngine{R.rsp::md}
+  %\VignetteIndexEntry{Data Retrieval}
+  %\usepackage[utf8]{inputenc}
+---
+
+Data retrieval
+==============
+One of the strong points of s2dverification is its data retrieval module.
+Its aim is to load and homogenize forecast or hindcast data generated by
+climate models, usually in multiple runs and multiple ensemble members per
+run, as well as the corresponding observational time series obtained from
+either pure observations or reanalyses.
+
+A climate model run simulates a time period starting at a **starting date**
+and generates data for a series of time steps regularly spaced along the
+**forecast time** that make up the simulated time period. A model run is
+also usually referred to as a starting date.
+
+The **lead-time** at each forecast time step is the amount of simulated time
+since the beginning of the simulation, e.g., the first value of a
+monthly-averaged forecast has a lead-time of 0 months, the second value has a
+lead-time of 1 month and so on. In the context of s2dverification, however,
+the term lead-time is used to refer to the forecast time steps:
+the first lead-time of a forecast is the value with a lead-time of 0 time units
+(the first time step), the second lead-time is the one with a lead-time of 1
+time unit (the second time step) and so on.
+Data at one forecast time (or lead-time) from different runs or starting dates
+make up a time series defined along the **actual time**.
+
+`Load()` is the interface function to the data retrieval module. Through it
+you can request previously monthly-averaged or daily-averaged 2-dimensional
+variables or global mean variables from as many experimental and observational
+multi-member datasets as needed, at the selected starting dates and forecast
+time steps, over any longitude-latitude region of the Earth surface.
+
+`Load()` automatically fetches the selected starting dates of the requested
+experimental datasets and the date-corresponding observational data,
+homogenizes it and loads it into an R object ready to be processed.
+If done manually, this homogenization procedure can be time-consuming and
+become a hurdle in the verification process.
+
+The object returned by `Load()` contains, among others, two arrays with the
+requested data, one for the experimental data and the other for observational
+data, with a distinctive dimension structure of s2dverification:
+```r
+c(n. of datasets, n. of members, n. of starting dates, n. of lead-times, n. of latitudes, n. of longitudes)
+```
+
+The homogenization process consists of the following steps:
+ 1. Regridding data. `Load()` relies on CDO tools for this task (see
+[**Installation**](../README.md)).
+ 2. Applying masks to disable certain grid cells.
+ 3. Disabling values beyond certain thresholds.
+ 4. Computing area-weighted averages.
+
+(The 1st, 2nd and 4th steps only apply if loading 2-dimensional variables.)
+
+Supported file formats
+----------------------
+There are some guidelines that the files of the datasets must follow so that
+`Load()` is able to recognize, fetch and process them:
+ - Data files must be NetCDF 3/4 local or remote files, or indexed in NCML
+catalogs provided by THREDDS servers.
+ - Data files must contain monthly or daily time series. Data in other
+frequencies can also be loaded as experimental data, but the automatic fetching
+of date-corresponding observations will fail.
+ - The only supported source (and target) grids are regular and gaussian grids
+(those definable with the CDO grid names that follow the patterns 'rNXxNY'
+or 'tTRgrid').
+ - The variable of interest must be defined over the dimensions
+(time[, members]) if it is a global mean time series, or (time[, members],
+latitudes, longitudes) if it is a 2-dimensional field. These dimensions
+can be named at will and can be in any order. The least time-consuming orders
+to load are, however, (time[, members]) or (time[, members], latitudes,
+longitudes). Here, the dimension time refers to forecast time.
+ - If the data is 2-dimensional, a variable with the grid latitudes and a
+variable with the grid longitudes must be defined, with the same name as their
+corresponding dimension.
+ - The files of an experimental dataset must be distributed in either of the
+following two ways:
+   - File per starting date: each file must contain the starting date\*
+somewhere in its path.
+   - File per starting date per member: each file must contain the
+starting date\* and the member number\*\* somewhere in its path.
+ - The files of an observational dataset must be distributed in either of the
+following three ways:
+   - File per month: each file must contain the year and month\*\*\* somewhere
+in its path.
+   - File per month per member: each file must contain the member
+number\*\* and the year and month\*\*\* somewhere in its path.
+   - Single file: the time axis must be properly defined to work with
+`cdo showmon` and `cdo showyear`.
+
+\*: starting dates in the format YYYYMMDD.
+\*\*: member number, from 0 to total_number_of_members - 1, with as many
+leading 0s as needed to reach the number of digits required to represent
+total_number_of_members - 1.
+\*\*\*: year in the format YYYY and month in the format MM.
+
+Load() options
+--------------
+
+As of s2dverification v2.5.0, `Load()` has 27 parameters.
+14 of them are for selecting the data of interest, 8 of them are for adjusting
+the homogenization steps and the other 5 are for general adjustments of
+`Load()`. See them explained in 3 sections:
+ - [**Selection parameters**](#selection)
+ - [**Homogenization parameters**](#homogenization)
+ - [**General parameters**](#general)
+
+### Selection parameters
+The selection parameters are `var`, `exp`, `obs`, `sdates`, `nmember`,
+`nmemberobs`, `nleadtime` (deprecated), `leadtimemin`, `leadtimemax`,
+`sampleperiod`, `lonmin`, `lonmax`, `latmin` and `latmax`. You can see most of
+them in use in the [**Example**](example.md) except for `nmember`,
+`nmemberobs`, `nleadtime` and `sampleperiod`; a short sketch of these follows
+below. The first part of that example is reproduced in more detail below in
+this section.
+
+`var` must be provided with the short name of a 2-dimensional or global mean
+variable of interest, for example `var = 'tas'` (most usually the provided
+name must be the same as the variable name inside the NetCDF data files).
+`exp` and `obs` contain the information on the location of the requested
+datasets, `sdates` and `leadtimemin`/`leadtimemax` tell `Load()` which
+starting dates and lead-times to pick and so on.
+Check the [**user manual**](../s2dverification-manual.pdf) or `?Load` for a
+thorough explanation.
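+
+As a quick sketch of three of those parameters (`nleadtime` is deprecated;
+`expA` and `obsX` stand for dataset definitions like the ones built in the
+example that follows, and the values are purely illustrative):
+```r
+sdates <- paste0(1991:2000, '1101')
+data <- Load(var = 'tas', exp = list(expA), obs = list(obsX),
+             sdates = sdates,
+             nmember = 9,      # expected maximum number of members
+             nmemberobs = 1,   # idem for the observational datasets
+             leadtimemin = 2,  # skip the first lead-time...
+             sampleperiod = 3) # ...then keep 1 of every 3 lead-times
+```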
+
+Now, as an example, let's consider we have two experimental datasets and one
+observational dataset, all with data for the variable 'tas' and with the
+files distributed as depicted in the following directory tree:
+```
+/path/to/
+  |--experiments/
+  |    |--experimentA/
+  |    |    |--monthly_mean/
+  |    |         |--tas/
+  |    |              |--tas_19911101.nc
+  |    |              |--tas_19921101.nc
+  |    |              |        ·
+  |    |              |        ·
+  |    |              |        ·
+  |    |              |--tas_20001101.nc
+  |    |--experimentB/
+  |         |--monthly_mean/
+  |              |--tas/
+  |                   |--tas_19911101.nc
+  |                   |--tas_19921101.nc
+  |                   |        ·
+  |                   |        ·
+  |                   |        ·
+  |                   |--tas_20001101.nc
+  |--observations/
+       |--observationX/
+            |--monthly_mean/
+                 |--tas/
+                      |--tas_199101.nc
+                      |--tas_199102.nc
+                      |--tas_199103.nc
+                      |--tas_199104.nc
+                      |--tas_199105.nc
+                      |--tas_199106.nc
+                      |--tas_199107.nc
+                      |--tas_199108.nc
+                      |--tas_199109.nc
+                      |--tas_199110.nc
+                      |--tas_199111.nc
+                      |--tas_199112.nc
+                      |--tas_199201.nc
+                      |--tas_199202.nc
+                      |        ·
+                      |        ·
+                      |        ·
+                      |--tas_199401.nc
+                      |--tas_199402.nc
+                      |--tas_199403.nc
+```
+We assume these files are NetCDF 3/4 compliant and have the variable 'tas'
+defined inside, fulfilling the guidelines explained in the previous
+section.
+
+We can now build the information blocks of each dataset as follows:
+```r
+library(s2dverification)
+
+expA <- list(
+    name = 'experimentA',
+    path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                     '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc')
+  )
+expB <- list(
+    name = 'experimentB',
+    path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                     '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc')
+  )
+obsX <- list(
+    name = 'observationX',
+    path = file.path('/path/to/observations/$OBS_NAME$/monthly_mean',
+                     '$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc')
+  )
+```
+`Load()` will automatically replace the wildcards (keywords starting and
+ending with '$') with the corresponding starting dates, variable name, etc.
+See help on parameters `exp` and `obs` in `?Load` for a full explanation.
+
+Now it is time to issue the `Load()` call: the target variable name will be
+`'tas'` and the parameters `exp` and `obs` can be built from the previous
+information blocks. Apart from that, we just need to specify a set of starting
+dates for `Load()` to work:
+
+```r
+sdates <- paste0(1991:2000, '1101')
+data <- Load(var = 'tas', exp = list(expA, expB), obs = list(obsX),
+             sdates = sdates)
+```
+
+This will pick by default as many members and lead-times of each experimental
+dataset as found in the first one ('expA'), and as many members and lead-times
+of each observational dataset (only 1 in this example) as found in the first
+one ('obsX').
+
+All the data will be returned in a named list with many components, in a
+format similar to that used in the packages 'downscaleR' and 'loadR' (see
+`?Load` for a full description of the returned metadata).
+The two components that contain the experimental and observational data are
+`$mod` and `$obs`. These are two arrays with the dimensions in the following
+order:
+```r
+c(n. of datasets, n. of members, n. of starting dates, n. of lead-times, n. of latitudes, n. of longitudes)
+```
+The presence of the last two dimensions will depend on whether the requested
+variable is 2-dimensional or global mean and on the selected `output` type
+(see below or in `?Load`). See
+[**Normalizing fields**](statistics.md#normalizing)
+for the *raison d'être* of this array structure for observational data.
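+
+As a quick first check, the returned object can be inspected right away. This
+is just an illustrative sketch; the exact set of components is described in
+`?Load`:
+```r
+# Among others, the returned list contains the data arrays $mod and $obs,
+# the final $lat and $lon, and the list of $source_files that were read.
+names(data)
+dim(data$mod)
+length(data$source_files)
+```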
+
+In the case of the example, if 'expA' has 5 members but 'expB' has 8, the
+resulting size of the members dimension will be 5 and the last 3 members of
+'expB' will be discarded. Conversely, if 'expB' has only 2 members, the 3
+remaining members will be filled up with NA values. Exactly the same happens
+with the lead-times.
+To avoid this default behaviour you can specify the expected maximum number of
+members in `nmember` and `nmemberobs` and the expected maximum number of
+lead-times in `leadtimemax`.
+
+Also, if not interested in the first few lead-times of the starting dates, you
+can discard them by specifying the first lead-time of interest with the
+parameter `leadtimemin`, and even pick one lead-time out of every
+`sampleperiod` lead-times starting from `leadtimemin`.
+
+The following code will pick the lead-times corresponding to a time period of
+one year, starting from December, since the first month of the starting dates
+is November and `leadtimemin` is 2:
+
+```r
+sdates <- paste0(1991:2000, '1101')
+data <- Load(var = 'tas', exp = list(expA, expB), obs = list(obsX),
+             sdates = sdates, leadtimemin = 2, leadtimemax = 13)
+```
+
+It is possible to load experimental data only or observational data only by
+providing `NULL` to `exp` or `obs`.
+
+In the example just above, the loaded variable is 'tas', a 2-dimensional
+variable. However, `Load()` computes by default an area-weighted
+average, hence the component `data$mod` will have dimensions
+`c(2, 5, 10, 12)` and the component `data$obs` will have dimensions
+`c(1, 1, 10, 12)`. Read below in **Homogenization parameters** on how to
+change the `output` type of `Load()`.
+
+If you want to work with averages over a specific region of the Earth surface
+you can specify the limits of its bounding box via the parameters `latmin`,
+`latmax`, `lonmin` and `lonmax`. For example, for the North Pacific region:
+
+```r
+sdates <- paste0(1991:2000, '1101')
+data <- Load(var = 'tas', exp = list(expA, expB), obs = list(obsX),
+             sdates = sdates, leadtimemin = 2, leadtimemax = 13,
+             latmin = -10, latmax = 60, lonmin = 100, lonmax = 250)
+```
+
+### Homogenization parameters
+This group comprises all the parameters that affect any of the homogenization
+steps mentioned in the introduction of
+[**Data retrieval**](data_retrieval.md), namely: `output`, `method`, `grid`,
+`maskmod`, `maskobs`, `varmin`, `varmax` and `remapcells`.
+
+When loading 2-dimensional variables, `Load()` calculates and returns by
+default area-weighted averages of the values in the grid cells that fall
+inside the requested region. The area averages are computed from the original
+datasets in their original grids. `Load()`, however, prior to computing the
+averages, can remap all the data to a common grid if you specify it via the
+`grid` argument. The grid specification must be a character string that
+follows one of the patterns 'r<NX>x<NY>' or 't<RES>grid', where '<NX>' and
+'<NY>' are the number of longitudes and latitudes of a rectangular grid and
+'<RES>' is the factor of truncation of spherical harmonics of a gaussian grid.
+E.g.: 'r320x160' or 't73grid'.
+
+Internally, `Load()` will dispatch CDO calls to interpolate the original files.
+One of four methods of interpolation can be chosen through the parameter
+`method`: 'conservative', 'bilinear', 'bicubic' or 'distance-weighted'. The
+parameter `remapcells` is related to the precision of grid interpolations;
+you can read more in `?Load`.
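+
+For instance, a minimal sketch (reusing the information blocks built above)
+that only remaps the data, with no other adjustments, could look like this:
+```r
+# Remap all datasets to a common 320x160 regular grid with bilinear
+# interpolation; the area-weighted averages are then computed on that grid.
+data <- Load(var = 'tas', exp = list(expA, expB), obs = list(obsX),
+             sdates = paste0(1991:2000, '1101'),
+             grid = 'r320x160', method = 'bilinear')
+```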
+
+Additionally, any grid cells of your choice can be disabled before computing
+the averages by providing a mask for each dataset in `exp` and `obs`. These
+masks must be provided with the arguments `maskmod` and `maskobs` and can
+simply be the path to a NetCDF mask file or an R array of the proper size.
+Read more on the arguments `maskmod` and `maskobs` in `?Load` to see the
+guidelines the mask files must follow or how to provide a mask as an R array.
+
+It is also possible to work directly with 2-dimensional data without averaging
+by setting the parameter `output = 'lonlat'`. With this option the parameters
+`grid`, `method`, `maskmod` and `maskobs` still apply: you can request your
+data to be in a common rectangular or gaussian grid of your choice and disable
+some values with a mask.
+
+To summarize the homogenization parameters mentioned so far, we could dispatch
+the following `Load()` call:
+
+```r
+mask <- list(path = '/path/to/masks/land_mask_r320x160.nc')
+sdates <- paste0(1991:2000, '1101')
+data <- Load(var = 'tas', exp = list(expA, expB), obs = list(obsX),
+             sdates = sdates, leadtimemin = 2, leadtimemax = 13,
+             latmin = -10, latmax = 60, lonmin = 100, lonmax = 250,
+             output = 'lonlat', grid = 'r320x160', method = 'bicubic',
+             maskmod = list(mask, mask), maskobs = list(mask))
+```
+
+Finally, extreme values can be disabled by setting an upper and a lower
+threshold via the parameters `varmin` and `varmax`.
+
+### General parameters
+The general parameters are `storefreq`, `configfile`, `silent`, `nprocs` and
+`dimnames`.
+
+#### Loading daily data
+It is possible to load not only monthly data but also daily data, with exactly
+the same functionalities available in both modes. `storefreq` tells `Load()`
+whether the experimental data to load is stored in a monthly frequency or in a
+daily frequency. This way it will be able to work out the dates of the
+corresponding observational data and fetch the corresponding files. The only
+supported values for this parameter currently are 'monthly' (the default) and
+'daily'.
+Note that in the case that only experimental datasets are requested
+(via `exp`) any storage frequency is supported, since `Load()` doesn't
+have to make date-correspondences. It is the user's responsibility in these
+cases to ensure that the experimental datasets are stored in a common time
+frequency.
+
+#### Adjusting dimension names
+Regarding the guidelines that the files of a dataset must follow, the variable
+of interest inside the NetCDF files must be defined over the dimensions
+(time[, members][, latitudes, longitudes]). These dimensions can be named at
+will but, by default, the expected names are 'ensemble', 'latitude'
+and 'longitude' (the time dimension can take any name).
+If the name of any of the dimensions in all the datasets to load differs from
+this default, it can be adjusted with the parameter `dimnames` as in the
+following example:
+```r
+dimnames = list(member = 'members', lat = 'j', lon = 'i')
+```
+However, if only one of the datasets breaks the rules or each dataset has its
+own naming convention, you can adjust the names individually when building the
+information blocks of each dataset, as in the following example:
+```r
+expA <- list(
+    name = 'experimentA',
+    path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                     '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc'),
+    dimnames = list(member = 'members')
+  )
+expB <- list(
+    name = 'experimentB',
+    path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                     '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc'),
+    dimnames = list(lat = 'j', lon = 'i')
+  )
+obsX <- list(
+    name = 'observationX',
+    path = file.path('/path/to/observations/$OBS_NAME$/monthly_mean',
+                     '$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc'),
+    dimnames = list(member = 'ens', lat = 'y', lon = 'x')
+  )
+```
+
+#### Workflow of `Load()` and parallelism
+The workflow of `Load()` consists of 3 steps:
+ 1. Building up work pieces
+Each work piece is a packet of information that describes the location of a
+**single** data file requested in a `Load()` call (e.g. 'tas_19901101.nc') and
+the homogenization and arranging operations to be applied to that file.
+ 2. Setting up a cluster
+A multi-core cluster is set up. As of now `Load()` can only make the most of
+multiple cores in a single processor, but not of multiple processors.
+The number of nodes in the multi-core cluster can be adjusted via `nprocs`
+(number of processes, as each node is a full new R process). Note that the
+number of nodes does not necessarily have to coincide with the number of
+cores: if `nprocs` is lower than the number of cores, some of the available
+cores will probably remain unused, and if `nprocs` is higher, a single core
+will have more than one task assigned at a time. Depending on the environment
+this may be convenient.
+Three aspects to take into account when selecting an appropriate `nprocs` are:
+ - Whether hyperthreading is activated in the system.
+ - The fact that each node will dispatch parallel blocking calls to CDO
+during its execution.
+ - The fact that a 'bigmemory' shared memory region will be set up for the
+multiple cores to arrange the resulting data of each work piece.
+Setting `nprocs = 1` will avoid creating a cluster and all the workload will
+be carried out in a single process (which may still dispatch parallel blocking
+CDO calls).
+ 3. Dispatching the work pieces to the cluster
+Once the work pieces and the cluster are ready, `Load()` dispatches the work
+pieces with load balancing to the nodes of the cluster. Each node keeps
+applying the internal function `LoadDataFile()` to the work pieces it
+receives.
+
+#### Running `Load()` silently
+Throughout the process, `Load()` keeps reporting its status and any
+detected potential issues. When run in batch mode, you may want to turn the
+messages off by setting `silent = TRUE`.
+
+#### Using a configuration file
+When using `Load()` frequently, one may end up repeating many times the code
+to build the same information blocks to load certain specific datasets.
+Besides, when using `Load()` in a team, one member may be interested
+in using datasets that other members usually work with, or may want to make
+their own datasets accessible to colleagues.
+
+To help in these two situations, `Load()` has a built-in mechanism to read
+**configuration files** that contain, in a tabular and compact format, the
+information blocks of the datasets of interest. This way, by defining the
+information of a dataset only once in a configuration file, `Load()` will be
+able to read it as needed. Also, the members of a team can share
+a configuration file to put the information of their datasets at others'
+disposal.
+
+A configuration file can be created by hand, following the specifications in
+`?ConfigFileOpen`, or can be created and edited through a set of functions
+included in s2dverification: `ConfigFileCreate()`, `ConfigFileOpen()`,
+`ConfigAddEntry()`, `ConfigEditEntry()`, `ConfigRemoveEntry()`,
+`ConfigEditDefinition()`, `ConfigRemoveDefinition()`, `ConfigFileSave()`,
+`ConfigShowTable()`, `ConfigShowDefinitions()`, `ConfigShowSimilarEntries()`
+and `ConfigApplyMatchingEntries()`.
+
+The first option can be faster for experienced users, but the second option is
+safer and useful to automate the creation of configuration files. The
+second option will be used in the following examples, but the raw file will
+also be displayed to show the internals of the configuration files and to
+give some hints on how to create them manually.
+
+A blank new configuration file can be created as follows:
+```r
+conf_path <- '/path/to/configfile.conf'
+ConfigFileCreate(conf_path)
+```
+This will generate a file at the specified path with all the boilerplate:
+```
+# s2dverification configuration file
+#
+# Check ?ConfigFileOpen after loading s2dverification for detailed
+# documentation on this configuration file.
+
+#############
+!!definitions
+#############
+DEFAULT_EXP_MAIN_PATH = $EXP_NAME$
+DEFAULT_EXP_FILE_PATH = $STORE_FREQ$/$VAR_NAME$_$START_DATE$.nc
+DEFAULT_NC_VAR_NAME = $VAR_NAME$
+DEFAULT_SUFFIX =
+DEFAULT_VAR_MIN =
+DEFAULT_VAR_MAX =
+DEFAULT_OBS_MAIN_PATH = $OBS_NAME$
+DEFAULT_OBS_FILE_PATH = $STORE_FREQ$/$VAR_NAME$_$YEAR$$MONTH$.nc
+DEFAULT_DIM_NAME_LONGITUDES = longitude
+DEFAULT_DIM_NAME_LATITUDES = latitude
+DEFAULT_DIM_NAME_MEMBERS = ensemble
+
+
+######################
+!!table of experiments
+######################
+#exp_name, var_name[, exp_main_path[, exp_file_path[, nc_var_name[, suffix[, var_min[, var_max]]]]]]
+
+#######################
+!!table of observations
+#######################
+#obs_name, var_name[, obs_main_path[, obs_file_path[, nc_var_name[, suffix[, var_min[, var_max]]]]]]
+```
+The lines starting with '#' are comments and the ones starting with '!' are
+key lines that can't be moved or removed.
+
+As just shown, the configuration file consists of a list of definitions and of
+two tables, one to keep information on experimental datasets and the other
+for observational datasets.
+
+Each row or entry of the tables holds information relative to one dataset.
+An entry associates a pair of dataset name and variable name with a set of
+information pieces.
+Each entry has 8 columns, the pair of keys and 6 information pieces
+associated with a dataset: the dataset short name or identifier, the variable
+short name, the main path or path to the dataset main folder, the file path or
+path to the NetCDF files of the dataset, the actual variable name inside the
+files, a suffix (read further below) and lower and upper thresholds to disable
+values of the dataset beyond them.
+The list of definitions contains, initially, the default values for the
+columns of the tables.
+It can be extended with other definitions that can be accessed
+from anywhere in the configuration file, which is useful to avoid repeating
+certain paths, for example, in multiple entries. A definition follows the
+format `VARIABLE = value` and its value can be invoked with `$VARIABLE$`.
+Apart from these explicit definitions, there are some implicit key definitions:
+ - `$EXP_NAME$` to retrieve, from an experimental dataset entry, the
+identifier name of an experimental dataset, as requested to `Load()`.
+ - `$OBS_NAME$`, similar to `$EXP_NAME$` but for observational dataset
+entries.
+ - `$VAR_NAME$`, the short name of the variable, as requested to `Load()`.
+ - `$START_DATE$`, one of the starting dates, as requested to `Load()`.
+ - `$YEAR$`, `$MONTH$`, `$DAY$`, taken from the `$START_DATE$`.
+ - `$MEMBER_NUMBER$`, the number of the member in the loading process, from 0
+to total_number_of_members - 1 and with padding zeros.
+ - `$STORE_FREQ$`, the frequency of the experimental data on disk, as
+specified to `Load()` via `storefreq`.
+
+The next step is to add the entries of the datasets of interest. It can be
+achieved with a combination of the functions `ConfigFileOpen()`,
+`ConfigAddEntry()` and `ConfigFileSave()`:
+```r
+c <- ConfigFileOpen(conf_path)
+c <- ConfigAddEntry(c, 'experiments', dataset_name = 'expA', var_name = 'tas',
+                    main_path = '/path/to/experiments/$EXP_NAME$/',
+                    file_path = 'monthly_mean/$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc')
+c <- ConfigAddEntry(c, 'experiments', dataset_name = 'expB', var_name = 'tas',
+                    main_path = '/path/to/experiments/$EXP_NAME$/',
+                    file_path = 'monthly_mean/$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc')
+c <- ConfigAddEntry(c, 'observations', dataset_name = 'obsX', var_name = 'tas',
+                    main_path = '/path/to/observations/$OBS_NAME$/',
+                    file_path = 'monthly_mean/$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc')
+ConfigFileSave(c, conf_path)
+```
+
+At this point, the tables in the file look as follows:
+```
+######################
+!!table of experiments
+######################
+#exp_name, var_name[, exp_main_path[, exp_file_path[, nc_var_name[, suffix[, var_min[, var_max]]]]]]
+expA, tas, /path/to/experiments/$EXP_NAME$/, monthly_mean/$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc, *, *, *, *
+expB, tas, /path/to/experiments/$EXP_NAME$/, monthly_mean/$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc, *, *, *, *
+
+#######################
+!!table of observations
+#######################
+#obs_name, var_name[, obs_main_path[, obs_file_path[, nc_var_name[, suffix[, var_min[, var_max]]]]]]
+obsX, tas, /path/to/observations/$OBS_NAME$/, monthly_mean/$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc, *, *, *, *
+```
+
+An asterisk ('*') means that the cell takes the default value defined
+for the corresponding column. So the column 'nc_var_name' takes
+'$VAR_NAME$', which is then replaced by 'tas' in this case. The other default
+values are all empty.
+
+Once the file is ready, it can be used from `Load()` by specifying its path via
+the parameter `configfile`. Then, `exp` and `obs` can be used in their simple
+format: a vector of character strings with the names of the datasets of
+interest. For example:
+```r
+sdates <- paste0(1991:2000, '1101')
+data <- Load(var = 'tas', exp = c('expA', 'expB'), obs = c('obsX'),
+             sdates = sdates, configfile = '/path/to/configfile.conf')
+```
+
+From this point on, one could create files with entries for each pair of
+(dataset name, variable name) of interest.
+
+However, the first two columns of the tables in a configuration file can also
+take regular expressions to match more than one dataset name or variable name.
+This way, a more compact and generalized table can be obtained:
+```r
+c <- ConfigFileOpen(conf_path)
+c <- ConfigAddEntry(c, 'experiments', dataset_name = '.*', var_name = '.*',
+                    main_path = '/path/to/experiments/$EXP_NAME$/',
+                    file_path = 'monthly_mean/$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc')
+c <- ConfigAddEntry(c, 'observations', dataset_name = '.*', var_name = '.*',
+                    main_path = '/path/to/observations/$OBS_NAME$/',
+                    file_path = 'monthly_mean/$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc')
+ConfigFileSave(c, conf_path)
+```
+
+The corresponding tables are:
+```
+######################
+!!table of experiments
+######################
+#exp_name, var_name[, exp_main_path[, exp_file_path[, nc_var_name[, suffix[, var_min[, var_max]]]]]]
+.*, .*, /path/to/experiments/$EXP_NAME$/, monthly_mean/$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc, *, *, *, *
+
+#######################
+!!table of observations
+#######################
+#obs_name, var_name[, obs_main_path[, obs_file_path[, nc_var_name[, suffix[, var_min[, var_max]]]]]]
+.*, .*, /path/to/observations/$OBS_NAME$/, monthly_mean/$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc, *, *, *, *
+```
+
+The regular expression '.*' matches any sequence of characters, so these
+entries will assign to all the (experiment name, variable name) pairs a path
+with the pattern
+`/path/to/experiments/$EXP_NAME$/monthly_mean/$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc`
+and, to all the (observation name, variable name) pairs, a path with the
+pattern
+`/path/to/observations/$OBS_NAME$/monthly_mean/$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc`.
+In other words, with only two entries in the configuration file you will be
+able to load many variables from many datasets, as long as they follow a
+common convention.
+
+If more than one entry matches a pair (dataset name, variable name), all of
+them will be applied from more general to more specific to obtain a value
+for each of the 6 information pieces that, in the end, describe the location
+and details of the requested variable for the requested dataset.
+
+The configuration file, all in all, is a complex but powerful mechanism.
+Read `?ConfigFileOpen` for a complete explanation of the details of the tables
+in the configuration files and how multiple matches are applied to build up the
+final information. Also see there why the 'suffix' column can be useful.
+
+#### Re-arranging long runs
+Sometimes it can happen that one needs to compare data from two experimental
+datasets, one stored in multiple files (one per starting date) and the other
+stored in a single long run (one file with one starting date).
+
+In this case, running one single `Load()` call won't be enough, since the
+`sdates` argument has to take different values depending on the distribution
+of the starting dates of the dataset to load. Besides, running two separate
+`Load()` calls with different starting dates yields two non-comparable data
+sets.
+
+To solve this, it is possible to re-arrange the long run into multiple starting
+dates with `Histo2Hindcast()` by just specifying the original starting date,
+the series of target starting dates and the number of target lead-times:
+
+Loading the multiple-run experiment:
+```r
+expA <- list(
+    name = 'experimentA',
+    path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                     '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc')
+  )
+sdatesA <- paste0(1991:2000, '1101')
+leadtimesA <- 12
+data <- Load(var = 'tas', exp = list(expA), obs = NULL, sdates = sdatesA,
+             leadtimemax = leadtimesA)
+```
+
+Loading the single-run experiment:
+```r
+expC <- list(
+    name = 'experimentC',
+    path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                     '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc')
+  )
+sdatesC <- '19501101'
+# Lead-times covering November 1991 to October 2001 in the 1950 run.
+leadtimemin <- (1991 - 1950) * 12 + 1
+leadtimemax <- leadtimemin + (2000 - 1991 + 1) * 12 - 1
+data <- Load(var = 'tas', exp = list(expC), obs = NULL, sdates = sdatesC,
+             leadtimemin = leadtimemin, leadtimemax = leadtimemax)
+data <- Histo2Hindcast(data, sdatesA[1], sdatesA, leadtimesA)
+```
diff --git a/vignettes/ex_ano_expA_obsX.png b/vignettes/ex_ano_expA_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..cbb43565f8d978f0b0957822422226f306005a74
Binary files /dev/null and b/vignettes/ex_ano_expA_obsX.png differ
diff --git a/vignettes/ex_ano_expB_obsX.png b/vignettes/ex_ano_expB_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..1726f6708be37f53a759bbaf3f0bb848212d2b29
Binary files /dev/null and b/vignettes/ex_ano_expB_obsX.png differ
diff --git a/vignettes/ex_clim_expA_expB_obsX.png b/vignettes/ex_clim_expA_expB_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..2cf4157d566399f02ba5c7cfbd76dc6cd34f2768
Binary files /dev/null and b/vignettes/ex_clim_expA_expB_obsX.png differ
diff --git a/vignettes/ex_corr_expA_expB_obsX.png b/vignettes/ex_corr_expA_expB_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..c5f8ce9c848acc3478cd1dcf83c9bea791837180
Binary files /dev/null and b/vignettes/ex_corr_expA_expB_obsX.png differ
diff --git a/vignettes/ex_raw_expA_obsX.png b/vignettes/ex_raw_expA_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..801abb0741d730960143b2b85bbb0483838c9aa0
Binary files /dev/null and b/vignettes/ex_raw_expA_obsX.png differ
diff --git a/vignettes/ex_raw_expB_obsX.png b/vignettes/ex_raw_expB_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..d8fe409d7e21f5b5d9b9d9398fc795e7b91abe65
Binary files /dev/null and b/vignettes/ex_raw_expB_obsX.png differ
diff --git a/vignettes/example.md b/vignettes/example.md
new file mode 100644
index 0000000000000000000000000000000000000000..90fbe53a9bb637058cc19b88393772c768a0d586
--- /dev/null
+++ b/vignettes/example.md
@@ -0,0 +1,262 @@
+---
+author: "Nicolau Manubens"
+date: "`r Sys.Date()`"
+output: html_document
+vignette: >
+  %\VignetteEngine{R.rsp::md}
+  %\VignetteIndexEntry{Example}
+  %\usepackage[utf8]{inputenc}
+---
+Example
+=======
+
+Next you can see an example of usage of s2dverification spanning its
+four modules ([**Data retrieval**](data_retrieval.md),
+[**Statistics**](statistics.md), [**Verification**](verification.md)
+and [**Visualisation**](visualisation.md)).
+
+The goal of the example is to load and analyse two sample forecasts of the
+near-surface air temperature over the North Pacific region and to assess
+their skill against a reference reanalysis.
+The data used in this example has already gone through a first post-processing
+stage in which the monthly means have been computed.
+
+Loading data
+------------
+
+First the package is loaded and attached.
+
+The goal is to load data for two experimental datasets and one observational
+dataset to be used as reference. The distribution in the file system and
+details of these datasets are explained in
+[**Data retrieval**](data_retrieval.md).
+
+A list is built with information on the location of each dataset to load.
+A path pattern is specified for each dataset, in this case, with the wildcards
+'$VAR_NAME$', '$START_DATE$', '$YEAR$', '$MONTH$', '$EXP_NAME$' and
+'$OBS_NAME$'. These wildcards will be automatically replaced by `Load()` with
+the corresponding values (the requested variable name, the requested starting
+dates, years or months, the dataset names, ...). See the available wildcards
+in `?Load`.
+
+```r
+library(s2dverification)
+
+expA <- list(name = 'experimentA',
+             path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                              '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc'))
+expB <- list(name = 'experimentB',
+             path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                              '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc'))
+obsX <- list(name = 'observationX',
+             path = file.path('/path/to/observations/$OBS_NAME$/monthly_mean',
+                              '$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc'))
+```
+
+Then the data is loaded with `Load()`, providing the previously built lists
+(to specify the desired datasets) and other parameters to select the Earth
+surface region of interest, starting dates and forecast time steps to load
+data from.
+In this example, the requested format is 2-dimensional: all the loaded data
+will be remapped onto the common grid specified in `grid` via CDO.
+
+```r
+sdates <- paste0(1991:2000, '1101')
+data <- Load('tas', list(expA, expB), list(obsX),
+             sdates = sdates,
+             leadtimemin = 2, leadtimemax = 13,
+             latmin = -10, latmax = 60,
+             lonmin = 100, lonmax = 250,
+             output = 'lonlat', grid = 't106grid',
+             method = 'distance-weighted')
+## * The load call you issued is:
+## *   Load(var = "tas", exp = list(structure(list(name = "experimentA", path =
+## *   "/path/to/experiments/$EXP_NAME$/monthly_mean/$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc"),
+## *   .Names = c("name", "path")), structure(list(name =
+## *   "experimentB", path =
+## *   "/path/to/experiments/$EXP_NAME$/monthly_mean/$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc"),
+## *   .Names = c("name", "path"))), obs = list(structure(list(name =
+## *   "observationX", path =
+## *   "/path/to/observations/$OBS_NAME$/monthly_mean/$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc"),
+## *   .Names = c("name", "path"))), sdates = c("19911101",
+## *   "19921101", ..., "20001101"), grid = "t106grid", output =
+## *   "lonlat", storefreq = "monthly", ...)
+## * See the full call in '$load_parameters' after Load() finishes.
+## * Fetching first experimental files to work out 'var_exp' size...
+## * Exploring dimensions... /path/to/experiments/experimentA/monthly_mean/tas/tas_19911101.nc
+## * Success. Detected dimensions of experimental data: 2, 5, 10, 12, 63, 134
+## * Fetching first observational files to work out 'var_obs' size...
+## * Exploring dimensions... /path/to/observations/observationX/monthly_mean/tas/tas_199112.nc
+## * Success. Detected dimensions of observational data: 1, 1, 10, 12, 63, 134
+## * Will now proceed to read and process 140 data files:
+## *   The list of files is long. You can check it after Load() finishes in the output '$source_files'.
+## * Total size of requested data: 89147520 bytes.
+## *   - Experimental data: ( 2 x 5 x 10 x 12 x 63 x 134 ) x 8 bytes = 81043200 bytes.
+## *   - Observational data: ( 1 x 1 x 10 x 12 x 63 x 134 ) x 8 bytes = 8104320 bytes.
+## * If size of requested data is close to or above the free shared RAM memory, R will crash.
+## starting worker pid=3674 on localhost:11060 at 15:11:26.226
+## starting worker pid=3690 on localhost:11060 at 15:11:26.419
+## starting worker pid=3706 on localhost:11060 at 15:11:26.614
+## starting worker pid=3722 on localhost:11060 at 15:11:26.805
+## starting worker pid=3739 on localhost:11060 at 15:11:26.997
+## starting worker pid=3755 on localhost:11060 at 15:11:27.190
+## starting worker pid=3771 on localhost:11060 at 15:11:27.380
+## starting worker pid=3787 on localhost:11060 at 15:11:27.572
+## * Loading... This may take several minutes...
+## * Progress: 0% + 10% + 10% + 10% + 10% + 10% + 10% + 10% + 10% + 10% + 10%
+```
+
+The output consists of two arrays of data (experimental and observational
+data) with labelled dimensions, a list of loaded files, a list of not found
+files and a call stamp to reproduce the call exactly as needed, among others.
+See [**Data retrieval**](data_retrieval.md)
+for a full explanation of the capabilities and outputs of `Load()`.
+
+Keep in mind that the two arrays obtained from `Load()` will have the
+following dimensions in the following order:
+```r
+  c(n. of datasets, n. of members, n. of starting dates, n. of lead-times, n. of latitudes, n. of longitudes)
+```
+
+Checking and summarizing data
+-----------------------------
+
+Next, the data of the first member, starting date and lead-time of each
+dataset can be visualised on a cylindrical equidistant projection with
+`PlotEquiMap()` to check that the loaded data is as expected.
+```r
+PlotEquiMap(data$mod[1, 1, 1, 1, , ], data$lon, data$lat)
+PlotEquiMap(data$mod[2, 1, 1, 1, , ], data$lon, data$lat)
+PlotEquiMap(data$obs[1, 1, 1, 1, , ], data$lon, data$lat)
+```
+
+<img src="snip1_equi_map_raw_all.png" />
+
+See the full code used to obtain this figure in
+[**Snippet 1**](snippets.md#snippet1).
+
+The values for some starting dates at a single grid point of an experimental
+dataset can be plotted together with the observations with the function
+`PlotAno()`:
+```r
+mod <- data$mod[, , , , 30, 60, drop = FALSE]
+dim(mod) <- dim(mod)[1:4]
+obs <- data$obs[, , , , 30, 60, drop = FALSE]
+dim(obs) <- dim(obs)[1:4]
+PlotAno(mod, obs, gsub('1101', '1201', sdates),
+        toptitle = paste0("Experiment ", c("A", "B"),
+                          ": raw 'tas' at 166.5°E, 27.47°N."),
+        ytitle = "K",
+        fileout = c('ex_raw_expA_obsX.eps', 'ex_raw_expB_obsX.eps'))
+```
+
+<img src="ex_raw_expA_obsX.png" />
+<img src="ex_raw_expB_obsX.png" />
+
+Each coloured region represents data corresponding to a single starting date.
+The bold line represents the mean value and the thin lines represent the
+values of each ensemble member. The upper and lower limits of each region are
+defined, respectively, by the maximum and minimum values among the members.
+The black line represents the reference reanalysis values.
+
+We can see in the plot how 'tas' reaches a minimum in February/March.
+
+We can work out the exact coordinates of the selected grid cell as follows:
+```r
+data$lon[60]
+## 166.5
+data$lat[30]
+## 27.47649
+```
+
+At this point, additional numerical checks could be carried out to ensure the
+data retrieval stage has been successful.
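+For instance, one could check value ranges and the amount of missing data.
+This is just a sketch of such checks, not part of the original example:
+```r
+# 'tas' is in K, so values should lie in a plausible range, and the
+# fraction of NA values should be small.
+range(data$mod, na.rm = TRUE)
+range(data$obs, na.rm = TRUE)
+mean(is.na(data$mod))
+mean(is.na(data$obs))
+```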
+
+Bias correction
+---------------
+
+The next common step is to perform statistics on the raw data, such as
+computing climatologies and subtracting them from the raw data to obtain
+anomalies that account for the inherent drift of the forecasts.
+This is straightforward with the functions `Clim()` and `Ano()` (or
+`Ano_CrossValid()`, see [**Statistics**](statistics.md)). Before, however, it
+is usual to average the data over the region of interest instead of focusing
+on a single grid cell. This can be done by directly requesting the data to
+`Load()` with the 'areave' output type, which will apply area weights and
+calculate the total average:
+```r
+data <- Load('tas', list(expA, expB), list(obsX),
+             sdates = sdates,
+             leadtimemin = 2, leadtimemax = 13,
+             latmin = -10, latmax = 60,
+             lonmin = 100, lonmax = 250,
+             output = 'areave', grid = 't106grid',
+             method = 'distance-weighted')
+```
+
+Then we calculate and plot the per-pair climatologies with `Clim()` and
+`PlotClim()`:
+```r
+clim <- Clim(data$mod, data$obs)
+PlotClim(clim$clim_exp, clim$clim_obs, monini = 12,
+         toptitle = "Per-pair climatologies of 'tas' over North Pacific region.",
+         listexp = c('Experiment A', 'Experiment B'),
+         listobs = c('Observation X'),
+         ytitle = "K", fileout = 'ex_clim_expA_expB_obsX.eps')
+```
+
+<img src="ex_clim_expA_expB_obsX.png" />
+
+Each line in this plot represents the climatology of each member of the
+corresponding dataset. A single climatology of the ensemble mean could be
+obtained by providing the parameter `memb = FALSE` to `Clim()`, if the
+distinction of members is irrelevant.
+
+We can obtain the anomalies with `Ano()`, which simply subtracts the
+climatologies from the corresponding starting dates and members of the raw
+data. These can be plotted with `PlotAno()`:
+```r
+ano_mod <- Ano(data$mod, clim$clim_exp)
+ano_obs <- Ano(data$obs, clim$clim_obs)
+PlotAno(ano_mod, ano_obs, gsub('1101', '1201', sdates),
+        toptitle = paste0("Experiment ", c("A", "B"),
+                          ": drift-corrected 'tas' anomalies over North Pacific region."),
+        ytitle = 'K', linezero = TRUE,
+        fileout = c('ex_ano_expA_obsX.eps', 'ex_ano_expB_obsX.eps'))
+```
+
+<img src="ex_ano_expA_obsX.png" />
+<img src="ex_ano_expB_obsX.png" />
+
+To fulfill the bias correction we would need to add the observed climatologies
+to these anomalies. The working units of the package, however, are the
+anomalies, so this step is not needed.
+
+Other statistics such as trends, detrended anomalies, frequency filtering,
+EOF/PCA and empirical model generation are implemented in the package.
+Check [**Statistics**](statistics.md) for a thorough explanation.
+Also see [**Visualisation**](visualisation.md) for details on visualisation
+tools.
+
+Assessing the quality
+---------------------
+
+As a measure of the skill of the two forecasts we will compute their
+correlation against the reanalysis.
+
+First we compute the ensemble mean, then calculate the Pearson correlation
+with `Corr()` and plot the results with `PlotVsLTime()`, which can plot any
+computed variable together with confidence intervals and significance values.
+```r
+corr <- Corr(Mean1Dim(ano_mod, 2), Mean1Dim(ano_obs, 2))
+PlotVsLTime(corr, toptitle = "Correlations with Observation X over North Pacific region.",
+            ytitle = 'corr', monini = 12,
+            listexp = c('Exp A', 'Exp B'),
+            fileout = 'ex_corr_expA_expB_obsX.eps')
+```
+
+<img src="ex_corr_expA_expB_obsX.png" />
+
+See [**Verification**](verification.md) for a detailed explanation of the
+available deterministic and probabilistic scores or
+[**Plotting time series**](visualisation.md#time_series) for quick reference
+of how each score can be plotted.
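+
+Other deterministic scores follow the same pattern. As a sketch (assuming the
+anomalies computed above are available), the root mean square error could be
+obtained with `RMS()`:
+```r
+# RMS() takes the same kind of inputs as Corr(); see ?RMS for the exact
+# structure of its output.
+rmse <- RMS(Mean1Dim(ano_mod, 2), Mean1Dim(ano_obs, 2))
+```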
diff --git a/vignettes/s2dv_modules.png b/vignettes/s2dv_modules.png
new file mode 100644
index 0000000000000000000000000000000000000000..35465c8e0985fe4f08cf7dd0c578de0bd3e8fe04
Binary files /dev/null and b/vignettes/s2dv_modules.png differ
diff --git a/vignettes/snip1_equi_map_raw_all.png b/vignettes/snip1_equi_map_raw_all.png
new file mode 100644
index 0000000000000000000000000000000000000000..06e3c5ae4dcf10d37f080e80e5f5922fea06ea5a
Binary files /dev/null and b/vignettes/snip1_equi_map_raw_all.png differ
diff --git a/vignettes/snip2_anim_corr_expA_obsX.gif b/vignettes/snip2_anim_corr_expA_obsX.gif
new file mode 100644
index 0000000000000000000000000000000000000000..cdf646c43fe1bd86b0a2d58fdeddeddbdb2d4ef2
Binary files /dev/null and b/vignettes/snip2_anim_corr_expA_obsX.gif differ
diff --git a/vignettes/snip2_anim_corr_expB_obsX.gif b/vignettes/snip2_anim_corr_expB_obsX.gif
new file mode 100644
index 0000000000000000000000000000000000000000..40f6d1c2203edd2d19be336c16472d8205e6ad36
Binary files /dev/null and b/vignettes/snip2_anim_corr_expB_obsX.gif differ
diff --git a/vignettes/snip2_equimap_corr_raw_expA_obsX.png b/vignettes/snip2_equimap_corr_raw_expA_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..edfbb9ee61d024caa3995a49a953ae2c4f9d2b86
Binary files /dev/null and b/vignettes/snip2_equimap_corr_raw_expA_obsX.png differ
diff --git a/vignettes/snip2_equimap_corr_raw_expB_obsX.png b/vignettes/snip2_equimap_corr_raw_expB_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..ecfeb0a5b3f58d820cda25359125aa1725576473
Binary files /dev/null and b/vignettes/snip2_equimap_corr_raw_expB_obsX.png differ
diff --git a/vignettes/snippets.md b/vignettes/snippets.md
new file mode 100644
index 0000000000000000000000000000000000000000..239ec6b4b61c09e91e4cd221cfea64be2ad89722
--- /dev/null
+++ b/vignettes/snippets.md
@@ -0,0 +1,117 @@
+---
+author: "Nicolau Manubens"
+date: "`r Sys.Date()`"
+output: html_document
+vignette: >
+  %\VignetteEngine{R.rsp::md}
+  %\VignetteIndexEntry{Snippets}
+  %\usepackage[utf8]{inputenc}
+---
+Snippets
+========
+This page contains extended snippets of code used in some sections of these
+pages to obtain figures or results.
+
+The example data used is the same used in [**Example**](example.md) and
+throughout the vignettes referenced in [**Overview**](../README.md):
+```r
+library(s2dverification)
+
+expA <- list(name = 'experimentA',
+             path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                              '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc'))
+expB <- list(name = 'experimentB',
+             path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                              '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc'))
+obsX <- list(name = 'observationX',
+             path = file.path('/path/to/observations/$OBS_NAME$/monthly_mean',
+                              '$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc'))
+```
+
+Snippet 1
+---------
+Plots 3 cylindrical equidistant projection maps on a single figure with a
+single color bar.
+See in [context](example.md#snippet1).
+```r
+sdates <- paste0(1991:2000, '1101')
+data <- Load('tas', list(expA, expB), list(obsX),
+             sdates = sdates,
+             leadtimemin = 2, leadtimemax = 13,
+             latmin = -10, latmax = 60,
+             lonmin = 100, lonmax = 250,
+             output = 'lonlat', grid = 't106grid',
+             method = 'distance-weighted')
+brks <- seq(min(data$mod, data$obs, na.rm = TRUE),
+            max(data$mod, data$obs, na.rm = TRUE),
+            length.out = 21)
+cols <- clim.colors(20)
+png('snip1_equi_map_raw_all.png', width = 800, height = 300, type = "Xlib")
+layout(matrix(c(1, 1, 2, 2, 3, 3,
+                0, 4, 4, 4, 4, 0),
+              2, 6, byrow = TRUE),
+       heights = c(5, 1))
+PlotEquiMap(data$mod[1, 1, 1, 1, , ], data$lon, data$lat, brks = brks,
+            cols = cols, drawleg = FALSE,
+            toptitle = 'Experiment A: tas (K)', sizetit = 0.7)
+PlotEquiMap(data$mod[2, 1, 1, 1, , ], data$lon, data$lat, brks = brks,
+            cols = cols, drawleg = FALSE,
+            toptitle = 'Experiment B: tas (K)', sizetit = 0.7)
+PlotEquiMap(data$obs[1, 1, 1, 1, , ], data$lon, data$lat, brks = brks,
+            cols = cols, drawleg = FALSE,
+            toptitle = 'Observation X: tas (K)', sizetit = 0.7)
+ColorBar(brks, cols, vert = FALSE, subsampleg = 5)
+dev.off()
+```
+
+<img src="snip1_equi_map_raw_all.png" />
+
+Snippet 2
+---------
+Plots time correlations (along actual time) of Decembers of Experiment A and
+Experiment B at each grid point over the Atlantic. The black dots mean the
+correlation at that point exceeds the 95% significance level. See in
+[context](visualisation.md#snippet2).
+```r
+map_data <- Load('tas', list(expA, expB), list(obsX),
+                 sdates = sdates,
+                 leadtimemin = 2, leadtimemax = 13,
+                 latmin = -30, latmax = 60,
+                 lonmin = -90, lonmax = 20,
+                 output = 'lonlat', grid = 't106grid',
+                 method = 'distance-weighted')
+ano <- Ano_CrossValid(map_data$mod, map_data$obs)
+corr <- Corr(Mean1Dim(ano$ano_exp, 2), Mean1Dim(ano$ano_obs, 2))
+
+cols <- clim.colors(50)
+corr_min <- min(corr, na.rm = TRUE)
+corr_max <- max(corr, na.rm = TRUE)
+corr_brks <- round(seq(corr_min, corr_max, length.out = length(cols) + 1), 2)
+
+png('snip2_equimap_corr_raw_expA_obsX.png', width = 800, height = 600)
+PlotEquiMap(corr[1, 1, 2, 1, , ], map_data$lon, map_data$lat,
+            toptitle = "Exp. A: 'tas' correlation along Decembers with Obs. X.",
+            units = "K", brks = corr_brks, cols = cols, subsampleg = 5,
+            dots = t(corr[1, 1, 2, 1, , ] > corr[1, 1, 4, 1, , ]))
+dev.off()
+
+png('snip2_equimap_corr_raw_expB_obsX.png', width = 800, height = 600)
+PlotEquiMap(corr[2, 1, 2, 1, , ], map_data$lon, map_data$lat,
+            toptitle = "Exp. B: 'tas' correlation along Decembers with Obs. X.",
+            units = "K", brks = corr_brks, cols = cols, subsampleg = 5,
+            dots = t(corr[2, 1, 2, 1, , ] > corr[2, 1, 4, 1, , ]))
+dev.off()
+```
+
+<img src="snip2_equimap_corr_raw_expA_obsX.png" />
+<img src="snip2_equimap_corr_raw_expB_obsX.png" />
+
+The following code generates the animations of the actual time correlations
+of Experiments A and B against Observation X over the Atlantic, with black
+dots on values that reach a 95% significance level:
+
+```r
+AnimVsLTime(corr, map_data$lon, map_data$lat, monini = 12,
+            msk95lev = TRUE,
+            toptitle = c("Exp. A: actual time correlation with Obs. X.",
+                         "Exp. B: actual time correlation with Obs. X."),
+            units = "K",
+            brk = corr_brks, col = cols, subsampleg = 5,
+            fileout = c("snip2_anim_corr_expA_obsX",
+                        "snip2_anim_corr_expB_obsX"))
+```
+
+<img src="snip2_anim_corr_expA_obsX.gif" />
+<img src="snip2_anim_corr_expB_obsX.gif" />
diff --git a/vignettes/stat_ano_expA_Y_obsX.png b/vignettes/stat_ano_expA_Y_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..2450ad7fb1e3da92db01d423f3fa266d6c0405c3
Binary files /dev/null and b/vignettes/stat_ano_expA_Y_obsX.png differ
diff --git a/vignettes/stat_ano_expA_obsX.png b/vignettes/stat_ano_expA_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..3d37c05269ab1b0311e308a2cdd5858ff7a9f88a
Binary files /dev/null and b/vignettes/stat_ano_expA_obsX.png differ
diff --git a/vignettes/stat_ano_expB_obsX.png b/vignettes/stat_ano_expB_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..de4abb19af655e29c0c1b3189abe90f620fdccb0
Binary files /dev/null and b/vignettes/stat_ano_expB_obsX.png differ
diff --git a/vignettes/stat_clim_expA_expB_obsX.png b/vignettes/stat_clim_expA_expB_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..b7a90d8b4495085719904b30982fa5deae95397d
Binary files /dev/null and b/vignettes/stat_clim_expA_expB_obsX.png differ
diff --git a/vignettes/stat_detr_ano_expA_obsX.png b/vignettes/stat_detr_ano_expA_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..576aaee5d8f9e853786346f9a99d154b18f26335
Binary files /dev/null and b/vignettes/stat_detr_ano_expA_obsX.png differ
diff --git a/vignettes/stat_filter_ano_expA.png b/vignettes/stat_filter_ano_expA.png
new file mode 100644
index 0000000000000000000000000000000000000000..21e57909725fe599e7a3974c47cd771cee0099e9
Binary files /dev/null and b/vignettes/stat_filter_ano_expA.png differ
diff --git a/vignettes/stat_raw_expA_obsX.png b/vignettes/stat_raw_expA_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..6cd4ac41397daf51a628a6457054d147452d0730
Binary files /dev/null and b/vignettes/stat_raw_expA_obsX.png differ
diff --git a/vignettes/stat_raw_expB_obsX.png b/vignettes/stat_raw_expB_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..9a7914496c73debd8506f62ce5630c7eb43c997f
Binary files /dev/null and b/vignettes/stat_raw_expB_obsX.png differ
diff --git a/vignettes/stat_season_mam_expA.png b/vignettes/stat_season_mam_expA.png
new file mode 100644
index 0000000000000000000000000000000000000000..af20a8851a81fd17deaba125d852d05c15868035
Binary files /dev/null and b/vignettes/stat_season_mam_expA.png differ
diff --git a/vignettes/stat_season_mam_obsX.png b/vignettes/stat_season_mam_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..9a7acc9cd09d6da05953d9e2c0efd2d5cd437373
Binary files /dev/null and b/vignettes/stat_season_mam_obsX.png differ
diff --git a/vignettes/stat_smooth_ano_expA_obsX.png b/vignettes/stat_smooth_ano_expA_obsX.png
new file mode 100644
index 0000000000000000000000000000000000000000..86f150f5c7f9b77c397dd33c6292b955537abe1f
Binary files /dev/null and b/vignettes/stat_smooth_ano_expA_obsX.png differ
diff --git a/vignettes/stat_toy_forecast_ano.png b/vignettes/stat_toy_forecast_ano.png
new file mode 100644
index 0000000000000000000000000000000000000000..eba3f1a7eff45c71be43a78bfd14aad88aa33ed7
Binary files /dev/null and b/vignettes/stat_toy_forecast_ano.png differ
diff --git a/vignettes/stat_trend_expA_expB.png b/vignettes/stat_trend_expA_expB.png
new file mode 100644
index 0000000000000000000000000000000000000000..3e86f459f55c559ff9a7ee398ec840b6841f347d
Binary files /dev/null and b/vignettes/stat_trend_expA_expB.png differ
diff --git a/vignettes/statistics.md b/vignettes/statistics.md
new file mode 100644
index 0000000000000000000000000000000000000000..861d4d364b9d82485fb0cf645ae2a6dd11d7f0a9
--- /dev/null
+++ b/vignettes/statistics.md
@@ -0,0 +1,558 @@
+---
+author: "Nicolau Manubens"
+date: "`r Sys.Date()`"
+output: html_document
+vignette: >
+  %\VignetteEngine{R.rsp::md}
+  %\VignetteIndexEntry{Statistics}
+  %\usepackage[utf8]{inputenc}
+---
+Statistics
+==========
+The **Statistics** module consists of functions that are commonly used in the
+forecast verification process to modify, describe or generate **fields**, a
+field being a series of modelled or observed measurements of a certain
+physical variable of interest.
+
+These functions can be classified into 5 groups according to their purpose:
+
+ - [**Normalizing fields**](#normalizing): `Clim()`, `Ano()` and
+`Ano_CrossValid()`.
+ - [**Describing fields**](#describing): `Trend()`, `Consist_Trend()`,
+`Regression()`, `FitAcfCoef()`, `Alpha()`, `FitAutocor()`, `Spectrum()`,
+`Eno()` and `EnoNew()`.
+ - [**Filtering fields**](#filtering): `Trend()`, `Consist_Trend()`,
+`Regression()`, `Smoothing()` and `Filter()`.
+ - [**Generating derivative fields**](#generating_der): `Season()`,
+`StatSeasAtlHurr()` and `ProbBins()`.
+ - [**Generating synthetic fields**](#generating_synth): `GenSeries()` and
+`Toymodel()`.
+
+To master these functions it is important to keep in mind the concepts
+introduced in **Data retrieval**, as well as the structure and meaning of the
+dimensions of the data arrays provided by the **Data retrieval** module and
+used throughout s2dverification:
+```r
+  c(n. of datasets, n. of members, n. of starting dates, n. of lead-times, n. of latitudes, n. of longitudes)
+```
+the last two being present depending on the parameter `output`. Review
+[**Data retrieval**](data_retrieval.md) to settle these concepts.
+
+The explanations in the following sections will be based on the same data used
+in **Data retrieval**. See [**Data retrieval**](data_retrieval.md) for details.
+```r
+library(s2dverification)
+expA <- list(name = 'experimentA',
+             path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                              '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc'))
+expB <- list(name = 'experimentB',
+             path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean',
+                              '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc'))
+obsX <- list(name = 'observationX',
+             path = file.path('/path/to/observations/$OBS_NAME$/monthly_mean',
+                              '$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc'))
+sdates <- paste0(1991:2000, '1101')
+data <- Load('tas', list(expA, expB), list(obsX),
+             sdates = sdates,
+             leadtimemin = 2, leadtimemax = 13,
+             latmin = -10, latmax = 60,
+             lonmin = 100, lonmax = 250,
+             output = 'areave', grid = 't106grid',
+             method = 'distance-weighted')
+```
+The dimensions of the data are:
+```r
+print(dim(data$mod))
+# [1]  2  5 10 12
+print(dim(data$obs))
+# [1]  1  1 10 12
+```
+
+Normalizing fields
+------------------
+Due to systematic errors in the models, their mean behaviour along the
+forecast time can differ or depart from the mean behaviour of the observed
+data over that forecast time period, a phenomenon also known as **drift**.
+It is hence a standard practice in climate forecast verification to compare
+only the deviations of the experimental and observational data from their
+mean behaviour along the forecast time.
+For this reason most of the functions in s2dverification use series of
+deviations as inputs and provide series of deviations as outputs.
+
+The mean behaviour along the forecast time of a data set, also known as the
+**climatology**, can be computed with multiple methods. The deviation from
+the climatology is known as the **anomaly** or, more explicitly, the
+drift-corrected anomaly, and it can be calculated by simply subtracting the
+climatology of a data set from its raw data.
+
+A general purpose climatology could be computed as follows (the 3rd dimension
+corresponds to the starting dates or model runs):
+```r
+clim_exp <- apply(data$mod, (1:length(dim(data$mod)))[-3], mean, na.rm = TRUE)
+```
+or with the function `Mean1Dim()` in s2dverification:
+```r
+clim_exp <- Mean1Dim(data$mod, 3)
+```
+
+s2dverification, however, builds upon the per-pair method to compute
+climatologies, which is implemented in the function `Clim()`.
+The `Clim()` parameters `kharin` and `NDV` allow selecting two other, more
+refined methods: the trend method and the initial-conditions method. All of
+them are documented in
+```
+Fučkar N, Volpi D, Guemas V, Doblas-Reyes F, 2014, A posteriori adjustment of near-term climate predictions: Accounting for the drift dependence on the initial conditions. Geophysical Research Letters, 41 (14), 5200–5207, doi:10.1002/2014GL060815
+```
+Following the per-pair method, the climatology of an experimental dataset is
+obtained, as in the general purpose climatology, by calculating an average
+across the starting dates at each forecast time, yielding a time series
+defined along the forecast times of the experiment. The climatology of an
+observational dataset is obtained by computing, for each forecast time, the
+average of the observations that date-correspond to that lead-time of each
+starting date.
+To ease this process, `Load()` arranges both experimental and
+observational data in a forecast-like layout.
+
+`Clim()` and the per-pair method only account, at each forecast time, for the
+starting dates for which data is available in all the experimental and
+observational datasets; hence both experimental and observational data must be
+provided as inputs:
+
+```r
+clim <- Clim(data$mod, data$obs)
+```
+
+The raw data and the climatologies can be visualized with `PlotAno()` and
+`PlotClim()`, respectively:
+```r
+selected_sdates <- gsub('1101', '1201', sdates)
+PlotAno(data$mod, data$obs, selected_sdates,
+        toptitle = paste0(c("Experiment A", "Experiment B"),
+                          ": Raw 'tas' over North Pacific region."),
+        ytitle = c("K", "K"),
+        fileout = paste0("stat_raw_exp", c("A", "B"), "_obsX.eps"))
+PlotClim(clim$clim_exp, clim$clim_obs, monini = 12,
+         toptitle = "Per-pair 'tas' climatologies over the North Pacific region.",
+         ytitle = "K",
+         listexp = c('Experiment A', 'Experiment B'),
+         listobs = c('Observation X'),
+         fileout = "stat_clim_expA_expB_obsX.eps")
+```
+
+<img src="stat_raw_expA_obsX.png" />
+<img src="stat_raw_expB_obsX.png" />
+
+Each coloured curve in the `PlotAno()` figures corresponds to a starting date,
+with a thin line for each ensemble member and the ensemble mean in bold. The
+coloured area is delimited by the minimum and maximum ensemble values.
+
+<img src="stat_clim_expA_expB_obsX.png" />
+
+Each curve in the `PlotClim()` figure corresponds to the climatology of a
+member of the corresponding experiment or observation.
+
+See [**Visualisation**](visualisation.md) for a complete explanation of the
+plotting functions.
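+
+As a quick sketch (sizes follow the example above), the shape of the per-pair
+climatologies can be checked directly:
+```r
+# Climatologies keep the dataset, member and lead-time dimensions of the
+# inputs, with the starting dates averaged out.
+dim(clim$clim_exp)   # 2 datasets x 5 members x 12 lead-times
+dim(clim$clim_obs)   # 1 dataset x 1 member x 12 lead-times
+```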
+
+Usually, when working with atmospheric variables, it is not relevant to make a
+distinction among members when computing climatologies, because they are
+considered to be equivalent. It is possible, after seeing they are indeed
+equivalent, to compute the ensemble mean climatology with
+```r
+exp_ens_clim <- Mean1Dim(clim$clim_exp, 2)
+obs_ens_clim <- Mean1Dim(clim$clim_obs, 2)
+```
+or, if sure a priori of their equivalence, by setting the parameter
+`memb = FALSE` in `Clim()`:
+```r
+clim <- Clim(data$mod, data$obs, memb = FALSE)
+```
+
+If working with oceanic variables this may not necessarily apply. See
+```
+Du, H., F.J. Doblas-Reyes, J. García-Serrano, V. Guemas, Y. Soufflet and B. Wouters (2012). Sensitivity of decadal predictions to the initial atmospheric and oceanic perturbations. Climate Dynamics, 39, 2013-2023, doi:10.1007/s00382-011-1285-9. http://link.springer.com/article/10.1007%2Fs00382-011-1285-9 Fig 2d is a very good example. If only one climatology is used for this variable, the skill is substantially lower.
+```
+
+Once the climatologies are computed, the anomalies can be obtained with
+`Ano()`, which subtracts the climatologies from each member, starting date,
+latitude and longitude of their corresponding dataset:
+```r
+ano_exp <- Ano(data$mod, clim$clim_exp)
+ano_obs <- Ano(data$obs, clim$clim_obs)
+```
+
+All the starting dates of an experiment contribute to the per-pair climatology
+and, when computing the anomaly of that starting date, its contribution to the
+climatology is subtracted, which can imply a loss of information [citation].
+To avoid this, the function `Ano_CrossValid()` automatically computes a
+climatology for each starting date without taking that starting date into
+account and subtracts it from the original data:
+```r
+ano <- Ano_CrossValid(data$mod, data$obs, memb = FALSE)
+```
+
+The anomalies can be plotted with `PlotAno()`:
+```r
+PlotAno(ano$ano_exp, ano$ano_obs, selected_sdates,
+        toptitle = paste0(c("Experiment A", "Experiment B"),
+                          ": c.v. 'tas' anomalies over North Pacific region."),
+        ytitle = c("K", "K"), linezero = TRUE,
+        fileout = paste0("stat_ano_exp", c("A", "B"), "_obsX.eps"))
+```
+
+<img src="stat_ano_expA_obsX.png" />
+<img src="stat_ano_expB_obsX.png" />
+
+To fulfill the bias correction of the forecasts, i.e. transforming the
+forecast data from the biased model mean state to the real observed mean
+state, the climatology of the observation has to be added to the anomalies of
+the forecasts. Since the score functions take anomalies as working units, this
+step is not needed for verification.
+
+Describing fields
+-----------------
+While the drift correction acts on series along the forecast time, the
+description and filtering functions are aimed at analysing and manipulating
+data also along the actual time (or starting dates).
+The functions to describe fields allow you to compute the trend of a field,
+its regression with another field, its frequency spectrum and its
+autocorrelation.
+
+### Linear trend and regression
+`Trend()` fits first-degree linear models by least-square fitting, taking as
+dependent variable the field given to its argument `var` and a line of slope 1
+as independent variable. A linear model is fitted for each actual time series
+of each dataset, member, lead-time, latitude and longitude in `var`. The
+coefficient and intercept of the linear model, the confidence interval relying
+on a Student's t-distribution and the detrended data are provided as output.
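+
+A sketch of the output structure (the component names are the ones used in
+the examples below):
+```r
+# Trend() returns a list: $trend holds the fitted statistics (intercept,
+# slope and their confidence limits) and $detrended the data with the
+# fitted line removed.
+trend <- Trend(Mean1Dim(ano$ano_exp, 2))
+str(trend, max.level = 1)
+```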
+
+Describing fields
+-----------------
+While the drift correction acts on series along the forecast time, the
+description and filtering functions are aimed at analysing and manipulating
+data also along the actual time (or starting dates).
+The functions to describe fields allow you to compute the trend of a field,
+its regression with another field, its frequency spectrum and its
+autocorrelation.
+
+### Linear trend and regression
+`Trend()` fits first degree linear models by least square fitting, taking as
+dependent variable the field given to its argument `var` and as independent
+variable a time index that increases by 1 at each step (a line of slope 1). A
+linear model is fitted for each actual time-series of each dataset, member,
+lead-time, latitude and longitude in `var`. The coefficient and intercept of
+the linear model, the confidence interval relying on a Student's
+t-distribution and the detrended data are provided as output.
+
+`Trend()` can be useful, for example, in global warming scenarios (frequent
+in seasonal and decadal prediction) to detect, describe and remove any
+constantly increasing or decreasing trend. The following example computes the
+trends of all the datasets at each lead-time and plots the slopes of the
+trends of the experimental datasets with `PlotVsLTime()` (see
+[**Visualisation**](visualisation.md)):
+```r
+trend_exp <- Trend(Mean1Dim(ano$ano_exp, 2))
+trend_obs <- Trend(Mean1Dim(ano$ano_obs, 2))
+PlotVsLTime(trend_exp$trend,
+            toptitle = "Slopes of trends of 'tas' over North Pacific region",
+            ytitle = "K/year",
+            monini = 12, freq = 1,
+            listexp = c('Experiment A', 'Experiment B'),
+            fileout = 'stat_trend_expA_expB.eps')
+```
+
+
+In this case the slopes of the trends are nearly zero at all lead-times. The
+raw anomalies of the experiment A and observations are plotted next, side by
+side with the detrended ones:
+```r
+PlotAno(plyr::take(ano$ano_exp, 1, 1), ano$ano_obs,
+        selected_sdates,
+        toptitle = "Experiment A: c.v. 'tas' anomalies over North Pacific region.",
+        ytitle = "K", linezero = TRUE,
+        fileout = paste0("stat_ano_expA_obsX.eps"))
+## PlotAno() requires that the members dimension is present in its inputs.
+## Since we squeezed the members dimension before computing the trend, we need
+## to put it back with InsertDim(): a dimension of length 1 at the 2nd place.
+PlotAno(InsertDim(plyr::take(trend_exp$detrended, 1, 1), 2, 1),
+        InsertDim(trend_obs$detrended, 2, 1),
+        selected_sdates,
+        toptitle = "Experiment A: detrended c.v. 'tas' anomalies over North Pacific region.",
+        ytitle = "K", linezero = TRUE,
+        fileout = paste0("stat_detr_ano_expA_obsX.eps"))
+```
+
+
+
+Since the anomaly members have been averaged to compute the trend, the
+detrended data provided by `Trend()` is also an ensemble average.
+
+The parameter `posTR` of `Trend()` allows specifying the position in `var` of
+the actual time dimension, which by default is expected to be in second place
+as it is also expected that the input field has been averaged along members
+with `Mean1Dim()`. It also allows computing the trend along dimensions other
+than the actual time; for example, the forecast time.
+The parameter `interval` allows specifying the number of time steps between
+consecutive values along the `posTR`th dimension, `1` by default.
+
+The function `Consist_Trend()` only takes into account those lead-times and
+starting dates at which observations are available (when used on
+2-dimensional fields, a lead-time or starting date will only be discarded if
+all its grid values are not available).
+
+`Regression()`, in contrast to `Trend()`, takes a second field as parameter
+and uses it as independent variable to fit the linear model. It can be useful
+to detect, describe and filter any relationship between two fields or to
+quantify the strength of that relationship.
+As an example, let's suppose the experiment A is under the effect of the
+phenomena X and Y but only the phenomenon Y is relevant for the study case. It
+may therefore be needed to remove any variability of A related to X. Let's
+suppose that the experiment B is only under the effect of the phenomenon X.
+Then the variability of A related to Y can be isolated by subtracting its
+regression with B:
+```r
+ano_expA_Y <- Regression(plyr::take(ano$ano_exp, 1, 1),
+                         plyr::take(ano$ano_exp, 1, 2), posREG = 3)
+PlotAno(ano_expA_Y$filtered, ano$ano_obs,
+        selected_sdates,
+        toptitle = "Experiment A: c.v. 'tas' anomalies over North Pacific region, X subtracted.",
+        ytitle = "K", linezero = TRUE,
+        fileout = "stat_ano_expA_Y_obsX.eps")
+```
+
+### Frequency spectrum
+The function `Spectrum()` estimates the frequency components of the input
+time series relying on the R base `spectrum()` function. In addition to the
+frequency and power of each component, it also computes the 95% and 99%
+significance levels for accepting each frequency as being a component of the
+provided series. These are estimated with a Monte-Carlo method.
+
+The spectrum of the ensemble mean of the first starting date of the experiment
+A is computed next and the frequency, power and significance levels of the
+most powerful component are printed:
+```r
+components <- Spectrum(Mean1Dim(plyr::take(ano$ano_exp, c(1, 3), c(1, 1)), 2))
+print(components[1, ])
+# [1] 0.083333333 0.001797855 0.035540044 0.043555352
+```
+
+`Spectrum()` assumes the input data is evenly spaced in time.
+
+Check [**Filtering fields**](filtering.md) for an example of filtering
+frequencies estimated with `Spectrum()`.
+
+### Autocorrelation and persistence
+Knowing whether a field is time-correlated with itself is a crucial step to
+determine whether the size of the sample is large enough to yield a robust
+validation. It is also useful to build synthetic models with the same time
+correlation behaviour as another. The best estimate of the autocorrelation at
+lag 1 is needed, for instance, to compute the effective sample size (see the
+next section) or to generate synthetic series with `GenSeries()`.
+
+`FitAutocor()` estimates the autocorrelation from a first autocorrelation
+estimate, which can be obtained with the R built-in function `acf()`, a
+window to seek in by dichotomy and a precision factor:
+```r
+estacf <- acf(ano$ano_obs[1, 1, , 1], plot = FALSE)$acf
+autocorr <- FitAutocor(estacf, c(-1, 1), 0.01)
+print(autocorr)
+# [1] 0
+```
+
+The obtained autocorrelation of 0 means that, at the requested precision, no
+correlation of the observed series with itself is detected, i.e. its values
+can be considered mutually independent.
+
+`FitAcfCoef()` computes the best estimate of the
+persistence/autocorrelation at lag 1 from the first estimates of the
+autocorrelation at lags 1 and 2, which can be obtained by providing the
+original time series to the R built-in function `acf()`:
+```r
+estacf <- acf(ano$ano_obs[1, 1, , 1], plot = FALSE)$acf
+persistence <- FitAcfCoef(max(estacf[2], 0), max(estacf[3], 0))
+```
+
+The function `Alpha()` automatically computes the persistence by calling
+`FitAcfCoef()` after computing the first estimates with `acf()`:
+```r
+persistence <- Alpha(ano$ano_obs[1, 1, , 1])
+print(persistence)
+# [1] 0
+```
+
+`Alpha()`, additionally, has two parameters to enable linear detrending and/or
+filtering of frequency peaks prior to the computation of the persistence,
+convenient if trends or strong periodic signals would otherwise contaminate
+the estimate.
+
+### Effective sample size
+`Eno()` and `EnoNew()` compute the effective number of independent data.
+While `Eno()` relies on the `eno` function from "rclim.txt" from Caio Coelho
+and operates on s2dverification common formatted arrays, `EnoNew()` relies on
+the method described in
+```
+Guemas V., Auger L., Doblas-Reyes F., JAMC, 2013
+```
+and operates on single time series, applying, if requested via the parameters
+`detrend` and `filter`, a detrending and a filtering of frequency peaks,
+similarly to `Alpha()`. A usage sketch is given after the following example.
+```r
+eno <- Eno(data$mod, 3)
+print(unique(as.vector(eno)))
+# [1] 10.000000 9.104402 6.805723 8.270278 9.492232 8.783534 7.756835
+# [8] 9.270968 8.025609
+```
+This means that, depending on the dataset, member and forecast time, the 10
+values along the starting dates dimension are equivalent to between roughly
+6.8 and 10 independent data.
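+
+A minimal sketch of `EnoNew()` on a single time series, assuming it takes the
+series as first argument plus the `detrend` and `filter` switches described
+above:
+```r
+## effective sample size of one observed time series, detrending it
+## first but leaving the frequency peaks untouched
+eno_ts <- EnoNew(ano$ano_obs[1, 1, , 1], detrend = TRUE, filter = FALSE)
+print(eno_ts)
+```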
+
+
+Filtering fields
+----------------
+Depending on the source and nature of the field to study and on the study
+case, it is often needed to isolate, remove or modulate its components.
+
+As seen in [**Describing fields**](#describing), the functions `Trend()` and
+`Consist_Trend()` remove linear trends and provide detrended data in the
+component `detrended` of their output. Likewise, `Regression()` removes
+behaviours related to other fields and provides the filtered data in the
+output component `filtered`.
+
+### Running means
+
+`Smoothing()` calculates running means to remove high frequency variability.
+The parameter `runmeanlen` allows specifying the running window width and
+`numdimt` which dimension to smooth along (4 by default; along forecast time).
+In this simple example the anomalies are smoothed along the forecast time
+with a window of 3 months to remove the inter-seasonal variability.
+The first and last forecast months are lost, as can be appreciated in the
+figure:
+```r
+smoothed_ano_exp <- Smoothing(ano$ano_exp, 3)
+smoothed_ano_obs <- Smoothing(ano$ano_obs, 3)
+PlotAno(plyr::take(smoothed_ano_exp, 1, 1), smoothed_ano_obs,
+        selected_sdates,
+        toptitle = "Experiment A: smoothed c.v. 'tas' anomalies over North Pacific region.",
+        ytitle = "K", linezero = TRUE,
+        fileout = "stat_smooth_ano_expA_obsX.eps")
+```
+
+
+### Frequency filtering
+`Filter()` filters a specified frequency from the input data. The filtering is
+performed by dichotomy, seeking the frequency around the specified one and the
+phase that maximize the signal to be subtracted from the data.
+The maximization of the signal to subtract relies on a minimization of the
+mean square differences between the data and a cosine of each frequency and
+phase within the seek range.
+
+The following example filters from experiment A all the frequencies with a
+power beyond the 99% significance level, with the purpose of removing its
+significant periodic components:
+```r
+ens_mean_ano_expA <- Mean1Dim(plyr::take(ano$ano_exp, 1, 1), 2)
+for (sdate in 1:length(sdates)) {
+  spectrum <- Spectrum(ens_mean_ano_expA[1, sdate, ])
+  for (freq in 1:dim(spectrum)[1]) {
+    if (spectrum[freq, 2] > spectrum[freq, 4]) {
+      ens_mean_ano_expA[1, sdate, ] <- Filter(ens_mean_ano_expA[1, sdate, ],
+                                              spectrum[freq, 1])
+    }
+  }
+}
+PlotAno(InsertDim(ens_mean_ano_expA, 2, 1), ano$ano_obs,
+        selected_sdates,
+        toptitle = "Experiment A: filtered c.v. 'tas' anomalies over North Pacific region.",
+        ytitle = "K", linezero = TRUE,
+        fileout = "stat_filter_ano_expA.eps")
+```
+
+
+Generating derivative fields
+----------------------------
+Some of the functions in the package generate fields based on data from other
+fields, for example seasonal means from monthly time series or downscaled
+data.
+
+### Seasonal means
+`Season()` computes seasonal means from data objects in the common
+s2dverification structure along the forecast time dimension, which is expected
+to be, by default, the 4th. The data is expected to be evenly spaced along
+that dimension.
+The initial month of the time-series must be specified, as well as the initial
+and final months of the desired season. The parameter `posdim` allows
+specifying the dimension to compute the seasonal mean along (by default the
+4th, the forecast time).
+
+The following example loads the data without area averaging, i.e. gridded data,
+and computes and plots the MAM seasonal mean climatology of each grid cell
+over the North Pacific region:
+```r
+data_map <- Load('tas', list(expA, expB), list(obsX),
+                 sdates = sdates,
+                 leadtimemin = 2, leadtimemax = 13,
+                 latmin = -10, latmax = 60,
+                 lonmin = 100, lonmax = 250,
+                 output = 'lonlat', grid = 't106grid',
+                 method = 'distance-weighted')
+clim_map <- Clim(data_map$mod, data_map$obs, memb = FALSE)
+mam_clim_exp <- Season(clim_map$clim_exp, posdim = 2,
+                       monini = 12, moninf = 3, monsup = 5)
+mam_clim_obs <- Season(clim_map$clim_obs, posdim = 2,
+                       monini = 12, moninf = 3, monsup = 5)
+brks <- round(seq(min(mam_clim_exp, mam_clim_obs, na.rm = TRUE),
+                  max(mam_clim_exp, mam_clim_obs, na.rm = TRUE),
+                  length.out = 51), 2)
+cols <- clim.colors(50)
+png('stat_season_mam_expA.png', width = 800, height = 600, type = "Xlib")
+PlotEquiMap(mam_clim_exp[1, 1, , ], data_map$lon, data_map$lat,
+            toptitle = "Experiment A: Spring (MAM) 'tas' climatologies.",
+            units = "K", brks = brks, cols = cols, subsampleg = 10)
+dev.off()
+png('stat_season_mam_obsX.png', width = 800, height = 600, type = "Xlib")
+PlotEquiMap(mam_clim_obs[1, 1, , ], data_map$lon, data_map$lat,
+            toptitle = "Observation X: Spring (MAM) 'tas' climatologies.",
+            units = "K", brks = brks, cols = cols, subsampleg = 10)
+dev.off()
+```
+
+
+
+### Categorizing data
+`ProbBins()` converts a field of anomalies into probabilistic categories,
+flagging, for each member, starting date and forecast time, which quantile bin
+(e.g. which tercile) the value falls in. A hedged usage sketch follows.
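+
+This sketch assumes `ProbBins()` takes the anomalies, the selection of
+forecast years, the quantile thresholds and the positions of the starting date
+and member dimensions (the parameter names below are assumptions based on the
+function's help page):
+```r
+## tercile category of each value of the experimental anomalies, with
+## terciles estimated from all the starting dates
+probs <- ProbBins(ano$ano_exp, fcyr = 'all', thr = c(1/3, 2/3),
+                  quantile = TRUE, posdates = 3, posdim = 2)
+```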
+
+### Statistical downscaling
+`StatSeasAtlHurr()` statistically forecasts the seasonal number of Atlantic
+hurricanes from sea surface temperature anomalies.
+
+Generating synthetic fields
+---------------------------
+To perform statistical/inference tests it is often required to generate new
+data obtained with empirical models that comply with a given set of
+distribution parameters.
+
+`GenSeries()` generates a series with a specified length, autocorrelation at
+lag 1, mean and standard deviation. For example, a synthetic model can be
+parametrized to generate a forecast with the same parameters as the
+experiment A:
+```r
+## note: apply() returns the generated series along the first dimension
+synth_exp_A <- apply(plyr::take(data$mod, 1, 1), c(1, 2, 3),
+                     function(x) {
+                       GenSeries(length(x), Alpha(x), mean(x, na.rm = TRUE),
+                                 sd(x, na.rm = TRUE))
+                     })
+```
+
+`Toymodel()` directly generates forecasts in the s2dverification common data
+structure, imitating seasonal to decadal forecasts. The toy model is based on
+the model presented in
+```
+Weigel et al. (2008) QJRMS
+```
+with an extension to consider non-stationary distributions prescribing a
+linear trend. The toy model allows generating an artificial forecast based on
+real observations provided as input (from `Load()`) or on artificially
+generated observations based on the input parameters `sig` and `trend`. The
+forecast can be specified for any number of starting dates, lead-times and
+ensemble members. It imitates components of a forecast: (1) predictability,
+(2) forecast error, (3) non-stationarity and (4) ensemble generation.
+
+The following example generates a forecast for seasonal prediction with
+synthetic observations, and plots its anomalies:
+```r
+nstartd <- 30
+sdates_toy <- paste0(1981:(1980 + nstartd), '1101')
+toy_forecast <- Toymodel(alpha = 0.1, beta = 0.3, gamma = 1, sig = 1,
+                         trend = 0.02, nstartd = nstartd, nleadt = 12,
+                         nmemb = 10)
+ano_toy <- Ano_CrossValid(toy_forecast$mod, toy_forecast$obs)
+PlotAno(ano_toy$ano_exp, ano_toy$ano_obs, sdates_toy,
+        toptitle = "Toy forecast: anomalies.",
+        ytitle = "units", linezero = TRUE,
+        fileout = "stat_toy_forecast_ano.eps")
+```
+
+
+It is possible, however, to generate model data from observational data from
+`Load()`. In that case the only required parameters are the predictability,
+the error standard deviation and the factor on the linear trend to sample
+uncertainty:
+
+```r
+toy_forecast_C <- Toymodel(alpha = 0.1, beta = 0.3, gamma = 1, nmemb = 10,
+                           obsini = data$obs)
+PlotAno(toy_forecast_C$mod, toy_forecast_C$obs, selected_sdates,
+        toptitle = "Toy forecast from observation X: anomalies.",
+        ytitle = "K", linezero = TRUE,
+        fileout = "stat_toy_expC_obsX_ano.eps")
+```
diff --git a/vignettes/tutorial.md b/vignettes/tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..3a7201307b73936031b40b047caf428814ff9a0f
--- /dev/null
+++ b/vignettes/tutorial.md
@@ -0,0 +1,201 @@
+---
+author: "Nicolau Manubens"
+date: "`r Sys.Date()`"
+output: html_document
+vignette: >
+  %\VignetteEngine{R.rsp::md}
+  %\VignetteIndexEntry{Tutorial}
+  %\usepackage[utf8]{inputenc}
+---
+Tutorial
+========
+
+Seasonal forecast of extreme events
+-----------------------------------
+
+### Objectives
+- Basics of R language
+- Learning to use s2dverification
+
+### 1- Load files, compute basic statistics, plot map and time series
+
+#### Exercise 1 - First steps with R and s2dverification
+- Make sure you have the CDO tools installed in your system, or download and
+install them as described in:
+
+https://code.zmaw.de/projects/cdo/wiki/cdo
+
+- Open R (by typing R in the terminal).
+- Install s2dverification with:
+
+```r
+install.packages('s2dverification')
+```
+
+- Load s2dverification with the following command:
+
+```r
+library(s2dverification)
+```
+
+- The documentation of s2dverification is available online here:
+
+https://CRAN.R-project.org/package=s2dverification/s2dverification.pdf
+
+- You can see the list of available functions in the package by typing:
+```r
+help(package = s2dverification)
+```
+- To see the help of a specific function, you can type:
+
+```r
+help(FunctionName)
+```
+
+- Here I give you an example of how to use the `Load()` function. It assumes
+you have some *tas* data for the experiments from all the ENSEMBLES models
+(named *EnsCmccSeas*, *EnsIfmSeas*, *EnsEcmwfSeas*, *EnsMetfrSeas* and
+*EnsUkmoSeas* at BSC-ES) and for the ERA-interim reanalysis (*ERAint*) in the
+folders */example/experiments* and */example/observations*.
+See the [**Example**](example.md) of use or
+[**Data retrieval**](data_retrieval.md) for a full explanation of what is done
+here.
+
+```r
+exp_path_pattern <- file.path("/example/experiments/ENSEMBLES/$EXP_NAME$",
+                              "$STORE_FREQ$_mean/$VAR_NAME$",
+                              "$VAR_NAME$_$START_DATE$.nc")
+cmcc_seas <- list(name = "EnsCmccSeas", path = exp_path_pattern)
+ifm_seas <- list(name = "EnsIfmSeas", path = exp_path_pattern)
+ecmwf_seas <- list(name = "EnsEcmwfSeas", path = exp_path_pattern)
+metfr_seas <- list(name = "EnsMetfrSeas", path = exp_path_pattern)
+ukmo_seas <- list(name = "EnsUkmoSeas", path = exp_path_pattern)
+
+obs_path_pattern <- file.path("/example/observations/$OBS_NAME$",
+                              "$STORE_FREQ$_mean/$VAR_NAME$",
+                              "$VAR_NAME$_$YEAR$$MONTH$.nc")
+era_int <- list(name = "ERAint", path = obs_path_pattern)
+
+library(s2dverification)
+Data <- Load("tas",
+             exp = list(cmcc_seas, ifm_seas, ecmwf_seas, metfr_seas, ukmo_seas),
+             obs = list(era_int),
+             sdates = c("19930501", "19940501"),
+             leadtimemin = 1, leadtimemax = 7, storefreq = "monthly",
+             sampleperiod = 1, nmember = 9, output = "areave",
+             lonmin = 190, lonmax = 240, latmin = -5, latmax = 5)
+```
+
+- You can copy-paste it into R, run it and check that everything works fine.
+Once it is finished, have a look at the dimensions of `Data` by typing these
+commands in R:
+```r
+dim(Data$mod)
+dim(Data$obs)
+```
+
+- With the help of the `Load()` documentation, try to understand what the
+different dimensions are.
+- Now run `Load()` again with the same parameters, but change
+`output = 'areave'` to `output = 'lonlat'`.
+- What are the dimensions of `Data` now?
+- What happens if you remove the `lonmin`, `lonmax`, `latmin`, `latmax`
+options?
+- What happens if you change the `leadtimemin` and `leadtimemax` options?
+
+#### Exercise 2 - Compute and plot 2m-temperature skill over Europe
+
+- In the directory named *handson* create a directory named *R* and browse
+into that directory.
+- Open a new file named *corrskill-europe.R*.
+- At the beginning of the file, load the s2dverification package:
+
+```r
+library(s2dverification)
+```
+
+- If you are using emacs and you don't have colour highlighting, just execute
+this line in a terminal and reopen the file:
+
+```
+/usr/local/bin/ictp-install ess
+```
+
+- Generate a list of starting dates to be used in the `sdates` argument of
+`Load()` which includes all May starting dates between 1979 and 2005 (you can
+have a look at the help of the functions `seq()` and `paste()` to do it; a
+sketch is given at the end of this exercise).
+- Based on the example of the previous exercise, try to load the lead-times 2,
+3 and 4 of 2m-temperature (*tas*) of all ENSEMBLES models, over Europe
+(20W70E-25N75N) for all May starting dates between 1979 and 2005.
+- Calculate the June-July-August seasonal mean using the `Mean1Dim()` function
+for both models and observations.
+- With the same function, calculate the ensemble mean of each model.
+- With the `Corr()` function, calculate the time correlation in
+June-July-August between each model and the ERA-interim reanalysis.
+- Using `PlotEquiMap()`, plot a map of the correlation coefficient for each
+model and save it in a PostScript file (you can use the `postscript()` and
+`dev.off()` functions). You can choose the following interval and the
+following color bar:
+
+```r
+colors <- clim.colors(20)
+interval <- seq(-1, 1, length.out = length(colors) + 1)
+```
+
+or generate another one using the `brewer.pal()` function.
+- If you want to generate a multipanel plot you can use the `layout()`
+function (you can have a look at this document for more help:
+http://seananderson.ca/courses/11-multipanel/multipanel.pdf). Be careful,
+some features of `PlotEquiMap()` are not available with multipanel plots.
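+
+One possible way to build the list of starting dates (a sketch):
+
+```r
+## all May starting dates from 1979 to 2005, in the 'YYYYMMDD' format
+## expected by Load(); paste0() recycles the fixed suffix along the years
+sdates <- paste0(seq(1979, 2005), '0501')
+```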
+
+#### Exercise 3 - Compare the skill over land over the Mediterranean region
+
+- In the directory named *handson/R*, open a new file named
+*Timeserie-skill-mediter.R*.
+- The Mediterranean region 3E25E-36N44N covers both sea and land, but we would
+like to calculate the skill only over land. For this, you need to have the
+land-sea masks of ENSEMBLES and ERA-interim in NetCDF files, for
+example, */path/to/experiments/ENSEMBLES/masks/land_sea.nc* and
+*/path/to/observations/ERAint/masks/land_sea.nc*.
+- To create masks usable in the `maskobs` and `maskmod` arguments of the
+`Load()` function, you can use the following lines:
+
+```r
+library(ncdf4)  # provides nc_open(), ncvar_get() and nc_close()
+
+fnc <- nc_open('/path/to/experiments/ENSEMBLES/masks/land_sea.nc')
+maskENS <- ncvar_get(fnc, 'LSM')
+nc_close(fnc)
+## binarize the fractional ENSEMBLES land-sea mask
+maskENS[maskENS > 0.5] <- 1
+maskENS[maskENS <= 0.5] <- 0
+
+fnc <- nc_open('/path/to/observations/ERAint/masks/land_sea.nc')
+maskERAI <- ncvar_get(fnc, 'LSM')
+nc_close(fnc)
+## treat missing values in the ERA-interim mask as land
+maskERAI[which(is.na(maskERAI))] <- 1
+
+listmaskmod <- list(maskENS, maskENS, maskENS, maskENS, maskENS)
+```
+
+- Use the `Load()` function with the `maskobs` and `maskmod` arguments to load
+the data averaged over the land in the Mediterranean region for all ENSEMBLES
+models and the ERA-interim reanalysis, for all May starting dates between 1979
+and 2005.
+- With the `Clim()` and `Ano()` functions of s2dverification, calculate the
+anomalies of both models and observation.
+- Calculate the ensemble mean of the models.
+- Calculate the Root Mean Square Skill Score (RMSSS) for all lead-times, with
+the `RMSSS()` function of s2dverification.
+- Use the `PlotVsLTime()` function to plot this score and save it in a
+PostScript file. `PlotVsLTime()` expects a third dimension of the score of
+size 4 (lower confidence interval, score, upper confidence interval and
+significance level), while with `RMSSS()` you will have only 2 values (score
+and p-value). To do it you can use the following commands to reshape your
+score matrix:
+
+```r
+skillreshape <- array(dim = c(5, 1, 4, 7))
+skillreshape[, , c(2, 4), ] <- skill[, , c(1, 2), ]
+```
+
+- Do the same plot, but instead of plotting a line for the p-values, mark with
+a dot the significant values (p-values under 0.05). To do this you can have a
+look at the tutorial at
+
+http://www.harding.edu/fmccown/r/
diff --git a/vignettes/vis_acc_expA_expB_obsX.png b/vignettes/vis_acc_expA_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..f5f66774fb9c7c02d64ba453852249e20408db1d Binary files /dev/null and b/vignettes/vis_acc_expA_expB_obsX.png differ diff --git a/vignettes/vis_anim_clim_expA.gif b/vignettes/vis_anim_clim_expA.gif new file mode 100644 index 0000000000000000000000000000000000000000..7554148d5550fcb194effa41520b06b8a7624f4d Binary files /dev/null and b/vignettes/vis_anim_clim_expA.gif differ diff --git a/vignettes/vis_anim_clim_expA_world.gif b/vignettes/vis_anim_clim_expA_world.gif new file mode 100644 index 0000000000000000000000000000000000000000..f5980fc63ac320a6a99457e2e3c18fdc274d81c6 Binary files /dev/null and b/vignettes/vis_anim_clim_expA_world.gif differ diff --git a/vignettes/vis_anim_clim_expB.gif b/vignettes/vis_anim_clim_expB.gif new file mode 100644 index 0000000000000000000000000000000000000000..465a71528f933b7add55df5ab8d0c81572df1a94 Binary files /dev/null and b/vignettes/vis_anim_clim_expB.gif differ diff --git a/vignettes/vis_anim_clim_obsX.gif b/vignettes/vis_anim_clim_obsX.gif new file mode 100644 index 0000000000000000000000000000000000000000..c8d2aefe08a5b115e480434581210b44f5205da8 Binary files /dev/null and b/vignettes/vis_anim_clim_obsX.gif differ diff --git a/vignettes/vis_anim_clim_obsX_world.gif b/vignettes/vis_anim_clim_obsX_world.gif new file mode 100644 index 0000000000000000000000000000000000000000..1d94c28ce9ce9afa3c3c8f0a4c1b09a819fae6ea Binary files /dev/null and b/vignettes/vis_anim_clim_obsX_world.gif differ diff --git a/vignettes/vis_ano_exp_obs.png b/vignettes/vis_ano_exp_obs.png new file mode 100644 index
0000000000000000000000000000000000000000..2047c681ebdde2fcdd4c8d8e445f4a897a5f9df3 Binary files /dev/null and b/vignettes/vis_ano_exp_obs.png differ diff --git a/vignettes/vis_ano_exp_points.png b/vignettes/vis_ano_exp_points.png new file mode 100644 index 0000000000000000000000000000000000000000..cdcfd8e5c3c8ae2adea46ccb439adedda89f2f4e Binary files /dev/null and b/vignettes/vis_ano_exp_points.png differ diff --git a/vignettes/vis_clim_expA_expB_obsX.png b/vignettes/vis_clim_expA_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..48dbcfb1302fbea352432917b42846d7b8e1a702 Binary files /dev/null and b/vignettes/vis_clim_expA_expB_obsX.png differ diff --git a/vignettes/vis_conf_interval_exp.png b/vignettes/vis_conf_interval_exp.png new file mode 100644 index 0000000000000000000000000000000000000000..85cf56d5a37ff14720c42316ad30a853cef6c89a Binary files /dev/null and b/vignettes/vis_conf_interval_exp.png differ diff --git a/vignettes/vis_corr_expA_expB_obsX.png b/vignettes/vis_corr_expA_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..dcaa96aaefd82516cdc9ac8a0c18fd94b4b931eb Binary files /dev/null and b/vignettes/vis_corr_expA_expB_obsX.png differ diff --git a/vignettes/vis_corr_rms_expA_expB_obsX.png b/vignettes/vis_corr_rms_expA_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..ec8e5fee5429d883949815f8e5b8e9f59b44f7b6 Binary files /dev/null and b/vignettes/vis_corr_rms_expA_expB_obsX.png differ diff --git a/vignettes/vis_eno_expA_expB.png b/vignettes/vis_eno_expA_expB.png new file mode 100644 index 0000000000000000000000000000000000000000..e5fc71dc41ad5fec242029c3d3b6b452bff51b52 Binary files /dev/null and b/vignettes/vis_eno_expA_expB.png differ diff --git a/vignettes/vis_equimap_box_expA.png b/vignettes/vis_equimap_box_expA.png new file mode 100644 index 0000000000000000000000000000000000000000..7c5686ad31e46de4e976fbb9e78e6ce505bdc7b5 Binary files /dev/null and b/vignettes/vis_equimap_box_expA.png differ diff --git a/vignettes/vis_equimap_cols_raw_expA.png b/vignettes/vis_equimap_cols_raw_expA.png new file mode 100644 index 0000000000000000000000000000000000000000..afd7550eba4cba35630387787faa43d425cc37db Binary files /dev/null and b/vignettes/vis_equimap_cols_raw_expA.png differ diff --git a/vignettes/vis_equimap_cols_raw_obsX.png b/vignettes/vis_equimap_cols_raw_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..b3cf03532291470b231ce90e295c8c79f5fc87f9 Binary files /dev/null and b/vignettes/vis_equimap_cols_raw_obsX.png differ diff --git a/vignettes/vis_equimap_contour_raw_expA.png b/vignettes/vis_equimap_contour_raw_expA.png new file mode 100644 index 0000000000000000000000000000000000000000..15b3a630fa62b4e49ec63e7529affcdb804ffeb7 Binary files /dev/null and b/vignettes/vis_equimap_contour_raw_expA.png differ diff --git a/vignettes/vis_equimap_contour_raw_obsX.png b/vignettes/vis_equimap_contour_raw_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..a526700e7c12c690de0b37fc943e5d58a48f82ec Binary files /dev/null and b/vignettes/vis_equimap_contour_raw_obsX.png differ diff --git a/vignettes/vis_equimap_raw_expA.png b/vignettes/vis_equimap_raw_expA.png new file mode 100644 index 0000000000000000000000000000000000000000..7f8441331ec7047b9e787f794abbe9a1b9077d36 Binary files /dev/null and b/vignettes/vis_equimap_raw_expA.png differ diff --git a/vignettes/vis_equimap_raw_obsX.png b/vignettes/vis_equimap_raw_obsX.png new file mode 100644 
index 0000000000000000000000000000000000000000..f99ae26dd438ff43698d3d66872fcdbf5a5fb1dd Binary files /dev/null and b/vignettes/vis_equimap_raw_obsX.png differ diff --git a/vignettes/vis_error_bar.png b/vignettes/vis_error_bar.png new file mode 100644 index 0000000000000000000000000000000000000000..87718e965faac97ba5f1f866016372ea57c37719 Binary files /dev/null and b/vignettes/vis_error_bar.png differ diff --git a/vignettes/vis_iqr_expA_expB.png b/vignettes/vis_iqr_expA_expB.png new file mode 100644 index 0000000000000000000000000000000000000000..111893da0341bb854cfea68c2c63db2c7a80f565 Binary files /dev/null and b/vignettes/vis_iqr_expA_expB.png differ diff --git a/vignettes/vis_layout_complex.png b/vignettes/vis_layout_complex.png new file mode 100644 index 0000000000000000000000000000000000000000..dfbfc96b356901a0360b25512bd8b31da9947019 Binary files /dev/null and b/vignettes/vis_layout_complex.png differ diff --git a/vignettes/vis_layout_equimap_expA.png b/vignettes/vis_layout_equimap_expA.png new file mode 100644 index 0000000000000000000000000000000000000000..6804590a36d0c071f241c65519618f708d2fe4c1 Binary files /dev/null and b/vignettes/vis_layout_equimap_expA.png differ diff --git a/vignettes/vis_mad_expA_expB.png b/vignettes/vis_mad_expA_expB.png new file mode 100644 index 0000000000000000000000000000000000000000..23ff8a7fa62a9f234ee2e7555f41bec03c43db1b Binary files /dev/null and b/vignettes/vis_mad_expA_expB.png differ diff --git a/vignettes/vis_maxmin_expA_expB.png b/vignettes/vis_maxmin_expA_expB.png new file mode 100644 index 0000000000000000000000000000000000000000..00996ecc8093d15d915450c0ada52c25d912927b Binary files /dev/null and b/vignettes/vis_maxmin_expA_expB.png differ diff --git a/vignettes/vis_ratiorms_expA_expB_obsX.png b/vignettes/vis_ratiorms_expA_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..3b5bafcbf3dd02c8d4d3e9279ff005b835733d1c Binary files /dev/null and b/vignettes/vis_ratiorms_expA_expB_obsX.png differ diff --git a/vignettes/vis_ratiosdrms_expA_expB_obsX.png b/vignettes/vis_ratiosdrms_expA_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..47d65fdb78aa7c8e4015b80c685e7564f1b7b18d Binary files /dev/null and b/vignettes/vis_ratiosdrms_expA_expB_obsX.png differ diff --git a/vignettes/vis_ratiosdrms_expA_obsX_obsXrnorm.png b/vignettes/vis_ratiosdrms_expA_obsX_obsXrnorm.png new file mode 100644 index 0000000000000000000000000000000000000000..36dc1f03f737fae63fa3e0c0a9dd70c74575d713 Binary files /dev/null and b/vignettes/vis_ratiosdrms_expA_obsX_obsXrnorm.png differ diff --git a/vignettes/vis_raw_expA_obsX.png b/vignettes/vis_raw_expA_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..bec40259be7a2d4c0aa93f74bae39e6ce18f4437 Binary files /dev/null and b/vignettes/vis_raw_expA_obsX.png differ diff --git a/vignettes/vis_raw_expB_obsX.png b/vignettes/vis_raw_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..de1e17e23cabf5136fef8c4cc65ad760a5704b96 Binary files /dev/null and b/vignettes/vis_raw_expB_obsX.png differ diff --git a/vignettes/vis_regression_expA_expB.png b/vignettes/vis_regression_expA_expB.png new file mode 100644 index 0000000000000000000000000000000000000000..41481789122710bbd4663c296f3f7e63dae53616 Binary files /dev/null and b/vignettes/vis_regression_expA_expB.png differ diff --git a/vignettes/vis_rms_expA_expB_obsX.png b/vignettes/vis_rms_expA_expB_obsX.png new file mode 100644 index 
0000000000000000000000000000000000000000..cbc57cebe04135094f6100aa5b04181be5a3bb4b Binary files /dev/null and b/vignettes/vis_rms_expA_expB_obsX.png differ diff --git a/vignettes/vis_rmsss_expA_expB_obsX.png b/vignettes/vis_rmsss_expA_expB_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..5f2aa5a0450ebf163c6029131f00bfb5962472df Binary files /dev/null and b/vignettes/vis_rmsss_expA_expB_obsX.png differ diff --git a/vignettes/vis_sd_expA_expB.png b/vignettes/vis_sd_expA_expB.png new file mode 100644 index 0000000000000000000000000000000000000000..f5ec2917a256d135e922fbc798cb3b77f608ad53 Binary files /dev/null and b/vignettes/vis_sd_expA_expB.png differ diff --git a/vignettes/vis_stereomap_raw_expA.png b/vignettes/vis_stereomap_raw_expA.png new file mode 100644 index 0000000000000000000000000000000000000000..fb7b265f17f3208599f60be708cd619be2179b21 Binary files /dev/null and b/vignettes/vis_stereomap_raw_expA.png differ diff --git a/vignettes/vis_stereomap_raw_obsX.png b/vignettes/vis_stereomap_raw_obsX.png new file mode 100644 index 0000000000000000000000000000000000000000..4d1ffe56ed243fc73a7d136bbadf341347d5b58f Binary files /dev/null and b/vignettes/vis_stereomap_raw_obsX.png differ diff --git a/vignettes/vis_trend_expA_expB.png b/vignettes/vis_trend_expA_expB.png new file mode 100644 index 0000000000000000000000000000000000000000..1ec1cb998f2ac9f016cb1ee11c800019507f6320 Binary files /dev/null and b/vignettes/vis_trend_expA_expB.png differ
diff --git a/vignettes/visualisation.md b/vignettes/visualisation.md
new file mode 100644
index 0000000000000000000000000000000000000000..a296228f4008709d21cd6e99614563d5218442f2
--- /dev/null
+++ b/vignettes/visualisation.md
@@ -0,0 +1,772 @@
+---
+author: "Nicolau Manubens"
+date: "`r Sys.Date()`"
+output: html_document
+vignette: >
+  %\VignetteEngine{R.rsp::md}
+  %\VignetteIndexEntry{Visualisation}
+  %\usepackage[utf8]{inputenc}
+---
+Visualisation
+=============
+
+s2dverification contains a set of functions to plot data at every stage of the
+verification process, most of them based directly on R graphics plotting
+tools.
+These functions are essential to:
+ - Quickly inspect the results of a newly produced experiment, i.e. to check
+the physical consistency of the results.
+ - Assess the added value of a new prediction system, i.e. to compare new
+results with a reference (observation, reconstruction or other experiment).
+ - Assess visually the significance of results, i.e. to display in a
+user-friendly way confidence intervals and other statistics.
+
+The visualisation functions, most with a name following the pattern
+`Plot*()`, can be grouped according to the kind of plot they provide:
+ - [**Plotting time series**](#time_series): `PlotClim()`, `PlotAno()`,
+`PlotVsLTime()`, `Plot2VarsVsLTime()`, `PlotBoxWhisker()` and `PlotACC()`.
+ - [**Plotting maps**](#maps): `PlotEquiMap()`,
+`PlotStereoMap()`, `AnimateMap()`, `PlotLayout()` and `PlotSection()`.
+
+To master these functions it is convenient to have in mind the common array
+dimension structure used throughout s2dverification and how it evolves as
+the data objects go through the statistics and verification stages. For that
+you can review the introduction in [**Data retrieval**](data_retrieval.md) and
+the sections [**Statistics**](statistics.md) and
+[**Verification**](verification.md).
+
+Next comes an explanation of which situations each function fits best, details
+of the options they provide and short examples of usage. The data used
+hereunder will be the same as in [**Data retrieval**](data_retrieval.md):
The data used hereunder will +be the same as in [**Data retrieval**](data_retrieval.md): +```r +library(s2dverification) +expA <- list(name = 'experimentA', + path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean', + '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc')) +expB <- list(name = 'experimentB', + path = file.path('/path/to/experiments/$EXP_NAME$/monthly_mean', + '$VAR_NAME$/$VAR_NAME$_$START_DATE$.nc')) +obsX <- list(name = 'observationX', + path = file.path('/path/to/observations/$OBS_NAME$/monthly_mean', + '$VAR_NAME$/$VAR_NAME$_$YEAR$$MONTH$.nc')) +sdates <- paste0(1991:2000, '1101') +data <- Load('tas', list(expA, expB), list(obsX), + sdates = sdates, + leadtimemin = 2, leadtimemax = 13, + latmin = -10, latmax = 60, + lonmin = 100, lonmax = 250, + output = 'areave', grid = 't106grid', + method = 'distance-weighted') +ano <- Ano_CrossValid(data$mod, data$obs, memb = FALSE) +``` + +Plotting time series +-------------------- + +All the functions devoted to plotting time series have some common traits and +parameters: + - Aimed to plot monthly, seasonal or yearly time series, adjustable via the +parameter `freq`. + - All of them generate plots and save them to PostScript files the path and +file name of which you have to provide to the `fileout` parameter. Can plot in +a presentation oriented style or in a paper oriented style, adjustable with the +parameter `biglab`. + - Can plot a legend automatically, adjustable via the parameters `leg`, +`listexp` and `listobs`/`listvar` in `PlotVsLTime()`, `Plot2VarVsLtime()` and +`PlotClim()` or via the parameter `legends` in `PlotAno()` and `PlotACC()`. + - Accept any additional parameters via the parameter `...` to be sent to +the underlying R graphics `plot()` function for a fine tuning. + +**Note:** A general purpose time-series plotting function, `PlotTimeSeries()`, +is currently being developed. This function will agglomerate all the +functionality required to generate the plots resulting from all the currently +available functions in `s2dverification` and will be based on the `ggplot2` +package. The current functions will be kept as they are but will simply be an +interface to `PlotTimeSeries()`. See +[**this report**](https://earth.bsc.es/gitlab/es/s2dverification/blob/develop-PlotTimeSeries/inst/doc/PlotTimeSeries/PlotTimeSeries.pdf) +for an extended explanation and examples of the new +time-series plotting function. + +### Plotting climatologies + +`PlotClim()` takes as inputs an experimental and an observational array of 2 +to 3 dimensions each: +```r + c(n. of datasets, n. of members, n. of forecast times) +``` +the second one being optional. This means that the latitude and longitude +dimensions must have been removed either because working on area-averages or +because working on a single grid point and that the starting dates dimension +must have also been removed either because willing to plot data of a single +starting date or because working on averages across starting dates, i.e. +climatologies (see [**Statistics**](statistics.md)). + +A curve is plotted for each member of each experiment or observation. The +members of a same experiment or observation are equally coloured whereas the +different experiments and observations are plotted with different colours. +`PlotClim()` assumes by default that the provided climatologies start in +January. 
+Otherwise the initial month can be specified with the parameter
+`monini`:
+
+```r
+clim <- Clim(data$mod, data$obs)
+PlotClim(clim$clim_exp, clim$clim_obs, monini = 12,
+         toptitle = "Per-pair 'tas' climatologies, North Pacific",
+         ytitle = "K",
+         listexp = c('Experiment A', 'Experiment B'),
+         listobs = c('Observation X'),
+         fileout = "vis_clim_expA_expB_obsX.png")
+```
+
+
+### Plotting multi-member raw data or anomalies
+
+`PlotAno()` also takes two data arrays as inputs, with 4 dimensions each:
+```r
+  c(n. of datasets, n. of members, n. of starting dates, n. of forecast times)
+```
+all being mandatory. This means that if you want to plot single-member data or
+data from a single starting date, the corresponding dimensions still have to
+be present in the provided arrays; `InsertDim()` helps with that.
+
+The data in the provided arrays can be raw data, anomalies or any other kind
+of data, as long as it is arranged with the required dimensions.
+For each run of a model (i.e. for each group of members of a same starting
+date from one experiment) a thick curve with the ensemble mean is plotted
+together with a finer curve for each ensemble member, unless any of these are
+disabled with `ensmean` or `memb`, respectively. The area delimited by the
+minimum and maximum ensemble values at each forecast time is filled in with a
+different colour for each starting date, unless disabled with `fill`.
+A fine black curve is plotted with the observational data alongside each
+starting date.
+A separate new plot is created for each experimental dataset provided in the
+experimental data array.
+
+The starting dates the provided data corresponds to must be specified with the
+parameter `sdates` for them to be plotted accordingly along the x axis.
+
+`linezero` draws a line at y = 0 to help perceiving whether the anomaly is
+positive or negative, `vlines` draws vertical lines at any specified abscissae
+and `points` will draw all the curves in pointed style.
+
+```r
+selected_sdates <- gsub('1101', '1201', sdates)
+PlotAno(data$mod, data$obs, selected_sdates,
+        toptitle = paste0(c("Experiment A", "Experiment B"),
+                          ": Raw 'tas' over North Pacific"),
+        ytitle = c("K", "K"),
+        fileout = paste0("vis_raw_exp", c("A", "B"), "_obsX.png"))
+```
+
+
+
+### Plotting statistics and scores
+
+The functions `PlotVsLTime()`, `Plot2VarsVsLTime()` and `PlotACC()` serve to
+plot time series of variables obtained with the
+[**Statistics**](statistics.md) or [**Verification**](verification.md) modules
+that yield forecast time series of indices or scores, which usually come
+together with confidence intervals and significance levels.
+
+`PlotVsLTime()` takes as main input an array of data with the following
+dimensions:
+```r
+  c(n. of experiments, n. of observational datasets, 3 or 4, n. of forecast times)
+```
+the second one being optional and the third one corresponding to the lower
+limit of a confidence interval, the measurement, index or score, the upper
+limit of a confidence interval and, optionally, a significance level. This
+dimension format requires that the index or score to be plotted is computed
+along the actual time (starting dates) and over the average of the ensemble
+members, either at a single grid point or from an area average.
+
+A curve is drawn for each pair of (experiment, observation) with a different
+colour and line style.
+The scores or indices that correspond to a same
+experiment are coloured equally but a different line style is used for each
+observation that experiment has been compared to when computing the index or
+score. The confidence intervals are plotted with a finer line of the same
+colour and line style, unless disabled with `show_conf`. To plot the
+significance level instead, the parameter `siglev` can be set to `TRUE`.
+
+As in `PlotClim()`, each curve is defined along the forecast times and is
+assumed to start in January. Otherwise it can be adjusted via `monini`.
+Additionally, the number of ticks along the x axis can be adjusted with
+`nticks` and any number of horizontal lines can be drawn by specifying the
+target ordinates to `hlines`.
+
+`PlotVsLTime()` can be useful in the following situations:
+ - To plot `Trend()` or `Regression()` fitted coefficients of ensemble averages
+of data at a single grid cell or averaged over a region:
+
+```r
+trend_exp <- Trend(Mean1Dim(ano$ano_exp, 2))
+trend_obs <- Trend(Mean1Dim(ano$ano_obs, 2))
+PlotVsLTime(trend_exp$trend,
+            toptitle = "Slopes of trends of 'tas' over North Pacific",
+            ytitle = "K/year",
+            monini = 12, freq = 1,
+            listexp = c('Experiment A', 'Experiment B'),
+            fileout = 'vis_trend_expA_expB.png')
+```
+```r
+ano_expA_Y <- Regression(Mean1Dim(plyr::take(ano$ano_exp, 1, 1), 2),
+                         Mean1Dim(plyr::take(ano$ano_exp, 1, 2), 2))
+PlotVsLTime(ano_expA_Y$regression,
+            toptitle = "Regr. slopes of exp. A with exp. B, N. Pacific",
+            ytitle = "K/year",
+            monini = 12, freq = 1, leg = FALSE,
+            fileout = 'vis_regression_expA_expB.png')
+```
+
+
+ - To plot the `Spread()` across ensemble members and starting dates of area
+averaged data (interquartile range, maximum minus minimum, standard deviation
+or median absolute deviation):
+
+```r
+spread <- Spread(ano$ano_exp, posdim = c(2, 3))
+PlotVsLTime(spread$iqr,
+            toptitle = "IQR across members and starting dates over North Pacific",
+            ytitle = "K", sizetit = 0.7,
+            monini = 12, freq = 1,
+            listexp = c('Experiment A', 'Experiment B'),
+            fileout = 'vis_iqr_expA_expB.png')
+PlotVsLTime(spread$maxmin,
+            toptitle = "Max.-min. across members and s. dates over North Pacific",
+            ytitle = "K", sizetit = 0.7,
+            monini = 12, freq = 1,
+            listexp = c('Experiment A', 'Experiment B'),
+            fileout = 'vis_maxmin_expA_expB.png')
+PlotVsLTime(spread$sd,
+            toptitle = "S. deviation across members and s. dates over North Pacific",
dates over North Pacific", + ytitle = "K", sizetit = 0.7, + monini = 12, freq = 1, + listexp = c('Experiment A', 'Experiment B'), + fileout = 'vis_sd_expA_expB.png') +PlotVsLTime(spread$iqr, + toptitle = "MAD across members and starting dates over North Pacific", + ytitle = "K", sizetit = 0.7, + monini = 12, freq = 1, + listexp = c('Experiment A', 'Experiment B'), + fileout = 'vis_mad_expA_expB.png') +``` + + + + + - To plot the correlation (`Corr()`) and RMSE (`RMS()`) between experiments +(averaged across ensemble members) and observations: + +```r +corr <- Corr(Mean1Dim(ano$ano_exp, 2), Mean1Dim(ano$ano_obs, 2)) +PlotVsLTime(corr, + toptitle = "Time correlation with observation X, over North Pacific", + ytitle = "K", sizetit = 0.7, + monini = 12, freq = 1, + listexp = c('Experiment A', 'Experiment B'), + fileout = 'vis_corr_expA_expB_obsX.png') +``` +```r +rms <- RMS(Mean1Dim(ano$ano_exp, 2), Mean1Dim(ano$ano_obs, 2)) +PlotVsLTime(rms, + toptitle = "RMSE with observation X, over North Pacific", + ytitle = "K", sizetit = 0.7, + monini = 12, freq = 1, + listexp = c('Experiment A', 'Experiment B'), + fileout = 'vis_rms_expA_expB_obsX.png') +``` + + + - To plot the ratio between the RMSE of the ensemble mean of two experiments +with a same observation at a single grid point or area averaged: + +```r +ratio_rms <- RatioRMS(Mean1Dim(ano$ano_exp[1, , , ], 1), + Mean1Dim(ano$ano_exp[2, , , ], 1), + ano$ano_obs[1, 1, , ]) +ratio_rms2 <- array(dim = c(1, 4, dim(ratio_rms)[2])) +ratio_rms2[1, c(2, 4), ] <- ratio_rms +PlotVsLTime(ratio_rms2, + toptitle = "RMSE exp. A / RMSE exp. B (against obs. X), over N. Pacific", + ytitle = "K", sizetit = 0.7, + monini = 12, freq = 1, siglev = TRUE, leg = FALSE, + fileout = 'vis_ratiorms_expA_expB_obsX.png') +``` + + - To plot the ratio between the ensemble spread of the experiments and their +RMSE against the observations (`RatioSDRMS()`) at a single grid point or area +averaged: + +```r +ratio_sdrms <- RatioSDRMS(ano$ano_exp, ano$ano_obs) +ratio_sdrms2 <- array(dim = c(dim(ratio_sdrms)[1:2], 4, dim(ratio_sdrms)[4])) +ratio_sdrms2[, , c(2, 4), ] <- ratio_sdrms +PlotVsLTime(ratio_sdrms2, + toptitle = "S. dev. over members and s. dates / RMSE, over North Pacific", + ytitle = "K", sizetit = 0.7, + monini = 12, freq = 1, siglev = TRUE, + listexp = c('Experiment A', 'Experiment B'), + fileout = 'vis_ratiosdrms_expA_expB_obsX.png') +``` + + +In this example, the ratio SD / RMS is calculated for the experiment A only +but against two observational datasets: + +```r +ratio_sdrms <- RatioSDRMS(plyr::take(ano$ano_exp, 1, 1), + abind::abind(ano$ano_obs, + ano$ano_obs + rnorm(length(ano$ano_obs), 0, 0.1), + along = 1)) +ratio_sdrms2 <- array(dim = c(dim(ratio_sdrms)[1:2], 4, dim(ratio_sdrms)[4])) +ratio_sdrms2[, , c(2, 4), ] <- ratio_sdrms +PlotVsLTime(ratio_sdrms2, + toptitle = "S. dev. over members and s. 
+            ytitle = "K", sizetit = 0.7,
+            monini = 12, freq = 1, siglev = TRUE,
+            listexp = c('Experiment A'),
+            listobs = c('Observation X', 'Observation X + rnorm(n, 0, 0.1)'),
+            fileout = 'vis_ratiosdrms_expA_obsX_obsXrnorm.png')
+```
+
+ - To plot `RMSSS()` of the ensemble mean at a single grid point or area
+averaged:
+
+```r
+rmsss <- RMSSS(Mean1Dim(ano$ano_exp, 2), Mean1Dim(ano$ano_obs, 2))
+rmsss2 <- array(dim = c(dim(rmsss)[1:2], 4, dim(rmsss)[4]))
+rmsss2[, , c(2, 4), ] <- rmsss
+PlotVsLTime(rmsss2,
+            toptitle = "RMSSS with observation X, over North Pacific",
+            ytitle = "K", sizetit = 0.7,
+            monini = 12, freq = 1, siglev = TRUE,
+            listexp = c('Experiment A', 'Experiment B'),
+            fileout = 'vis_rmsss_expA_expB_obsX.png')
+```
+
+ - To plot the effective number of independent data (`Eno()`):
+
+```r
+eno <- Eno(Mean1Dim(ano$ano_exp, 2), posdim = 2)
+eno2 <- array(dim = c(dim(eno)[1], 4, dim(eno)[2]))
+eno2[, 2, ] <- eno
+PlotVsLTime(eno2,
+            toptitle = "Effective n. of independent data, over North Pacific",
+            sizetit = 0.7,
+            monini = 12, freq = 1, siglev = FALSE, show_conf = FALSE,
+            listexp = c('Experiment A', 'Experiment B'),
+            fileout = 'vis_eno_expA_expB.png')
+```
+
+
+`Plot2VarsVsLTime()` allows plotting two indices or scores at a time on the
+same plot, each with its confidence intervals. It accepts as inputs arrays of
+only 3 dimensions:
+```r
+  c(n. of experiments, 3/4, n. of forecast times)
+```
+i.e., it accepts only indices or scores from a set of experimental datasets
+against a single observation.
+Its options are identical to the ones of `PlotVsLTime()` except that the
+parameter for specifying legends for observations (`listobs`) is replaced by a
+parameter to put a legend to distinguish the plotted variables, `listvars`.
+```r
+Plot2VarsVsLTime(corr[, 1, 1:3, ], rms[, 1, , ],
+                 toptitle = "Time correlation and RMSE with obs. X, over North Pacific",
+                 ytitle = "K", sizetit = 0.7,
+                 monini = 12, freq = 1,
+                 listexp = c('Experiment A', 'Experiment B'),
+                 listvars = c('Corr', 'RMSE'),
+                 fileout = 'vis_corr_rms_expA_expB_obsX.png')
+```
+
+
+`PlotACC()`, in contrast to `PlotVsLTime()`, accepts an additional dimension
+for the starting dates in the input, and the dimension of the confidence
+intervals is positioned at the end in place of the latitudes and longitudes,
+as provided by `ACC()`:
+```r
+  c(n. of experiments, n. of observational datasets, n. of starting dates, n. of forecast times, 4)
+```
+The ACCs and confidence intervals of all forecast times and starting dates of
+all experiments are plotted. In contrast to `PlotAno()`, the consecutive
+starting dates are overlapped so as to make it easy to compare their behaviour
+overall.
+
+All the starting dates of an experiment are painted with the same colour.
+There are two possible styles, switchable via the parameter `points`:
+ - Drawing curves along forecast times, as in `PlotVsLTime()`. In that case
+`fill` can enable filling the area limited by the confidence intervals.
+ - Drawing a point for each forecast time, with vertical lines joined to the
+extremes of the confidence interval, limited with notches (default).
+Besides, `linezero` draws a line at ordinate 0 and `vlines` draws vertical
+lines at any specified set of abscissae.
+
+```r
+map_data <- Load('tas', list(expA, expB), list(obsX),
+                 sdates = sdates,
+                 leadtimemin = 2, leadtimemax = 13,
+                 latmin = -30, latmax = 60,
+                 lonmin = -90, lonmax = 20,
+                 output = 'lonlat', grid = 't106grid',
+                 method = 'distance-weighted')
+ano <- Ano_CrossValid(map_data$mod, map_data$obs, memb = FALSE)
+acc <- ACC(ano$ano_exp, ano$ano_obs)
+PlotACC(acc$ACC, selected_sdates,
+        toptitle = "Spatial anomaly corr. coeff. with obs X over Atlantic",
+        ytitle = "K", sizetit = 0.7, freq = 1,
+        legends = c('Experiment A', 'Experiment B'),
+        fileout = 'vis_acc_expA_expB_obsX.png')
+```
+
+
+`PlotBoxWhisker()`
+
+PlotTimeSeries
+--------------
+**Note:** PlotTimeSeries is currently still under development, but a beta
+version can be downloaded from the develop-PlotTimeSeries branch of the
+s2dverification GitLab page.
+
+The function `PlotTimeSeries()` is an agglomerate of the time series plotting
+functions from previous versions of `s2dverification`, with enhanced
+functionality. The function is still evolving, to ensure that it is compatible
+with the common R data structure which is currently being discussed within the
+QA4Seas project.
+
+In brief, the `PlotTimeSeries()` function takes common data model (CDM)
+objects or arrays of any number of (named) dimensions and creates scatter
+plots or draws curves along them, as well as plotting their means and
+max/mins. The input data can include metadata that is automatically
+incorporated into the plot, such as the name of the plotted variable, the
+start dates of a set of provided seasonal forecasts, the date at each forecast
+time step or the names of the members. The user can manually specify which
+dimensions to colour or style along, but the function is designed to
+anticipate the user's requirements. The advantages of the new function are its
+increased ease of use and versatility, as well as the improved quality of the
+final plots due to the use of the `ggplot2` package.
+
+### Plotting anomalies and climatologies
+
+`PlotTimeSeries()` takes CDM objects or arrays as input, with the following
+dimensions:
+
+```r
+c(dataset, member, start dates, forecast times).
+```
+Objects with different numbers of dimensions, or different orderings, can also
+be handled by the function, by appropriate specification of the
+`curves_along`, `mean_along`, and `colour_along` parameters. When a CDM object
+is input, the default setting is for the function to calculate the
+multi-member means for each dataset and start date and plot them in different
+colours, and shade between the min/max.
+
+```r
+library(s2dverification)
+PlotTimeSeries(ano_exp) + PlotTimeSeries(ano_obs, add = TRUE)
+```
+
+
+
+--------------------
+
+The `PlotTimeSeries()` function has read the x- and y-axis labels, the title
+and the legend from the metadata automatically. By default, a horizontal line
+is plotted along `y = 0`, and this line can be shifted or removed with
+`intercept`. Layers can be added by selecting `add = TRUE`, as in the above
+example, where the dataset of observations has been added to the plot. The
+user can plot the geometric objects (the mean, confidence intervals, curves
+etc.) along any of the dimensions, as well as adding points, changing the line
+styles and removing any of the objects. For example, the curves for the
+individual members can be replaced with points, with different shapes for the
+different members, and the shading between the minimum and maximum can be
+removed as follows.
+
+```r
+PlotTimeSeries(ano_exp, minmax_along = NA, points = TRUE, shape_along = 2,
+               curves_along = NA)
+```
+
+
+
+
+
+### Plotting scores and sample statistics
+
+If the input to the function is a CDM object containing sample statistics,
+e.g. the correlation, then these statistics and their associated confidence
+intervals (if available) will automatically be plotted, with the intervals
+drawn as error bars. Alternatively, the user can specify the confidence
+intervals to be plotted as dashed lines (and the sample statistic as a solid
+line) with `interval_type`.
+
+```r
+PlotTimeSeries(Corr)
+PlotTimeSeries(Corr, interval_type = "line")
+```
+
+
+
+
+
+
+### Conclusions
+
+The use of the R package `ggplot2` in `PlotTimeSeries()` results in high
+quality time series plots, as well as simplifying the creation of
+sophisticated graphics. Another benefit of the `ggplot2` package is that the
+user can easily modify the output of `PlotTimeSeries()` to configure the plot
+title, axis labels and legends.
+
+The function is being refined to accommodate a wide range of user
+requirements. This work is being conducted alongside changes currently being
+made to the `s2dverification` and `downscaleR` packages to ensure a common R
+data structure.
+
+
+Plotting maps
+-------------
+
+This group of functions allows plotting grid data (i.e. defined over latitudes
+and longitudes) on a rectangular equidistant projection or on a stereographic
+projection (as of s2dverification v2.5.0), as well as depth sections (i.e.
+defined over latitudes/longitudes and depth levels).
+
+Regarding the functions to plot maps, by default each grid point is drawn on a
+world map with a colour as a function of the magnitude of the provided field,
+index or score at that point. While `PlotEquiMap()` and `PlotStereoMap()` plot
+data of a single forecast time, `AnimateMap()` creates an animation with the
+plotted maps of all forecast times obtained with one of the previous functions
+at choice.
+
+`PlotEquiMap()` and `PlotStereoMap()` share some traits:
+ - Both expect a data matrix in the parameter `var` of dimensions
+`c(n. of latitudes, n. of longitudes)`, a vector with the latitudes of the
+centers of the grid cells in `lat` and a vector with the longitudes in `lon`.
+`Load()` already provides these in the correct format, through the components
+`$lon` and `$lat`.
+ - The N colours to paint the grid cells with can be specified (via `cols`),
+as well as the N + 1 threshold magnitudes (via `brks`) that allow assigning
+each grid cell value one of the N colours. The colour for any missing values
+can be adjusted with `colNA`.
+ - Whether to fill the map continents or only draw coast wires can be adjusted
+with `filled.continents`.
+ - A matrix of `dots` can be plotted over the drawn map to, e.g., highlight
+cells where a skill has been greatest or most significant.
+ - A legend can be drawn (unless disabled with `drawleg`) and the amount of
+ticks on that legend can be adjusted with `subsampleg`.
+ - They accept any additional parameters via the parameter `...` to be sent to
+the underlying R graphics `image()` function for fine tuning.
+
+### PlotEquiMap()
+
+Next, some examples of maps obtained with `PlotEquiMap()`:
+```r
+PlotEquiMap(Mean1Dim(map_data$mod, 2)[1, 1, 1, , ], map_data$lon, map_data$lat,
+            toptitle = "Exp. A 'tas', s. date: 1990-11-01, f. time: 1 month",
+            units = "K", fileout = 'vis_equimap_raw_expA.png')
+
+PlotEquiMap(Mean1Dim(map_data$obs, 2)[1, 1, 1, , ], map_data$lon, map_data$lat,
+            toptitle = "Obs. X: 'tas', 1990-12-01",
X: 'tas', 1990-12-01", + units = "K", filled.continents = FALSE, fileout = 'vis_equimap_raw_obsX.png') +``` + + + +```r +PlotEquiMap(Mean1Dim(map_data$mod, 2)[1, 1, 1, , ], map_data$lon, map_data$lat, + toptitle = "Exp. A 'tas', s. date: 1990-11-01, f. time: 1 month", + units = "K", brks = 100, bar_limits = c(255, 300), + filled.continents = 'black', + fileout = 'vis_equimap_cols_raw_expA.png') + +PlotEquiMap(Mean1Dim(map_data$obs, 2)[1, 1, 1, , ], map_data$lon, map_data$lat, + toptitle = "Obs. X 'tas', 1990-12-01", + units = "K", brks = 100, bar_limits = c(255, 300), + filled.continents = FALSE, + fileout = 'vis_equimap_cols_raw_obsX.png') +``` + + + + +Or, as seen in the example from [**Snippet 2**](snippets.md#snippet2): + + + + + +`PlotEquiMap()` has some other additional features: + - Drawing contour lines: `square` allows to smooth the borders of each +coloured region and draws contour lines and figures. The contours can be +adjusted by providing a matrix to `contours` with dimensions +`c(n. of longitudes, n. of latitudes)`. The contours of the data in that +matrix will be drawn over the map, using the thresholds provided in `brks2`. +This is useful to plot the contours of the original data without border +smoothing, to plot only some of the default contours or to plot other contours +defined by another field. + +```r +PlotEquiMap(Mean1Dim(map_data$mod, 2)[1, 1, 1, , ], map_data$lon, map_data$lat, + toptitle = "Exp. A 'tas', s. date: 1990-11-01, f. time: 1 month", + units = "K", brks = 100, bar_limits = c(255, 300), square = FALSE, + fileout = 'vis_equimap_contour_raw_expA.png') + +PlotEquiMap(Mean1Dim(map_data$obs, 2)[1, 1, 1, , ], map_data$lon, map_data$lat, + toptitle = "Obs. X 'tas', 1990-12-01", + units = "K", brks = 100, bar_limits = c(255, 300), + square = FALSE, contour_lty = 2, + fileout = 'vis_equimap_contour_raw_obsX.png') +``` + + + + + - Drawing boxes on the map: `boxlim`, `boxcol` and `boxlwd` allow to +specify the position of the corners, colour and thickness of a box to be drawn +on the map. + +```r +PlotEquiMap(Mean1Dim(map_data$obs, 2)[1, 1, 1, , ], map_data$lon, map_data$lat, + toptitle = "Obs. X 'tas', 1990-12-01", + units = "K", brks = 100, bar_limits = c(255, 300), + boxlim = list(c(-62, -22, -23, 15), c(-30, 10, 0, 32)), + boxcol = c('purple', 'blue'), boxlwd = c(8, 4), + coast_color = 'black', + fileout = 'vis_equimap_box_expA.png') +``` + + + + - Ticks on the longitude/latitude axes can be adjusted with `axelab`, `labW`, +`intylat` and `intxlon`. + - `numbfig` allows to specify the number of figures in the final layout in +order to adjust some spacing parameters. + +### PlotStereoMap() + +`PlotStereoMap()` can be called almost identically to `PlotEquiMap()`, except +that it requires the `latlims` parameter to specify which range of latitudes to +be plotted on the stereographic projection map. +The only special parameter is the `intlat`, to set the interval in degrees +between consecutive latitude circles on the plot. + +Next a couple of examples: + +```r +## Loading data over the entire globe +world_data <- Load('tas', list(expA, expB), list(obsX), + sdates = sdates[1], + leadtimemin = 2, leadtimemax = 13, + latmin = -90, latmax = 90, + lonmin = 0, lonmax = 360, + output = 'lonlat', grid = 't106grid', + method = 'distance-weighted') +PlotStereoMap(Mean1Dim(world_data$mod, 2)[1, 1, 1, , ], + world_data$lon, world_data$lat, c(40, 90), + brks = 100, bar_limits = c(240, 290), + toptitle = "Exp. A 'tas', s. date: 1990-11-01, f. 
+### PlotStereoMap()
+
+`PlotStereoMap()` can be called almost identically to `PlotEquiMap()`, except
+that it requires the `latlims` parameter to specify the range of latitudes to
+be plotted on the stereographic projection map.
+The only other special parameter is `intlat`, which sets the interval in
+degrees between consecutive latitude circles on the plot.
+
+Here are a couple of examples:
+
+```r
+## Loading data over the entire globe
+world_data <- Load('tas', list(expA, expB), list(obsX),
+                   sdates = sdates[1],
+                   leadtimemin = 2, leadtimemax = 13,
+                   latmin = -90, latmax = 90,
+                   lonmin = 0, lonmax = 360,
+                   output = 'lonlat', grid = 't106grid',
+                   method = 'distance-weighted')
+PlotStereoMap(Mean1Dim(world_data$mod, 2)[1, 1, 1, , ],
+              world_data$lon, world_data$lat, c(40, 90),
+              brks = 100, bar_limits = c(240, 290),
+              toptitle = "Exp. A 'tas', s. date: 1990-11-01, f. time: 1 month",
+              title_scale = 0.5,
+              units = "K", filled.continents = 'black',
+              fileout = 'vis_stereomap_raw_expA.png')
+
+PlotStereoMap(Mean1Dim(world_data$obs, 2)[1, 1, 10, , ],
+              world_data$lon, world_data$lat, c(-90, -60), intlat = 10,
+              brks = 100, bar_limits = c(240, 290),
+              boxlim = list(c(0, -85, 135, -70)),
+              toptitle = "Obs. X 'tas', 1990-12-01",
+              title_scale = 0.5,
+              units = "K", fileout = 'vis_stereomap_raw_obsX.png')
+```
+
+### AnimateMap()
+
+`AnimateMap()` generates animations of maps from `PlotEquiMap()` or
+`PlotStereoMap()` and saves them in GIF files at the paths specified in the
+parameter `fileout`. Similarly to `PlotVsLTimes()`, it receives as its main
+parameter an array with the dimensions:
+```r
+  c(n. of experiments/observations, n. of observations, 3, n. of forecast times, n. of latitudes, n. of longitudes)
+```
+the first 3 being optional and the 3rd dimension corresponding to the lower
+confidence bound, the actual value and the upper confidence bound. This means
+it can plot maps of a field from one or multiple experimental or
+observational datasets, averaged across ensemble members and starting dates
+(climatologies), or of indices or scores computed across starting dates
+against one or more observations. The confidence intervals, if provided, are
+used to draw black dots on grid cells that reach a 95% significance level, if
+requested via `msk95lev`.
+
+The provided data is assumed to have a monthly frequency and to start in
+January; otherwise, this can be adjusted with `monini` and `freq`.
+
+Whether to plot with an equidistant rectangular projection or a stereographic
+projection can be chosen with the parameter `equi`. The plot can be limited to
+a sub-region of the provided data via the parameters `lonmin`, `lonmax`,
+`latmin` and `latmax`.
+
+`AnimateMap()` allows for the typical adjustments of the map plotting
+functions: specifying the palette colours and breaks (`cols` and `brks`),
+displaying or hiding the legend (`drawleg`, `subsampleg`), selecting a colour
+for the NA values (`colNA`) and filling or not filling the continents
+(`filled.continents`).
+
+Further title and axis tick adjustments can be achieved with `toptitle`,
+`sizetit`, `units`, `intlon` and `intlat`.
+
+Here are a few examples:
+
+```r
+map_clim <- Clim(map_data$mod, map_data$obs, memb = FALSE)
+
+cols <- clim.colors(50)
+
+data_min <- min(map_data$mod, map_data$obs, na.rm = TRUE)
+data_max <- max(map_data$mod, map_data$obs, na.rm = TRUE)
+brks <- round(seq(data_min, data_max, length.out = length(cols) + 1), 2)
+
+AnimateMap(Subset(map_clim$clim_exp, 'dataset', 1),
+           map_data$lon, map_data$lat, monini = 12,
+           toptitle = "Exp. A climatologies.",
+           units = "K", brks = brks, cols = cols,
+           fileout = "vis_anim_clim_expA.gif")
+```
+
+And, as seen in [**Snippet 2**](snippets.md#snippet2), the animations of the
+actual time correlations of Experiments A and B against Observation X over
+the Atlantic can be generated, with black dots on values that reach a 95%
+significance level.
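+A rough sketch of how such a correlation animation could be produced follows
+(illustrative only; see Snippet 2 for the complete version). It assumes that
+the first three entries along the statistics dimension of the `Corr()` output
+are the lower confidence bound, the correlation and the upper bound, in the
+order expected by `AnimateMap()`; the output file names are hypothetical:
+
+```r
+## Correlate the ensemble means of each experiment with the observations
+## along the starting date dimension.
+map_corr <- Corr(Mean1Dim(map_data$mod, 2), Mean1Dim(map_data$obs, 2),
+                 posloop = 1, poscor = 2)
+## Keep the lower bound, correlation and upper bound, then animate with
+## black dots on the grid cells that reach the 95% significance level.
+AnimateMap(map_corr[, , 1:3, , , , drop = FALSE],
+           map_data$lon, map_data$lat, monini = 12, msk95lev = TRUE,
+           toptitle = "Time correlation with Obs. X",
+           fileout = c("vis_anim_corr_expA.gif", "vis_anim_corr_expB.gif"))
+```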
+Maps of the entire globe, as well as stereographic projection maps, can also
+be animated:
+
+```r
+world_clim <- Clim(world_data$mod, world_data$obs, memb = FALSE)
+AnimateMap(Subset(world_clim$clim_exp, 'dataset', 1),
+           world_data$lon, world_data$lat,
+           filled.continents = FALSE, monini = 12,
+           toptitle = "Exp. A climatologies.", units = "K",
+           brks = brks, cols = cols,
+           fileout = 'vis_anim_clim_expA_world.gif')
+AnimateMap(world_clim$clim_obs,
+           world_data$lon, world_data$lat, equi = FALSE,
+           latmin = 60, latmax = 90, monini = 12,
+           toptitle = "Obs. X climatologies.",
+           units = "K", brks = brks, cols = cols,
+           fileout = "vis_anim_clim_obsX_world.gif")
+```
+
+### PlotLayout()
+
+This function allows combining plots from any R plotting function into
+layouts. The input data can be provided in multiple data arrays, and a common
+colour bar can be set for the multi-panel.
+
+A common use case is to plot the ensemble members at a given time step:
+
+```r
+PlotLayout(PlotEquiMap, c('lat', 'lon'),
+           Subset(map_data$mod,
+                  list('dataset', 'sdate', 'ftime'),
+                  list(1, 1, 1)),
+           map_data$lon, map_data$lat, units = 'K',
+           toptitle = "Exp. A 'tas', s. date: 1990-11-01, f. time: 1 month",
+           titles = paste("Member", 1:dim(map_data$mod)[2]),
+           brks = 50, bar_limits = c(250, 300), coast_color = 'black',
+           fileout = "vis_layout_equimap_expA.png")
+```
+
+Fairly complex layouts can be achieved thanks to the large number of
+available parameters:
+
+```r
+ens_mean <- Subset(world_data$mod,
+                   list('dataset', 'sdate', 'ftime'),
+                   list(1, 1, 1))
+layout <- PlotLayout(fun = c('PlotEquiMap', 'plot', 'plot', 'PlotStereoMap'),
+                     plot_dims = c('lat', 'lon'),
+                     var = list(ens_mean, array(1:10),
+                                array(10:1), ens_mean),
+                     lon = world_data$lon, lat = world_data$lat,
+                     fill = 'black', sizetit = 0.5, axes_label_scale = 0.6,
+                     coast_color = 'yellow',
+                     titles = paste('Fig.', 1:12), toptitle = 'Multipanel',
+                     row_titles = paste('Row', 1:3), col_titles = paste('Col', 1:4),
+                     drawleg = 'E', units = 'K', units_scale = 2,
+                     bar_limits = c(275, 300), brks = 26,
+                     bar_extra_labels = 281:284,
+                     width = 12, height = 10, res = 200,
+                     fileout = 'vis_layout_complex.png')
+```
+
+### PlotSection()
+