Issue with Calibration , nans and precipitation
Hi @aho @nperez @lpalma @amartine
I have some news about this code with some additions to it suggested by @cdelgado . It's about finding a way to solve problems derived from data with values of 0 mm/day (where it doesn't rain)
We've found that one of the ensemble members has points where there's no rain. This messes up with the calculation of some metrics and we have used another function inside the code: tryCatch. We are saying: try to compute the calibration, and if there are Nans, return the Nans.
It still doesn't work completely - I'm working on it - as it returns only the raw data and not the rest of the metrics. Another option I had is to edit the Calibration function setting na.rm to FALSE and call that function instead of CSTools_Calibration. I think it could be great if we find a way to overcome this problem as it's a common issue when calibrating precipitation forecasts.
Here's the code updated with the tryCatch function:
Thanks a lot for your help in advance
# Load startR
library(startR)
library(s2dv)
library(multiApply)
ncores <- 1
lons.min <- -20
lons.max <- 41
lats.min <- -40
lats.max <- 40
# Declaration of data sources
var_name = 'prlr'
retrieve <- TRUE
methods <- c('raw','bias','evmos','mse_min','crps_min','rpc-based')
exp_path <- '/esarchive/exp/ecmwf/system5c3s/monthly_mean/$var$_s0-24h/$var$_$sdate$01.nc'
grid <- '/esarchive/exp/ecmwf/system5c3s/monthly_mean/prlr_s0-24h/prlr_20000501.nc'
exp <- startR::Start(dat = exp_path,
var = var_name,
sdate = c("200011","200111","200211","200311"),
ensemble = 'all',
time = indices(1:6),
latitude = values(list(lats.min, lats.max)),
latitude_reorder = Sort(decreasing = T),
longitude = values(list(lons.min, lons.max)),
longitude_reorder = CircularSort(-180,180),
transform = CDORemapper,
transform_extra_cells = 8,
transform_params = list(grid = grid,
method = 'conservative',
crop = c(lons.min, lons.max, lats.min, lats.max)),
transform_vars = c('latitude', 'longitude'),
synonims = list(latitude = c('lat', 'latitude'),
longitude = c('lon', 'longitude')),
return_vars = list(latitude = 'dat',
longitude = 'dat',
time = 'sdate'),
retrieve = retrieve)
dates <- attr(exp, 'Variables')$common$time
obs_path <- '/esarchive/recon/ecmwf/era5/monthly_mean/$var$_s0-24h/$var$_$date$.nc'
#dates <- format(dates, '%Y%m')
dates_file <- format(dates, '%Y%m')
dim(dates_file) <- list(sdate = 4,
time = 6)
obs <- Start(dat = obs_path,
var = var_name,
# date = unique(format(dates, '%Y%m')),
date = dates_file,
# time = values(dates),
latitude = values(list(lats.min, lats.max)),
latitude_reorder = Sort(decreasing = T),
longitude = values(list(lons.min, lons.max)),
longitude_reorder = CircularSort(-180,180),
synonims = list(longitude = c('lon', 'longitude'),
latitude = c('lat', 'latitude')),
transform = CDORemapper,
transform_extra_cells = 8,
transform_params = list(grid = grid,
method = 'conservative',
crop = c(lons.min, lons.max, lats.min, lats.max)),
transform_vars = c('latitude', 'longitude'),
split_multiselected_dims = TRUE,
# time_across = 'date',
# merge_across_dims = TRUE,
return_vars = list(time = 'date',
latitude = 'dat',
longitude = 'dat'),
retrieve = retrieve)
#Function
skill_calibration <- function(exp, obs, methods) {
library(s2dv)
library(CSTools)
library(multiApply)
# source('/esarchive/scratch/ptrascas/gitlab/pasdfasdf/correlation_eno.R')
correlation_eno <- function(exp, obs, time_dim = 'sdate', member_dim = 'ensemble', alpha = 0.05){
exp <- s2dv::MeanDims(data = exp, dims = member_dim, na.rm = FALSE)
cor = NULL
cor$r = cor(exp,obs) # Correlation coefficient
n_eff = s2dv::Eno(data = obs, time_dim = time_dim, na.action = na.pass, ncores = 1)
t_alpha2_n2 = qt(p=alpha/2, df = n_eff-2, lower.tail = FALSE)
t = abs(cor$r) * sqrt(n_eff-2) / sqrt(1-cor$r^2)
if (anyNA(c(t,t_alpha2_n2)) == FALSE & t >= t_alpha2_n2){
cor$sign = TRUE
} else {
cor$sign = FALSE
}
return(cor)
}
exp <- s2dv::MeanDims(data = exp, dims = 'time', na.rm = FALSE)
obs <- s2dv::MeanDims(data = obs, dims = 'time', na.rm = FALSE)
# Create an empty list that will contain the output of the "wrap_cal":
# One list per calibration method, with the different statistical metrics embedded on it
metrics <- list()
calibrated <- list() #, rpss = NULL)
print ("line 104")
# Loop over each calibration method to calculate its metrics
for (method in methods){
if (!identical(method,'raw')){
# Calibration function
calibrated[[method]] <- tryCatch(
{
return(CSTools::Calibration(exp = exp, obs = obs, cal.method = method, eval.method = "leave-one-out",
multi.model = FALSE, na.fill = TRUE, na.rm = FALSE, memb_dim = 'ensemble', sdate_dim = 'sdate', apply_to = 'sign', alpha = 0.1, ncores = 1))
},
error = function(cond) {
return(NA*exp)
}
)
} else {
calibrated[[method]] <- exp
}
# Metrics for each method
metrics$acc[[method]] <- correlation_eno(exp = calibrated[[method]], obs = obs, time_dim = 'sdate', member_dim = 'ensemble', alpha = 0.05)
}
print ("line 118")
return(list(raw_calib = s2dv::Reorder(data = calibrated$raw, order = c('sdate','ensemble')),
acc_raw_r = metrics$acc$raw$r,
acc_raw_sign = metrics$acc$raw$sign,
bias_calib = s2dv::Reorder(data = calibrated$bias, order = c('sdate', 'ensemble')),
acc_bias_r = metrics$acc$bias$r,
acc_bias_sign = metrics$acc$bias$sign,
evmos_calib = s2dv::Reorder(data = calibrated$evmos, order = c('sdate', 'ensemble')),
acc_evmos_r = metrics$acc$evmos$r,
acc_evmos_sign = metrics$acc$evmos$sign,
mse_min_calib = s2dv::Reorder(data = calibrated$mse_min,order = c('sdate', 'ensemble')),
acc_mse_min_r = metrics$acc$mse_min$r,
acc_mse_min_sign = metrics$acc$mse_min$sign,
crps_min_calib = s2dv::Reorder(data = calibrated$crps_min,order = c('sdate', 'ensemble')),
acc_crps_min_r = metrics$acc$crps_min$r,
acc_crps_min_sign = metrics$acc$crps_min$sign,
rpc_based_calib = s2dv::Reorder(data = calibrated[['rpc-based']],order = c('sdate', 'ensemble')),
acc_rpc_based_r = metrics$acc[['rpc-based']]$r,
acc_rpc_based_sign = metrics$acc[['rpc-based']]$sign))
}
print ("line 140")
if (isTRUE(retrieve)){
## for one grid point
# metrics <- skill_calibration(exp = drop(exp), obs = drop(obs), methods=methods)
## paralelised
output <- multiApply::Apply(data = list(exp = exp, obs = obs), target_dims = list(exp = c('sdate','ensemble','time'), obs = c('sdate','time')), fun = skill_calibration, methods=methods, ncores = ncores)
} else if (isFALSE(retrieve)){
print ("line 150")
# Target dimensions are 'years' and ensemble for the forecast and 'year' for observations
# Chunking will take place, where different latitude and longitude points will be sent as different jobs to Nord3
# step <- startR::Step(fun = skill_calibration,
# target_dims = list(exp = c('year','month','ensemble'), obs = c('year','month')),
# output_dims = list(raw_calib = NULL, acc_raw_r = NULL, acc_raw_sign = NULL,
# bias_calib = NULL,acc_bias_r = NULL, acc_bias_sign = NULL,
# evmos_calib = NULL,acc_evmos_r = NULL, acc_evmos_sign = NULL,
# mse_min_calib = NULL,acc_mse_min_r = NULL, acc_mse_min_sign= NULL,
# crps_min_calib = NULL,acc_crps_min_r = NULL, acc_crps_min_sign = NULL,
# rpc_based_calib = NULL,acc_rpc_based_r = NULL, acc_rpc_based_sign = NULL))
step <- startR::Step(fun = skill_calibration,
target_dims = list(exp = c('sdate','ensemble','time'), obs = c('sdate','time')),
# Especificar output dims:
output_dims = list(raw_calib = c('sdate','ensemble'),
acc_raw_r = NULL, acc_raw_sign = NULL,
bias_calib = c('sdate','ensemble'),
acc_bias_r = NULL, acc_bias_sign = NULL,
evmos_calib = c('sdate','ensemble'),
acc_evmos_r = NULL, acc_evmos_sign = NULL,
mse_min_calib = c('sdate','ensemble'),
acc_mse_min_r = NULL, acc_mse_min_sign= NULL,
crps_min_calib = c('sdate','ensemble'),
acc_crps_min_r = NULL, acc_crps_min_sign = NULL,
rpc_based_calib = c('sdate','ensemble'),
acc_rpc_based_r = NULL, acc_rpc_based_sign = NULL))
print ("Line 160")
wf <- startR::AddStep(inputs = list(exp = exp, obs = obs),
methods=methods,
step_fun = step)[[1]]
print ("Line 164")
#output <- startR::Compute(workflow = wf,
# chunks = list(longitude = 2),
# threads_load = 1,
# threads_compute = 2)
output <- startR::Compute(workflow = wf,
chunks = list(longitude = 2),
threads_load = 1,
threads_compute = ncores,
cluster = list(queue_host = 'nord4',
queue_type = 'slurm',
temp_dir = '/gpfs/scratch/bsc32/bsc32413/startR_hpc/', #############################3
CDO_module = 'CDO/1.9.8-foss-2019b',
cores_per_job = ncores,
job_wallclock = '02:00:00',
max_jobs = 10,
bidirectional = FALSE,
polling_period = 10),
ecflow_suite_dir = "/home/Earth/ptrascas/Desktop/startR_local/", ################################
wait = TRUE)
print ("Line 180")
} else {stop('retrieve must be TRUE or FALSE')}
lat <- as.numeric(attr(obs,'Variables')$dat1$latitude)
lon <- as.numeric(attr(obs,'Variables')$dat1$longitude)
saveRDS(object = list(output = output, lat = lat, lon = lon),
file = 'test_nuria.rds')
# Close device