ex1_2_exp_obs_attr.R 10.2 KB
Newer Older
#---------------------------------------------------------------------
# This script tells you how to load experimental and observational data in a 
# consistent way, facilating the following comparison. We use the attributes of
# the experimental data to define the selectors of obs Start() call, so they 
# can have the same dimension structure.

# Spatial dimensions:
# The exp and obs data happen to have the same spatial resolution (256x512) and
# the grids are not shifted, so we don't need to regrid them. However, their latitude
# orders are opposite. exp has ascending order while obs has descending order. 
# To make them consistent, we need to use "_reorder" parameter. In fact, it is
# always recommended using "lat_reorder" and "lon_reorder" to ensure you get
# the expected latitude and longitude.

# Temporal dimensions:
# The exp and obs files have different date/time structure. exp has one file per year and 
# each file has 12 time steps. obs has one file per month and each file has 1 time step.
# To shape the obs array as exp, we need to use the time attribute of exp to define
# the date/time selector of obs. You can see how to use parameter '*_across', 'merge_across_dims',
# and 'split_multiselected_dims' to achieve the goal.

#---------------------------------------------------------------------
library(startR)

# exp
repos_exp <- paste0('/esarchive/exp/ecearth/a1tr/cmorfiles/CMIP/EC-Earth-Consortium/',
                    'EC-Earth3/historical/r24i1p1f1/Amon/$var$/gr/v20190312/',
                    '$var$_Amon_EC-Earth3_historical_r24i1p1f1_gr_$sdate$01-$sdate$12.nc')

exp <- Start(dat = repos_exp,
             var = 'tas',
             sdate = as.character(c(2005:2008)),
             time = indices(1:3),
             lat = 'all',
             lat_reorder = Sort(),
             lon_reorder = CircularSort(0, 360),
             synonims = list(lat = c('lat', 'latitude'),
                             lon = c('lon', 'longitude')),
             return_vars = list(lon = NULL,
                                lat = NULL,
                                time = 'sdate'),
             retrieve = FALSE)

# Retrieve attributes for observational data retrieval.
## Because latitude order in experiment is [-90, 90] but in observation is [90, -90],
## latitude values need to be retrieved and used below.
lats <- attr(exp, 'Variables')$common$lat
lons <- attr(exp, 'Variables')$common$lon
## The 'time' attribute is a two-dim array
dates <- attr(exp, 'Variables')$common$time
aho's avatar
aho committed
#sdate time 
#    4    3 
dates
# [1] "2005-01-16 12:00:00 UTC" "2006-01-16 12:00:00 UTC"
# [3] "2007-01-16 12:00:00 UTC" "2008-01-16 12:00:00 UTC"
# [5] "2005-02-15 00:00:00 UTC" "2006-02-15 00:00:00 UTC"
# [7] "2007-02-15 00:00:00 UTC" "2008-02-15 12:00:00 UTC"
# [9] "2005-03-16 12:00:00 UTC" "2006-03-16 12:00:00 UTC"
#[11] "2007-03-16 12:00:00 UTC" "2008-03-16 12:00:00 UTC"

#-------------------------------------------

# obs
# 1. For lat, use the experiment attribute or reversed indices. For lon, it is not necessary 
# because their lons are identical, but either way works.
# 2. For dimension 'date', it is a vector involving the 3 months (ftime) of the four years (sdate).
# 3. Dimension 'time' is assigned by the matrix, so we can split it into 'sdate' and 'time' 
# by 'split_multiselected_dims'.
# 4. Because 'time' is actually across all the files, so we need to specify 'time_across'. 
# Then, use 'merge_across_dims' to make dimension 'date' disappears. 
# At this moment, the dimension is 'time = 12'. 
# 5. However, we want to seperate year and month (which are 'sdate' and 'ftime' in 
# experimental data). So we use 'split_multiselected_dims' to split 'time' into the two dimensions.

repos_obs <- '/esarchive/recon/ecmwf/erainterim/monthly_mean/$var$_f6h/$var$_$date$.nc'

obs <- Start(dat = repos_obs,
             var = 'tas',
             date = unique(format(dates, '%Y%m')),
             time = values(dates),  #dim: [sdate = 4, time = 3]
             lat = 'all',
             lat_reorder = Sort(),
             lon = 'all',
             lon_reorder = CircularSort(0, 360),
             time_across = 'date',
             merge_across_dims = TRUE,
             split_multiselected_dims = TRUE,
             synonims = list(lat = c('lat', 'latitude'),
                             lon = c('lon', 'longitude')),
             return_vars = list(lon = NULL,
                                lat = NULL,
                                time = 'date'),
             retrieve = FALSE)

#====================================================
# Check the attributes. They should be all identical.
#====================================================

##-----dimension-----
print(attr(exp, 'Dimensions'))
#  dat   var sdate  time   lat   lon 
#    1     1     4     3   256   512 

print(attr(obs, 'Dimensions'))
#  dat   var sdate  time   lat   lon 
#    1     1     4     3   256   512 

##-----time----- 
##  They're not identical but the years and months are. See below for more details.
print(attr(exp, 'Variables')$common$time)
aho's avatar
aho committed
# [1] "2005-01-16 12:00:00 UTC" "2006-01-16 12:00:00 UTC"
# [3] "2007-01-16 12:00:00 UTC" "2008-01-16 12:00:00 UTC"
# [5] "2005-02-15 00:00:00 UTC" "2006-02-15 00:00:00 UTC"
# [7] "2007-02-15 00:00:00 UTC" "2008-02-15 12:00:00 UTC"
# [9] "2005-03-16 12:00:00 UTC" "2006-03-16 12:00:00 UTC"
#[11] "2007-03-16 12:00:00 UTC" "2008-03-16 12:00:00 UTC"
print(attr(obs, 'Variables')$common$time)
# [1] "2005-01-31 18:00:00 UTC" "2006-01-31 18:00:00 UTC"
# [3] "2007-01-31 18:00:00 UTC" "2008-01-31 18:00:00 UTC"
# [5] "2005-02-28 18:00:00 UTC" "2006-02-28 18:00:00 UTC"
# [7] "2007-02-28 18:00:00 UTC" "2008-02-29 18:00:00 UTC"
# [9] "2005-03-31 18:00:00 UTC" "2006-03-31 18:00:00 UTC"
#[11] "2007-03-31 18:00:00 UTC" "2008-03-31 18:00:00 UTC"

##-----lat-----
print(attr(exp, 'Variables')$common$lat[1:3])
#[1] -89.46282 -88.76695 -88.06697
print(attr(exp, 'Variables')$common$lat[256])
#[1] 89.46282

print(attr(obs, 'Variables')$common$lat[1:3])
#[1] -89.46282 -88.76695 -88.06697
print(attr(obs, 'Variables')$common$lat[256])
#[1] 89.46282

##-----lon-----
print(attr(exp, 'Variables')$common$lon[1:3])
#[1] 0.000000 0.703125 1.406250
print(attr(obs, 'Variables')$common$lon[1:3])
#[1] 0.000000 0.703125 1.406250


#=======================
# About time attributes
#=======================
# You may notice that the date and time of exp and obs are not the same.
# In this case, the data are monthly data, so only the years and months matter.
# The thing worth noticing is that the actual time values of obs are half month different
# from the values we assigned. For example, the first time from exp is "2005-01-16 12:00:00 UTC",
# and the obs time we get is "2005-01-31 18:00:00 UTC". 
# If the provided selector is value, Start() looks for the closest value in the data.
# So for "2005-01-16 12:00:00 UTC", the two closest obs values are "2004-12-31 18:00:00 UTC" and
# "2005-01-31 18:00:00 UTC", and the later one is the closest and happen to be the desired one. 
# It's fortunate that in this case, all the provided values are closer to the values we want.

#----- 1. Manually adjust the values -----
# It is always necessary to check the data attributes before and after data retrieval.
# If the provided exp values are quite in the middle of two values in obs, we can adjust a bit to
# make exp values closer to the desired obs values.
dates_adjust <- dates + 86400*15
dates_adjust
# [1] "2005-01-31 12:00:00 UTC" "2006-01-31 12:00:00 UTC"
# [3] "2007-01-31 12:00:00 UTC" "2008-01-31 12:00:00 UTC"
# [5] "2005-03-02 00:00:00 UTC" "2006-03-02 00:00:00 UTC"
# [7] "2007-03-02 00:00:00 UTC" "2008-03-01 12:00:00 UTC"
# [9] "2005-03-31 12:00:00 UTC" "2006-03-31 12:00:00 UTC"
#[11] "2007-03-31 12:00:00 UTC" "2008-03-31 12:00:00 UTC"

obs2 <- Start(dat = repos_obs,
             var = 'tas',
             date = unique(format(dates, '%Y%m')),
             time = values(dates_adjust),  # use the adjust ones
             lat = 'all',
             lat_reorder = Sort(),
             lon = 'all',
             lon_reorder = CircularSort(0, 360),
             time_across = 'date',
             merge_across_dims = TRUE,
             split_multiselected_dims = TRUE,
             synonims = list(lat = c('lat', 'latitude'),
                             lon = c('lon', 'longitude')),
             return_vars = list(lon = NULL,
                                lat = NULL,
                                time = 'date'),
             retrieve = FALSE)

# The time should be the same as obs above.
print(attr(obs2, 'Variables')$common$time)
# [1] "2005-01-31 18:00:00 UTC" "2006-01-31 18:00:00 UTC"
# [3] "2007-01-31 18:00:00 UTC" "2008-01-31 18:00:00 UTC"
# [5] "2005-02-28 18:00:00 UTC" "2006-02-28 18:00:00 UTC"
# [7] "2007-02-28 18:00:00 UTC" "2008-02-29 18:00:00 UTC"
# [9] "2005-03-31 18:00:00 UTC" "2006-03-31 18:00:00 UTC"
#[11] "2007-03-31 18:00:00 UTC" "2008-03-31 18:00:00 UTC"

#----- 2. Set the tolerance -----
# Sometimes, it may be useful to set the tolerance. If the provided values are too much different
# from the values in obs, Start() returns an error directly (if none of the data found) or returns
# incorrect time attributes.

obs3 <- Start(dat = repos_obs,
             var = 'tas',
             date = unique(format(dates, '%Y%m')),
             time = values(dates),
             lat = 'all',
             lat_reorder = Sort(),
             lon = 'all',
             lon_reorder = CircularSort(0, 360),
             time_across = 'date',
             time_tolerance = as.difftime(15, units = 'days'), 
             merge_across_dims = TRUE,
             split_multiselected_dims = TRUE,
             synonims = list(lat = c('lat', 'latitude'),
                             lon = c('lon', 'longitude')),
             return_vars = list(lon = NULL,
                                lat = NULL,
                                time = 'date'),
             retrieve = FALSE)

# We lose many data because there are no data within 15 days from the provided time values.
print(attr(obs3, 'Variables')$common$time)
#[1] "2005-02-28 18:00:00 UTC" "2006-02-28 18:00:00 UTC"
#[3] "2007-02-28 18:00:00 UTC" "2008-02-29 18:00:00 UTC"

# If 'time_tolerance' is changed to "as.difftime(1, units = 'days')", an error shows:
#  Selectors do not match any of the possible values for the dimension 'time'.