Newer
Older
#'Convert Shapefile to Mask Array
#'
#'This function reads a shapefile (.shp) containing information about polygonal
#'regions. It then transfers the shapefile data into an array and subsets the
#'output based on requested region names or IDs. The accepted shapefile
#'databases are 'NUTS', 'LAU', and 'GADM', each with its own unique format.
#'However, the function can use other shapefiles databases with specifying the
#'categories names with the parameter 'shp_system_name'.
#'
#'To ensure accurate comparison with the shapefile, the function loads a
#'reference dataset that provides longitude and latitude information. By
#'intersecting each subset of the shapefile with the reference coordinates, the
#'function selects only the desired regions. The final step involves creating a
#'mask array. Depending on the chosen option, the mask array is either returned
#'as the function's output or saved into a NetCDF format in the specified
#'directory.
#'
#'Note: Modules GDAL, PROJ and GEOS are required.
#'
#'@param shp_file A character string indicating the shp file path.
#'@param ref_grid A character string indicating the path to the reference
#' data. Either (1) a netCDF file or (2) a list of lon and lat to provide the
#' reference grid points. It is NULL by default.
#'@param shp_system A character string containing the Shapefile System Database
#' name. The accepted systems are: 'NUTS', 'LAU', and 'GADM'. It is set to
#' 'NUTS' by default.
#'@param shp_system_name A character string indicating the column name of the
#' column in where the specified 'reg_ids' will be taken.
#'@param reg_ids A character string indicating the unique ID in shapefile.
#'@param reg_names A named list of character string vectors indicating the
#' country and the region name. The name of the list stands for the country
#' name code and the vector character strings indicate the region name for
#' each country. It is NULL by default.
#'@param reg_level An integer number from 1 to 3 indicating the 'NUTS' dataset
#' level. For other datasets this parameter is not used. One mask can only have
#' a unique level. It is set to 3 by default.
#'@param lat_dim A character string indicating the latitudinal dimension. If it
#' is NULL, the latitudinal name will be searched using an internal function
#' with the following possible names: 'lat', 'latitude', 'y', 'j' and
#' 'nav_lat'. It is NULL by default.
#'@param lon_dim A character string indicating the longitudinal dimension. If it
#' is NULL, the longitudinal name will be searched using an internal function
#' with the following possible names: 'lon', 'longitude', 'x', 'i' and
#'@param target_crs A character string indicating the target 'Coordinate
#' Reference System'.
#'@param region A logical value indicating if we want a dimension for the
#' regions in the resulting mask array. It is FALSE by default.
#'@param check_valid A logical value that when it is TRUE it uses the function
#' 'sf::st_make_valid' applied to the shapefile and to the coordinates.
#'@param find_min_dist A logical value indicating if we want to look for the
#' nearest coordinate between the shapefile region and the reference grid when
#' there is no intersection between the shapefile and the reference grid. It is
#' FALSE by default.
#'@param max_dist A numeric value indicating the maximum distance is accepted
#' to the closest gridpoint when there is no intersection between the shapefile
#' and the reference grid.
#'@param ... Arguments passed on to 's2_options' in function 'st_intersection'.
#' See 's2 package'.
#'
#'@return A multidimensional array containing a mask array with longitude and
#'latitude dimensions. If 'region' is TRUE, there will be a dimension for
#'the region.
#'
#'@examples
#'# Exmple (1): NUTS
#'shp_file <- paste0('/esarchive/shapefiles/NUTS3/NUTS_RG_60M_2021_4326.shp/',
#'ref_grid <- list(lon = seq(10, 40, 0.5), lat = seq(40, 85, 0.5))
#'NUTS_name <- list(FI = c('Lappi', 'Kainuu'), SI = c('Pomurska', 'Podravska'))
#'mask2 <- ShapeToMask(shp_file = shp_file, ref_grid = ref_grid,
#' reg_names = NUTS_name)
ShapeToMask <- function(shp_file, ref_grid,
shp_system = "NUTS", shp_system_name = NULL,
reg_ids = NULL, reg_names = NULL,
reg_level = 3, lat_dim = NULL, lon_dim = NULL,
region = FALSE, target_crs = NULL,
check_valid = FALSE, find_min_dist = FALSE,
max_dist = 50, ncores = NULL, ...) {
# TODO: Suppress warnings?
# TODO: Substitute packages
# TODO: Substitute loop for each region with multiApply
# TODO: Add initial checks
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# TODO: Add saving option?
# Initial checks
# shp_file
if (is.null(shp_file)) {
stop("Parameter 'shp_file' cannot be NULL.")
}
if (!is.character(shp_file)) {
stop("Parameter 'shp_file' must be a character string.")
}
# ref_grid
if (is.null(ref_grid)) {
stop("Parameter 'ref_grid' cannot be NULL.")
}
if (!any(is.character(ref_grid) | is.list(ref_grid))) {
stop("Parameter 'ref_grid' must be a character string or a list.")
}
# shp_system
# shp_system_name
# reg_ids
# reg_names
# reg_level
# lat_dim
# lon_dim
# region
# target_crs
# check_valid
# find_min_dist
# max_dist
# ncores
if (!is.null(ncores)) {
if (!is.numeric(ncores)) {
stop("Parameter 'ncores' must be numeric.")
}
ncores <- round(ncores)
if (ncores < 2) {
ncores <- NULL
}
}
shp <- sf::st_read(shp_file) # class sf
if (!is.null(target_crs)) {
transformed_shapefile <- st_transform(shp, crs = target_crs)
shp <- transformed_shapefile
}
if (all(is.null(reg_ids), is.null(reg_names))) {
stop("Either provide parameter 'reg_ids' or 'reg_names'.")
} else if (!is.null(reg_ids)) {
if (!is.null(shp_system_name)) {
if (shp_system_name %in% names(shp)) {
shp <- subset(shp, get(shp_system_name) %in% reg_ids)
} else {
stop("Shape system name not found in shapefile names.")
}
} else if (shp_system == "NUTS") {
shp <- subset(shp, NUTS_ID %in% reg_ids)
shp_system_name <- "NUTS_ID"
} else if (shp_system == "ADM") {
shp <- subset(shp, ADM1_PCODE %in% reg_ids)
shp_system_name <- "ADM1_PCODE"
} else if (shp_system == "GADM") {
shp <- subset(shp, ISO %in% reg_ids)
shp_system_name <- "ISO"
stop("shp_system ", shp_system, " is not defined yet.")
if (!is.null(reg_names)) {
warning("Only use 'reg_ids' to get the shape region. 'reg_names' is not used.")
} else if (!is.null(reg_names)) {
shp_system_name <- NULL
for (cntr_i in 1:length(reg_names)) {
if (shp_system == "NUTS") {
tmp <- subset(shp, CNTR_CODE == names(reg_names)[cntr_i])
tmp <- subset(tmp, NUTS_NAME %in% reg_names[[cntr_i]])
shp_system_name <- 'NUTS_NAME'
} else if (shp_system == "ADM") {
tmp <- subset(shp, ADM0_EN == names(reg_names)[cntr_i])
tmp <- subset(tmp, ADM1_EN %in% reg_names[[cntr_i]])
shp_system_name <- 'ADM1_EN'
} else if (shp_system == "GADM") {
tmp <- subset(shp, Name %in% reg_names)
shp_system_name <- 'Name'
}
if (cntr_i == 1) {
shp_tmp <- tmp
} else {
shp_tmp <- rbind(shp_tmp, tmp)
}
}
if (shp_system == "NUTS") {
shp <- subset(shp_tmp, LEVL_CODE == reg_level)
} else if (shp_system == "ADM" | shp_system == "GADM") {
shp <- shp_tmp
}
}
# Step 2: Use the reference file to get lat and lon
if (all(tools::file_ext(ref_grid) == 'nc')) {
if (!file.exists(ref_grid)) {
stop("ref_grid file does not exist.")
## Method 1: ref_grid is a netCDF file
var_names <- easyNCDF::NcReadVarNames(ref_grid)
Eva Rifà
committed
lat_dim <- var_names[which(var_names %in% .KnownLatNames())]
lon_dim <- var_names[which(var_names %in% .KnownLonNames())]
latlon <- NcToArray(ref_grid, vars_to_read = c(lat_dim, lon_dim))
lat <- NcToArray(ref_grid, vars_to_read = lat_dim)[1, ]
lon <- NcToArray(ref_grid, vars_to_read = lon_dim)[1, ]
} else if (is.list(ref_grid)) {
## Method 2: ref_grid is a list of lon and lat
if (length(ref_grid) != 2) {
stop("If 'ref_grid' is a list, it must have two items for longitude and latitude.")
Eva Rifà
committed
lon_known_names <- c(.KnownLonNames(), 'lons')
lat_known_names <- c(.KnownLatNames(), 'lats')
lon_dim <- lon_known_names[which(lon_known_names %in% names(ref_grid))]
lat_dim <- lat_known_names[which(lat_known_names %in% names(ref_grid))]
if (identical(lon_dim, character(0)) | identical(lat_dim, character(0))) {
stop("longitude and latitude names are not recognized in 'ref_grid'. Please specify 'lon_dim' and 'lat_dim'.")
lat <- ref_grid[[lat_dim]]
lon <- ref_grid[[lon_dim]]
stop("Parameter 'ref_grid' must be either a netCDF file or a list of lon and lat.")
}
## Create data frame & sp class for ref grid
ref.df <- data.frame(data = 0,
lon = rep(lon, times = length(lat)),
lat = rep(lat, each = length(lon)))
coord <- as.matrix(data.frame(x = ref.df$lon, y = ref.df$lat))
xy.sfg <- sf::st_multipoint(coord)
xy.sfc <- sf::st_sfc(xy.sfg)
# Assign crs of original shapefile
if (!is.null(target_crs)) {
st_crs(xy.sfc) <- sf::st_crs(target_crs) #initial_crs # asign crs of original shapefile
xy.sfc <- sf::st_transform(xy.sfc, st_crs(shp))
} else {
st_crs(xy.sfc) <- sf::st_crs(shp)
}
# Step 3: Create mask
if (check_valid) {
xy.sfc <- st_make_valid(xy.sfc)
shp <- st_make_valid(shp)
}
## Loop through each shp region
if (region) {
cfun <- function(a, b) {
if (length(dim(a)) == 2) {
dims <- c(dim(a), 2)
dims <- c(dim(a)[1:2], dim(a)[length(dim(a))]+1)
return(array(c(a,b), dim = dims))
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
} else {
cfun <- function(a, b) {
a[which(b != 0)] <- b[which(b != 0)]
return(a)
}
}
if (is.null(ncores)) {
mask <- foreach(shp_i = 1:nrow(shp), .combine = 'cfun') %do%
.shapetomask(shp = shp, xy.sfc = xy.sfc,
find_min_dist = find_min_dist,
shp_system_name = shp_system_name,
max_dist = max_dist, n = shp_i,
lon = lon, lat = lat, region = region)
} else {
registerDoParallel(ncores)
mask <- foreach(shp_i = 1:nrow(shp), .combine = 'cfun', .packages='sf') %dopar%
.shapetomask(shp = shp, xy.sfc = xy.sfc,
find_min_dist = find_min_dist,
shp_system_name = shp_system_name,
max_dist = max_dist, n = shp_i,
lon = lon, lat = lat, region = region)
registerDoSEQ()
}
if (region) {
names(dim(mask)) <- c(lon_dim, lat_dim, 'region')
} else {
names(dim(mask)) <- c(lon_dim, lat_dim)
}
# Step 4: Add attributes
# attr(mask, lon_dim) <- lon
# attr(mask, lat_dim) <- lat
# if (shp_system == "NUTS") {
# } else if (shp_system == "ADM") {
# } else if (shp_system == "GADM") {
# attr(mask, "index") <- as.list(shp$ISO)
# }
# names(attr(mask, "index")) <- 1:nrow(shp)
# ## Return all the info from shp
# attr(mask, "shapefile") <- attributes(shp)
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
return(mask)
}
.shapetomask <- function(shp, xy.sfc, find_min_dist = FALSE,
shp_system_name = NULL, max_dist = 50, n, lon, lat,
region = FALSE) {
mask <- array(0, dim = c(length(lon), length(lat)))
shpi <- shp[n, ]
# NOTE: Don't know it's a problem in st_intersection or st_coordinates, tmp_coords may
# not be identical as lon/lat. E.g., (29, 65) may be (29 - -3.552714e-15, 65).
shp_pol <- sf::st_intersection(xy.sfc, shpi)
# shp_pol <- sf::st_intersection(xy.sfc, shpi, ...)
tmp_coords <- sf::st_coordinates(shp_pol)[, 1:2]
if (length(tmp_coords) == 0) {
dim(tmp_coords) <- NULL
} else if (is.null(dim(tmp_coords))) {
tmp_coords <- array(tmp_coords, dim = c(1, length(tmp_coords)))
}
if (!is.null(dim(tmp_coords))) {
# polygon_instersection
for (ii in 1:nrow(tmp_coords)) {
# pt_x <- which(lon == tmp_coords[ii, 1])
# pt_y <- which.min(abs(lat - tmp_coords[ii, 2]))
if (!region) {
# min(abs(lon - tmp_coords[ii, 1]))
# min(abs(lat - tmp_coords[ii, 2]))
mask[which.min(abs(lon - tmp_coords[ii, 1])),
which.min(abs(lat - tmp_coords[ii, 2]))] <- n
} else {
mask[which.min(abs(lon - tmp_coords[ii, 1])),
which.min(abs(lat - tmp_coords[ii, 2]))] <- 1
}
}
} else if (find_min_dist) {
x.centroid.shpi <- sf::st_coordinates(sf::st_centroid(shpi))[,1]
y.centroid.shpi <- sf::st_coordinates(sf::st_centroid(shpi))[,2]
dist <- sqrt((xy.sfg[,1] - x.centroid.shpi)**2 + (xy.sfg[,2] - y.centroid.shpi)**2)
tmp_coords <- array(xy.sfg[which(dist == min(dist, na.rm = TRUE)),], dim = c(1,2))
colnames(tmp_coords) <- c('X', 'Y')
if (max(dist) <= max_dist & (any(round(lat,2) == round(tmp_coords[1,2],2)) &
any(round(lon,2) == round(tmp_coords[1,1],2))) ) {
if (length(dim(mask)) == 2) {
mask[which.min(abs(lon - tmp_coords[, 1])),
which.min(abs(lat - tmp_coords[, 2]))] <- shp_i
} else {
mask[shp_i, which.min(abs(lon - tmp_coords[, 1])),
which.min(abs(lat - tmp_coords[, 2]))] <- 1
}
warning(paste0('The reference grid has no intersection with region ',
ifelse(is.character(shp_system_name), shpi[[shp_system_name]], paste0('n° ', shp_i)),
' from the shapefile; the provided grid cell is at a distance of ', dist[which(dist == min(dist, na.rm = TRUE))],
' to the centroid of the region (units are: ° or meters depending on the crs of the shapefile).'))
} else {
warning(paste0('The reference grid has no intersection with region ',
ifelse(is.character(shp_system_name), shpi[[shp_system_name]], paste0('n° ', shp_i))))
}