Newer
Older
#'Convert Shapefile to Mask Array
#'
#'This function reads a shapefile (.shp) containing information about polygonal
#'regions. It then transfers the shapefile data into an array and subsets the
#'output based on requested region names or IDs. The accepted shapefile
#'databases are 'NUTS', 'LAU', and 'GADM', each with its own unique format.
#'However, the function can use other shapefiles databases with specifying the
#'
#'To ensure accurate comparison with the shapefile, the function loads a
#'reference dataset that provides longitude and latitude information. By
#'intersecting each subset of the shapefile with the reference coordinates, the
#'function selects only the desired regions. The final step involves creating a
#'mask array. Depending on the chosen option, the mask array is either returned
#'as the function's output or saved into a NetCDF format in the specified
#'directory.
#'
#'Note: Modules GDAL, PROJ and GEOS are required.
#'
#'@param shp_file A character string indicating the shp file path.
#'@param ref_grid A character string indicating the path to the reference
#' data. Either (1) a netCDF file or (2) a list of lon and lat to provide the
#' reference grid points. It is NULL by default.
#'@param shp_system A character string containing the Shapefile System Database
#' Name used to subset the shapefile into regions by using parameters 'reg_ids'
#' or 'reg_names'. The accepted systems are: 'NUTS', 'LAU', and 'GADM'. When it
#' is used, you must specify either 'reg_ids' or 'reg_names'; if you don't need
#' to subset different regions, set it to NULL. It is set to "NUTS" by default
#' (optional).
#'@param shp_col_name_ids A character string indicating the column name of the
#' column in where the specified 'reg_ids' will be taken (optional).
#'@param reg_ids A character string indicating the unique ID in shapefile.
#' It is NULL by default (optional).
#'@param reg_names A named list of character string vectors indicating the
#' country and the region name. The name of the list stands for the country
#' name code and the vector character strings indicate the region name for
#' each country. It is NULL by default (optional).
#'@param reg_level An integer number from 1 to 3 indicating the 'NUTS' dataset
#' level. For other datasets this parameter is not used. One mask can only have
#' a unique level. It is set to 3 by default (optional).
#'@param lat_dim A character string indicating the latitudinal dimension. If it
#' is NULL, the latitudinal name will be searched using an internal function
#' with the following possible names: 'lat', 'latitude', 'y', 'j' and
#'@param lon_dim A character string indicating the longitudinal dimension. If it
#' is NULL, the longitudinal name will be searched using an internal function
#' with the following possible names: 'lon', 'longitude', 'x', 'i' and
#'@param target_crs A character string indicating the target 'Coordinate
#' Reference System'.
#'@param region A logical value indicating if we want a dimension for the
#' regions in the resulting mask array. It is FALSE by default.
#'@param check_valid A logical value that when it is TRUE it uses the function
#' 'sf::st_make_valid' applied to the shapefile and to the coordinates.
#'@param find_min_dist A logical value indicating if we want to look for the
#' nearest coordinate between the shapefile region and the reference grid when
#' there is no intersection between the shapefile and the reference grid. It is
#' FALSE by default.
#'@param max_dist A numeric value indicating the maximum distance is accepted
#' to the closest gridpoint when there is no intersection between the shapefile
#' and the reference grid.
#'@param ncores The number of parallel processes to spawn for the use for
#' parallel computation in multiple cores.
#'@param ... Arguments passed on to 's2_options' in function 'st_intersection'.
#' See 's2 package'.
#'
#'@return A multidimensional array containing a mask array with longitude and
#'latitude dimensions. If 'region' is TRUE, there will be a dimension for
#'the region.
#'
#'@examples
#'# Exmple (1): NUTS
#'shp_file <- paste0('/esarchive/shapefiles/NUTS3/NUTS_RG_60M_2021_4326.shp/',
#'ref_grid <- list(lon = seq(10, 40, 0.5), lat = seq(40, 85, 0.5))
#'NUTS_name <- list(FI = c('Lappi', 'Kainuu'), SI = c('Pomurska', 'Podravska'))
#'mask2 <- ShapeToMask(shp_file = shp_file, ref_grid = ref_grid,
#' reg_names = NUTS_name)
#'@import foreach
#'@importFrom doParallel registerDoParallel
ShapeToMask <- function(shp_file, ref_grid,
shp_system = "NUTS", reg_names = NULL, reg_ids = NULL,
shp_col_name_ids = NULL, reg_level = 3,
region = FALSE, target_crs = NULL,
check_valid = FALSE, find_min_dist = FALSE,
max_dist = 50, ncores = NULL, ...) {
# TODO: Add saving option?
# Initial checks
# shp_file
if (is.null(shp_file)) {
stop("Parameter 'shp_file' cannot be NULL.")
}
if (!is.character(shp_file)) {
stop("Parameter 'shp_file' must be a character string.")
}
# lon_dim, lat_dim
if (!is.null(lon_dim)) {
if (!is.character(lon_dim)) {
stop("Parameter 'lon_dim' must be a character string.")
}
}
if (!is.null(lat_dim)) {
if (!is.character(lat_dim)) {
stop("Parameter 'lat_dim' must be a character string.")
}
}
# ref_grid
if (is.null(ref_grid)) {
stop("Parameter 'ref_grid' cannot be NULL.")
}
if (!any(is.character(ref_grid) | is.list(ref_grid))) {
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
stop("Parameter 'ref_grid' must be either a netCDF file or a list of lon and lat.")
}
if (is.character(ref_grid)) {
if (!file.exists(ref_grid)) {
stop("ref_grid file does not exist.")
}
}
if (is.list(ref_grid)) {
if (length(ref_grid) != 2) {
stop("If 'ref_grid' is a list, it must have two items for longitude and latitude.")
}
if (is.null(lat_dim) | is.null(lon_dim)) {
lon_known_names <- c(.KnownLonNames(), 'lons')
lat_known_names <- c(.KnownLatNames(), 'lats')
lon_dim <- lon_known_names[which(lon_known_names %in% names(ref_grid))]
lat_dim <- lat_known_names[which(lat_known_names %in% names(ref_grid))]
if (identical(lon_dim, character(0)) | identical(lat_dim, character(0))) {
stop("longitude and latitude names are not recognized in 'ref_grid'. ",
"Please specify 'lon_dim' and 'lat_dim'.")
}
}
}
# shp_system, reg_ids, reg_names, shp_col_name_ids
if (!is.null(shp_system)) {
if (!is.character(shp_system)) {
stop("Parameter 'shp_system' must be a character strinig.")
}
if (all(is.null(reg_ids), is.null(reg_names))) {
stop("If 'shp_system' is used, you must provide either parameter ",
"'reg_ids' or 'reg_names'.")
} else if (!is.null(shp_col_name_ids)) {
if (is.null(reg_ids)) {
stop("If 'shp_col_name_ids' is used, parameter 'reg_ids' must be provided.")
}
if (!is.character(shp_col_name_ids)) {
stop("Parameter 'shp_col_name_ids' must be a character strinig.")
}
}
if (all(!is.null(reg_ids), !is.null(reg_names))) {
warning("Only use 'reg_ids' to get the shape region. 'reg_names' is not used.")
if (!is.null(reg_level)) {
if (!is.numeric(reg_level)) {
stop("Parameter 'reg_level' must be numeric.")
}
}
if (!is.logical(region)) {
stop("Parameter 'region' must be a logical value.")
}
if (!is.null(target_crs)) {
if (!is.character(target_crs)) {
stop("Parameter 'target_crs' must be a character string.")
}
}
if (!is.logical(check_valid)) {
stop("Parameter 'check_valid' must be a logical value.")
}
if (!is.logical(find_min_dist)) {
stop("Parameter 'find_min_dist' must be a logical value.")
}
if (!is.null(max_dist)) {
if (!is.numeric(max_dist)) {
stop("Parameter 'max_dist' must be a numeric.")
}
}
# ncores
if (!is.null(ncores)) {
if (!is.numeric(ncores)) {
stop("Parameter 'ncores' must be numeric.")
}
ncores <- round(ncores)
if (ncores < 2) {
ncores <- NULL
}
}
shp <- sf::st_read(shp_file) # class sf
if (!is.null(target_crs)) {
transformed_shapefile <- st_transform(shp, crs = target_crs)
shp <- transformed_shapefile
}
NUTS_ID <- ADM1_PCODE <- ISO <- NULL
CNTR_CODE <- NUTS_NAME <- ADM0_EN <- ADM1_EN <- Name <- LEVL_CODE <- NULL
if (!is.null(shp_col_name_ids)) {
if (shp_col_name_ids %in% names(shp)) {
shp <- subset(shp, get(shp_col_name_ids) %in% reg_ids)
} else {
stop("Shape system name not found in shapefile names.")
}
} else if (shp_system == "NUTS") {
shp <- subset(shp, NUTS_ID %in% reg_ids)
} else if (shp_system == "ADM") {
shp <- subset(shp, ADM1_PCODE %in% reg_ids)
} else if (shp_system == "GADM") {
shp <- subset(shp, ISO %in% reg_ids)
stop("shp_system ", shp_system, " is not defined yet.")
} else if (!is.null(reg_names)) {
for (cntr_i in 1:length(reg_names)) {
if (shp_system == "NUTS") {
tmp <- subset(shp, CNTR_CODE == names(reg_names)[cntr_i])
tmp <- subset(tmp, NUTS_NAME %in% reg_names[[cntr_i]])
} else if (shp_system == "ADM") {
tmp <- subset(shp, ADM0_EN == names(reg_names)[cntr_i])
tmp <- subset(tmp, ADM1_EN %in% reg_names[[cntr_i]])
} else if (shp_system == "GADM") {
tmp <- subset(shp, Name %in% reg_names)
}
if (cntr_i == 1) {
shp_tmp <- tmp
} else {
shp_tmp <- rbind(shp_tmp, tmp)
}
}
if (shp_system == "NUTS") {
shp <- subset(shp_tmp, LEVL_CODE == reg_level)
} else if (shp_system == "ADM" | shp_system == "GADM") {
shp <- shp_tmp
}
}
# Step 2: Use the reference file to get lat and lon
if (all(tools::file_ext(ref_grid) == 'nc')) {
## Method 1: ref_grid is a netCDF file
if (is.null(lat_dim) | is.null(lon_dim)) {
var_names <- easyNCDF::NcReadVarNames(ref_grid)
lat_dim <- var_names[which(var_names %in% .KnownLatNames())]
lon_dim <- var_names[which(var_names %in% .KnownLonNames())]
latlon <- easyNCDF::NcToArray(ref_grid, vars_to_read = c(lat_dim, lon_dim))
lat <- easyNCDF::NcToArray(ref_grid, vars_to_read = lat_dim)[1, ]
lon <- easyNCDF::NcToArray(ref_grid, vars_to_read = lon_dim)[1, ]
} else if (is.list(ref_grid)) {
## Method 2: ref_grid is a list of lon and lat
lat <- ref_grid[[lat_dim]]
lon <- ref_grid[[lon_dim]]
}
## Create data frame & sp class for ref grid
ref.df <- data.frame(data = 0,
lon = rep(lon, times = length(lat)),
lat = rep(lat, each = length(lon)))
coord <- as.matrix(data.frame(x = ref.df$lon, y = ref.df$lat))
xy.sfg <- sf::st_multipoint(coord)
xy.sfc <- sf::st_sfc(xy.sfg)
# Assign crs of original shapefile
if (!is.null(target_crs)) {
st_crs(xy.sfc) <- sf::st_crs(target_crs) #initial_crs # asign crs of original shapefile
xy.sfc <- sf::st_transform(xy.sfc, st_crs(shp))
} else {
st_crs(xy.sfc) <- sf::st_crs(shp)
}
# Step 3: Create mask
if (check_valid) {
xy.sfc <- st_make_valid(xy.sfc)
shp <- st_make_valid(shp)
}
## Loop through each shp region
if (region) {
cfun <- function(a, b) {
if (length(dim(a)) == 2) {
dims <- c(dim(a), 2)
dims <- c(dim(a)[1:2], dim(a)[length(dim(a))]+1)
return(array(c(a,b), dim = dims))
} else {
cfun <- function(a, b) {
a[which(b != 0)] <- b[which(b != 0)]
return(a)
}
}
if (is.null(ncores)) {
mask <- foreach(shp_i = 1:nrow(shp), .combine = 'cfun') %do%
.shapetomask(shp = shp, n = shp_i, lon = lon, lat = lat,
xy.sfc = xy.sfc, find_min_dist = find_min_dist,
shp_col_name_ids = shp_col_name_ids,
max_dist = max_dist, region = region, ...)
} else {
registerDoParallel(ncores)
mask <- foreach(shp_i = 1:nrow(shp), .combine = 'cfun', .packages='sf') %dopar%
.shapetomask(shp = shp, n = shp_i, lon = lon, lat = lat,
xy.sfc = xy.sfc, find_min_dist = find_min_dist,
shp_col_name_ids = shp_col_name_ids,
max_dist = max_dist, region = region, ...)
registerDoSEQ()
}
if (region) {
if (length(dim(mask)) == 2) dim(mask) <- c(dim(mask), 1)
names(dim(mask)) <- c(lon_dim, lat_dim, 'region')
} else {
names(dim(mask)) <- c(lon_dim, lat_dim)
}
# Step 4: Add attributes
# attr(mask, lon_dim) <- lon
# attr(mask, lat_dim) <- lat
# if (shp_system == "NUTS") {
# } else if (shp_system == "ADM") {
# } else if (shp_system == "GADM") {
# attr(mask, "index") <- as.list(shp$ISO)
# }
# names(attr(mask, "index")) <- 1:nrow(shp)
# ## Return all the info from shp
# attr(mask, "shapefile") <- attributes(shp)
.shapetomask <- function(shp, n, lon, lat, xy.sfc, find_min_dist = FALSE,
shp_col_name_ids = NULL, max_dist = 50,
region = FALSE, ...) {
mask <- array(0, dim = c(length(lon), length(lat)))
shpi <- shp[n, ]
# NOTE: Don't know it's a problem in st_intersection or st_coordinates, tmp_coords may
# not be identical as lon/lat. E.g., (29, 65) may be (29 - -3.552714e-15, 65).
tmp_coords <- sf::st_coordinates(shp_pol)[, 1:2]
if (length(tmp_coords) == 0) {
dim(tmp_coords) <- NULL
} else if (is.null(dim(tmp_coords))) {
tmp_coords <- array(tmp_coords, dim = c(1, length(tmp_coords)))
}
if (!is.null(dim(tmp_coords))) {
# polygon_instersection
for (ii in 1:nrow(tmp_coords)) {
# pt_x <- which(lon == tmp_coords[ii, 1])
# pt_y <- which.min(abs(lat - tmp_coords[ii, 2]))
if (!region) {
# min(abs(lon - tmp_coords[ii, 1]))
# min(abs(lat - tmp_coords[ii, 2]))
mask[which.min(abs(lon - tmp_coords[ii, 1])),
which.min(abs(lat - tmp_coords[ii, 2]))] <- n
} else {
mask[which.min(abs(lon - tmp_coords[ii, 1])),
which.min(abs(lat - tmp_coords[ii, 2]))] <- 1
}
}
} else if (find_min_dist) {
x.centroid.shpi <- sf::st_coordinates(sf::st_centroid(shpi))[,1]
y.centroid.shpi <- sf::st_coordinates(sf::st_centroid(shpi))[,2]
dist <- sqrt((xy.sfc[,1] - x.centroid.shpi)**2 + (xy.sfc[,2] - y.centroid.shpi)**2)
tmp_coords <- array(xy.sfc[which(dist == min(dist, na.rm = TRUE)),], dim = c(1,2))
colnames(tmp_coords) <- c('X', 'Y')
if (max(dist) <= max_dist & (any(round(lat,2) == round(tmp_coords[1,2],2)) &
any(round(lon,2) == round(tmp_coords[1,1],2))) ) {
if (length(dim(mask)) == 2) {
mask[which.min(abs(lon - tmp_coords[, 1])),
which.min(abs(lat - tmp_coords[, 2]))] <- n
mask[which.min(abs(lon - tmp_coords[, 1])),
which.min(abs(lat - tmp_coords[, 2]))] <- 1
}
# warning(paste0('The reference grid has no intersection with region ',
# ifelse(is.character(shp_col_name_ids), shpi[[shp_col_name_ids]], paste0('n° ', n)),
# ' from the shapefile; the provided grid cell is at a distance of ', dist[which(dist == min(dist, na.rm = TRUE))],
# ' to the centroid of the region (units are: ° or meters depending on the crs of the shapefile).'))
# warning(paste0('The reference grid has no intersection with region ',
# ifelse(is.character(shp_col_name_ids), shpi[[shp_col_name_ids]], paste0('n° ', n))))