#'Convert Shapefile to Mask Array #' #'This function reads a shapefile (.shp) containing information about polygonal #'regions. It then transfers the shapefile data into an array and subsets the #'output based on requested region names or IDs. The accepted shapefile #'databases are 'NUTS', 'LAU', and 'GADM', each with its own unique format. #'However, the function can use other shapefiles databases with specifying the #'categories names with the parameter 'shp_col_name_ids'. #' #'To ensure accurate comparison with the shapefile, the function loads a #'reference dataset that provides longitude and latitude information. By #'intersecting each subset of the shapefile with the reference coordinates, the #'function selects only the desired regions. The final step involves creating a #'mask array. Depending on the chosen option, the mask array is either returned #'as the function's output or saved into a NetCDF format in the specified #'directory. #' #'Note: Modules GDAL, PROJ and GEOS are required. #' #'@param shp_file A character string indicating the shp file path. #'@param ref_grid A character string indicating the path to the reference #' data. Either (1) a netCDF file or (2) a list of lon and lat to provide the #' reference grid points. It is NULL by default. #'@param compute_area_coverage A logical value indicating the method to find #' the intersection of the reference grid and the shapefile. When it is TRUE, #' the method used is the calculation of the area coverage fraction of #' intersection. If it is FALSE, the method used is searching if the centroid #' of the grid cell falls inside the shapefile or not. It is FALSE by default. #'@param shp_system A character string containing the Shapefile System Database #' Name used to subset the shapefile into regions by using parameters 'reg_ids' #' or 'reg_names'. The accepted systems are: 'NUTS', 'LAU', and 'GADM'. When it #' is used, you must specify either 'reg_ids' or 'reg_names'; if you don't need #' to subset different regions, set it to NULL. It is set to "NUTS" by default #' (optional). #'@param shp_col_name_ids A character string indicating the column name of the #' column in where the specified 'reg_ids' will be taken (optional). #'@param reg_ids A character string indicating the unique ID in shapefile. #' It is NULL by default (optional). #'@param reg_names A named list of character string vectors indicating the #' country and the region name. The name of the list stands for the country #' name code and the vector character strings indicate the region name for #' each country. It is NULL by default (optional). #'@param reg_level An integer number from 1 to 3 indicating the 'NUTS' dataset #' level. For other datasets this parameter is not used. One mask can only have #' a unique level. It is set to 3 by default (optional). #'@param lat_dim A character string indicating the latitudinal dimension. If it #' is NULL, the latitudinal name will be searched using an internal function #' with the following possible names: 'lat', 'latitude', 'y', 'j' and #' 'nav_lat'. It is set to NULL by default. #'@param lon_dim A character string indicating the longitudinal dimension. If it #' is NULL, the longitudinal name will be searched using an internal function #' with the following possible names: 'lon', 'longitude', 'x', 'i' and #' 'nav_lon'. It is set to NULL by default. #'@param region A logical value indicating if we want a dimension for the #' regions in the resulting mask array. It is FALSE by default. #'@param check_valid A logical value that when it is TRUE it uses the function #' 'sf::st_make_valid' applied to the shapefile and to the coordinates. #'@param find_min_dist A logical value indicating if we want to look for the #' nearest coordinate between the shapefile region and the reference grid when #' there is no intersection between the shapefile and the reference grid. It is #' FALSE by default. #'@param max_dist A numeric value indicating the maximum distance is accepted #' to the closest gridpoint when there is no intersection between the shapefile #' and the reference grid. #'@param ncores The number of parallel processes to spawn for the use for #' parallel computation in multiple cores. #'@param fileout A character string of the path to save the NetCDF mask. If not #' specified (default), only the mask array will be returned. #'@param ... Arguments passed on to 's2_options' in function 'st_intersection'. #' See 's2 package'. #' #'@return A multidimensional array containing a mask array with longitude and #'latitude dimensions. If 'region' is TRUE, there will be a dimension for #'the region. #' #'@examples #'# Exmple (1): NUTS #'shp_file <- paste0('/esarchive/shapefiles/NUTS3/NUTS_RG_60M_2021_4326.shp/', #' 'NUTS_RG_60M_2021_4326.shp') #'ref_grid <- list(lon = seq(10, 40, 0.5), lat = seq(40, 85, 0.5)) #'NUTS_name <- list(FI = c('Lappi', 'Kainuu'), SI = c('Pomurska', 'Podravska')) #'mask2 <- ShapeToMask(shp_file = shp_file, ref_grid = ref_grid, #' reg_names = NUTS_name) #'@import easyNCDF #'@import sf #'@import foreach #'@import dplyr #'@importFrom doParallel registerDoParallel #'@export ShapeToMask <- function(shp_file, ref_grid, compute_area_coverage = FALSE, shp_system = "NUTS", reg_names = NULL, reg_ids = NULL, shp_col_name_ids = NULL, reg_level = 3, lat_dim = NULL, lon_dim = NULL, region = FALSE, check_valid = FALSE, find_min_dist = FALSE, max_dist = 50, ncores = NULL, fileout = NULL, ...) { # TODO: Suppress warnings? # TODO: Add saving option? # sf_use_s2(FALSE) # Initial checks # shp_file if (is.null(shp_file)) { stop("Parameter 'shp_file' cannot be NULL.") } if (!is.character(shp_file)) { stop("Parameter 'shp_file' must be a character string.") } # lon_dim, lat_dim if (!is.null(lon_dim)) { if (!is.character(lon_dim)) { stop("Parameter 'lon_dim' must be a character string.") } } if (!is.null(lat_dim)) { if (!is.character(lat_dim)) { stop("Parameter 'lat_dim' must be a character string.") } } # ref_grid if (is.null(ref_grid)) { stop("Parameter 'ref_grid' cannot be NULL.") } if (!any(is.character(ref_grid) | is.list(ref_grid))) { stop("Parameter 'ref_grid' must be either a netCDF file or a list of lon and lat.") } if (is.character(ref_grid)) { if (!file.exists(ref_grid)) { stop("ref_grid file does not exist.") } } if (is.list(ref_grid)) { if (length(ref_grid) != 2) { stop("If 'ref_grid' is a list, it must have two items for longitude and latitude.") } if (is.null(lat_dim) | is.null(lon_dim)) { lon_known_names <- c(.KnownLonNames(), 'lons') lat_known_names <- c(.KnownLatNames(), 'lats') lon_dim <- lon_known_names[which(lon_known_names %in% names(ref_grid))] lat_dim <- lat_known_names[which(lat_known_names %in% names(ref_grid))] if (identical(lon_dim, character(0)) | identical(lat_dim, character(0))) { stop("longitude and latitude names are not recognized in 'ref_grid'. ", "Please specify 'lon_dim' and 'lat_dim'.") } } } # shp_system, reg_ids, reg_names, shp_col_name_ids if (!is.null(shp_system)) { if (!is.character(shp_system)) { stop("Parameter 'shp_system' must be a character strinig.") } if (all(is.null(reg_ids), is.null(reg_names))) { stop("If 'shp_system' is used, you must provide either parameter ", "'reg_ids' or 'reg_names'.") } } else if (!is.null(shp_col_name_ids)) { if (is.null(reg_ids)) { stop("If 'shp_col_name_ids' is used, parameter 'reg_ids' must be provided.") } if (!is.character(shp_col_name_ids)) { stop("Parameter 'shp_col_name_ids' must be a character strinig.") } } if (all(!is.null(reg_ids), !is.null(reg_names))) { warning("Only use 'reg_ids' to get the shape region. 'reg_names' is not used.") } # reg_level if (!is.null(reg_level)) { if (!is.numeric(reg_level)) { stop("Parameter 'reg_level' must be numeric.") } } # region if (!is.logical(region)) { stop("Parameter 'region' must be a logical value.") } # check_valid if (!is.logical(check_valid)) { stop("Parameter 'check_valid' must be a logical value.") } # find_min_dist if (!is.logical(find_min_dist)) { stop("Parameter 'find_min_dist' must be a logical value.") } # max_dist if (!is.null(max_dist)) { if (!is.numeric(max_dist)) { stop("Parameter 'max_dist' must be a numeric.") } } # ncores if (!is.null(ncores)) { if (!is.numeric(ncores)) { stop("Parameter 'ncores' must be numeric.") } ncores <- round(ncores) if (ncores < 2) { ncores <- NULL } } # Step 1: Load the shapefile shp <- sf::st_read(shp_file) # class sf shp_crs <- sf::st_crs(shp) NUTS_ID <- ADM1_PCODE <- ISO <- NULL CNTR_CODE <- NUTS_NAME <- ADM0_EN <- ADM1_EN <- Name <- LEVL_CODE <- NULL if (!is.null(reg_ids)) { ## Method 1: Directly use IDs if (!is.null(shp_col_name_ids)) { if (shp_col_name_ids %in% names(shp)) { shp <- subset(shp, get(shp_col_name_ids) %in% reg_ids) } else { stop("Shape system name not found in shapefile names.") } } else if (shp_system == "NUTS") { shp <- subset(shp, NUTS_ID %in% reg_ids) shp_col_name_ids <- "NUTS_ID" } else if (shp_system == "ADM") { shp <- subset(shp, ADM1_PCODE %in% reg_ids) shp_col_name_ids <- "ADM1_PCODE" } else if (shp_system == "GADM") { shp <- subset(shp, ISO %in% reg_ids) shp_col_name_ids <- "ISO" } else { stop("shp_system ", shp_system, " is not defined yet.") } } else if (!is.null(reg_names)) { shp_col_name_ids <- NULL ## Method 2: Use country code & region name for (cntr_i in 1:length(reg_names)) { if (shp_system == "NUTS") { tmp <- subset(shp, CNTR_CODE == names(reg_names)[cntr_i]) tmp <- subset(tmp, NUTS_NAME %in% reg_names[[cntr_i]]) shp_col_name_ids <- 'NUTS_NAME' } else if (shp_system == "ADM") { tmp <- subset(shp, ADM0_EN == names(reg_names)[cntr_i]) tmp <- subset(tmp, ADM1_EN %in% reg_names[[cntr_i]]) shp_col_name_ids <- 'ADM1_EN' } else if (shp_system == "GADM") { tmp <- subset(shp, Name %in% reg_names) shp_col_name_ids <- 'Name' } if (cntr_i == 1) { shp_tmp <- tmp } else { shp_tmp <- rbind(shp_tmp, tmp) } } if (shp_system == "NUTS") { shp <- subset(shp_tmp, LEVL_CODE == reg_level) } else if (shp_system == "ADM" | shp_system == "GADM") { shp <- shp_tmp } } # Step (2.1): Use the reference file to get lat and lon if (all(tools::file_ext(ref_grid) == 'nc')) { ## Method 1: ref_grid is a netCDF file if (is.null(lat_dim) | is.null(lon_dim)) { var_names <- easyNCDF::NcReadVarNames(ref_grid) lat_dim <- var_names[which(var_names %in% .KnownLatNames())] lon_dim <- var_names[which(var_names %in% .KnownLonNames())] } latlon <- easyNCDF::NcToArray(ref_grid, vars_to_read = c(lat_dim, lon_dim)) lat <- easyNCDF::NcToArray(ref_grid, vars_to_read = lat_dim)[1, ] lon <- easyNCDF::NcToArray(ref_grid, vars_to_read = lon_dim)[1, ] } else if (is.list(ref_grid)) { ## Method 2: ref_grid is a list of lon and lat lat <- ref_grid[[lat_dim]] lon <- ref_grid[[lon_dim]] } # Step (2.2): Create the grid if (compute_area_coverage) { locations <- array(1:(length(lon)*length(lat)), c(lon = length(lon), lat = length(lat))) # Build data dataframe lonlat_df <- data.frame(lon = rep(as.vector(lon), length(lat)), lat = sort(rep(as.vector(lat), length(lon)), decreasing = TRUE)) data_df <- lonlat_df %>% mutate(dat = as.vector(locations)) lonlat_df_ori <- NULL # NOTE: if target_proj = "ESRI:54030", Nord3v2 has different behavior from hub and ws!! data_df <- st_as_sf(data_df, coords = c("lon", "lat"), crs = shp_crs) data_df <- st_transform(data_df, crs = shp_crs) data_df <- data_df %>% mutate(long = st_coordinates(data_df)[, 1], lat = st_coordinates(data_df)[, 2]) xy.sfc <- polygonize(lonlat_df = lonlat_df, data_df = data_df, lon = lon, lat = lat, target_proj = shp_crs, original_proj = shp_crs) region <- TRUE } else { ref.df <- data.frame(data = 0, lon = rep(lon, times = length(lat)), lat = rep(lat, each = length(lon))) coord <- as.matrix(data.frame(x = ref.df$lon, y = ref.df$lat)) xy.sfg <- sf::st_multipoint(coord) xy.sfc <- sf::st_sfc(xy.sfg) # Assign crs of original shapefile st_crs(xy.sfc) <- sf::st_crs(shp) # asign crs of shapefile # Check valid if (check_valid) { xy.sfc <- st_make_valid(xy.sfc) shp <- st_make_valid(shp) } } ## Step (3): Loop through each shp region to create the mask if (region) { cfun <- function(a, b) { if (length(dim(a)) == 2) { dims <- c(dim(a), 2) } else { dims <- c(dim(a)[1:2], dim(a)[length(dim(a))]+1) } return(array(c(a,b), dim = dims)) } } else { cfun <- function(a, b) { a[which(b != 0)] <- b[which(b != 0)] return(a) } } shp_i <- NULL if (is.null(ncores)) { mask <- foreach(shp_i = 1:nrow(shp), .combine = 'cfun') %do% .shapetomask(shp = shp, n = shp_i, lon = lon, lat = lat, xy.sfc = xy.sfc, compute_area_coverage = compute_area_coverage, find_min_dist = find_min_dist, shp_col_name_ids = shp_col_name_ids, max_dist = max_dist, region = region, ...) } else { registerDoParallel(ncores) mask <- foreach(shp_i = 1:nrow(shp), .combine = 'cfun', .packages = c('sf', 'dplyr')) %dopar% .shapetomask(shp = shp, n = shp_i, lon = lon, lat = lat, xy.sfc = xy.sfc, compute_area_coverage = compute_area_coverage, find_min_dist = find_min_dist, shp_col_name_ids = shp_col_name_ids, max_dist = max_dist, region = region, ...) registerDoSEQ() } if (region) { if (length(dim(mask)) == 2) dim(mask) <- c(dim(mask), 1) names(dim(mask)) <- c(lon_dim, lat_dim, 'region') } else { names(dim(mask)) <- c(lon_dim, lat_dim) } # Step 4: Add attributes attr(mask, lon_dim) <- lon attr(mask, lat_dim) <- lat if (!is.null(shp_system)) { if (shp_system == "NUTS") { attr(mask, "index") <- as.list(shp$NUTS_ID) } else if (shp_system == "ADM") { attr(mask, "index") <- as.list(shp$ADM1_PCODE) } else if (shp_system == "GADM") { attr(mask, "index") <- as.list(shp$ISO) } } attr(mask, "index") <- 1:nrow(shp) ## Return all the info from shp attr(mask, "shapefile") <- attributes(shp) # Step 5: Save to NetCDF if (!is.null(fileout)) { ArrayToNc(list(mask = mask), fileout) } return(mask) } .shapetomask <- function(shp, n, lon, lat, xy.sfc, compute_area_coverage = FALSE, find_min_dist = FALSE, shp_col_name_ids = NULL, max_dist = 50, region = FALSE, ...) { shpi <- shp[n, ] if (compute_area_coverage) { mask <- xy.sfc %>% dplyr::mutate(int = areacov(geometry, shpi)) %>% dplyr::arrange(value) %>% dplyr::pull(int) dim(mask) <- c(length(lon), length(lat)) } else { mask <- array(0, dim = c(length(lon), length(lat))) # NOTE: Don't know it's a problem in st_intersection or st_coordinates, tmp_coords may # not be identical as lon/lat. E.g., (29, 65) may be (29 - -3.552714e-15, 65). shp_pol <- sf::st_intersection(x = xy.sfc, y = shpi, ...) tmp_coords <- sf::st_coordinates(shp_pol)[, 1:2] if (length(tmp_coords) == 0) { dim(tmp_coords) <- NULL } else if (is.null(dim(tmp_coords))) { tmp_coords <- array(tmp_coords, dim = c(1, length(tmp_coords))) } if (!is.null(dim(tmp_coords))) { # polygon_instersection for (ii in 1:nrow(tmp_coords)) { # pt_x <- which(lon == tmp_coords[ii, 1]) # pt_y <- which.min(abs(lat - tmp_coords[ii, 2])) if (!region) { # min(abs(lon - tmp_coords[ii, 1])) # min(abs(lat - tmp_coords[ii, 2])) mask[which.min(abs(lon - tmp_coords[ii, 1])), which.min(abs(lat - tmp_coords[ii, 2]))] <- n } else { mask[which.min(abs(lon - tmp_coords[ii, 1])), which.min(abs(lat - tmp_coords[ii, 2]))] <- 1 } } } else if (find_min_dist) { x.centroid.shpi <- sf::st_coordinates(sf::st_centroid(shpi))[,1] y.centroid.shpi <- sf::st_coordinates(sf::st_centroid(shpi))[,2] dist <- sqrt((xy.sfc[,1] - x.centroid.shpi)**2 + (xy.sfc[,2] - y.centroid.shpi)**2) tmp_coords <- array(xy.sfc[which(dist == min(dist, na.rm = TRUE)),], dim = c(1,2)) colnames(tmp_coords) <- c('X', 'Y') if (max(dist) <= max_dist & (any(round(lat,2) == round(tmp_coords[1,2],2)) & any(round(lon,2) == round(tmp_coords[1,1],2))) ) { if (length(dim(mask)) == 2) { mask[which.min(abs(lon - tmp_coords[, 1])), which.min(abs(lat - tmp_coords[, 2]))] <- n } else { mask[which.min(abs(lon - tmp_coords[, 1])), which.min(abs(lat - tmp_coords[, 2]))] <- 1 } # warning(paste0('The reference grid has no intersection with region ', # ifelse(is.character(shp_col_name_ids), shpi[[shp_col_name_ids]], paste0('n° ', n)), # ' from the shapefile; the provided grid cell is at a distance of ', dist[which(dist == min(dist, na.rm = TRUE))], # ' to the centroid of the region (units are: ° or meters depending on the crs of the shapefile).')) } else { # warning(paste0('The reference grid has no intersection with region ', # ifelse(is.character(shp_col_name_ids), shpi[[shp_col_name_ids]], paste0('n° ', n)))) } } } return(mask) } # Function to create polygons from VizRobinson code polygonize <- function(lonlat_df, data_df, lon, lat, target_proj, original_proj) { # Calculate polygon points from regular lat/lon # NOTE: The original grid must be regular grid with same space d_lon <- abs(lon[2] - lon[1]) / 2 d_lat <- abs(lat[2] - lat[1]) / 2 lon_poly <- lat_poly <- rep(NA, 4 * dim(lonlat_df)[1]) for (ii in 1:dim(lonlat_df)[1]) { lon_poly[(ii*4-3):(ii*4)] <- c(lonlat_df$lon[ii] - d_lon, lonlat_df$lon[ii] + d_lon, lonlat_df$lon[ii] + d_lon, lonlat_df$lon[ii] - d_lon) lat_poly[(ii*4-3):(ii*4)] <- c(lonlat_df$lat[ii] - d_lat, lonlat_df$lat[ii] - d_lat, lonlat_df$lat[ii] + d_lat, lonlat_df$lat[ii] + d_lat) } # To prevent out-of-global lon # lon_poly[which(lon_poly >= 180)] <- 179.9 # lon_poly[which(lon_poly < -180)] <- -180 # To prevent out-of-global lat lat_poly[which(lat_poly > 90)] <- 90 lat_poly[which(lat_poly < -90)] <- -90 lonlat_df <- data.frame(lon = lon_poly, lat = lat_poly) # Transfer lon/lat to projection proj_lonlat <- st_as_sf(lonlat_df, coords = c("lon", "lat"), crs = original_proj) # NOTE: if target_proj = "ESRI:54030", on Nord3v2, st_transform has lon and lat swapped! proj_lonlat <- st_transform(proj_lonlat, crs = target_proj) lonlat_df_proj <- st_coordinates(proj_lonlat) # Use id to create groups for each polygon ids <- factor(paste0("id_", 1:dim(data_df)[1])) values <- data.frame(id = ids, value = data_df$dat) positions <- data.frame(id = rep(ids, each = 4), x = lonlat_df_proj[, 1], y = lonlat_df_proj[, 2]) datapoly <- merge(values, positions, by = "id") datapoly <- st_as_sf(datapoly, coords = c("x", "y"), crs = target_proj) datapoly <- datapoly %>% dplyr::group_by(.data$id) %>% dplyr::summarise() %>% # NOTE: VERY SLOW if plot global dplyr::mutate(value = values[order(values$id), ]$value) %>% st_cast("POLYGON") %>% st_convex_hull() # maintain outer polygen (no bowtie shape) return(datapoly) } # Function to compute the coverage area ratio between the grid and the shapefile areacov <- function(grid, shp) { shp_int_grid <- sf::st_intersection(x = grid, y = shp) idx <- attributes(shp_int_grid)$idx i <- 0 fr <- array(0, length(grid)) for (idx_i in idx[, 1]) { i <- i + 1 area_grid <- sf::st_area(grid[idx_i]) area_int <- sf::st_area(shp_int_grid[i]) fr[idx_i] <- round(as.numeric(area_int/area_grid), digits = 6) } return(fr) }