#'Convert Shapefile to Mask Array #' #'This function reads a shapefile (.shp) containing information about polygonal #'regions. It then transfers the shapefile data into an array and subsets the #'output based on requested region names or IDs. The accepted shapefile #'databases are 'NUTS', 'LAU', and 'GADM', each with its own unique format. #'However, the function can use other shapefiles databases with specifying the #'categories names with the parameter 'shp_system_name'. #' #'To ensure accurate comparison with the shapefile, the function loads a #'reference dataset that provides longitude and latitude information. By #'intersecting each subset of the shapefile with the reference coordinates, the #'function selects only the desired regions. The final step involves creating a #'mask array. Depending on the chosen option, the mask array is either returned #'as the function's output or saved into a NetCDF format in the specified #'directory. #' #'Note: Modules GDAL, PROJ and GEOS are required. #' #'@param shp_file A character string indicating the shp file path. #'@param ref_grid A character string indicating the path to the reference #' data. Either (1) a netCDF file or (2) a list of lon and lat to provide the #' reference grid points. It is NULL by default. #'@param shp_system A character string containing the Shapefile System Database #' name. The accepted systems are: 'NUTS', 'LAU', and 'GADM'. It is set to #' 'NUTS' by default. #'@param shp_system_name A character string indicating the column name of the #' column in where the specified 'reg_ids' will be taken. #'@param reg_ids A character string indicating the unique ID in shapefile. #' It is NULL by default. #'@param reg_names A named list of character string vectors indicating the #' country and the region name. The name of the list stands for the country #' name code and the vector character strings indicate the region name for #' each country. It is NULL by default. #'@param reg_level An integer number from 1 to 3 indicating the 'NUTS' dataset #' level. For other datasets this parameter is not used. One mask can only have #' a unique level. It is set to 3 by default. #'@param lat_dim A character string indicating the latitudinal dimension. If it #' is NULL, the latitudinal name will be searched using an internal function #' with the following possible names: 'lat', 'latitude', 'y', 'j' and #' 'nav_lat'. It is NULL by default. #'@param lon_dim A character string indicating the longitudinal dimension. If it #' is NULL, the longitudinal name will be searched using an internal function #' with the following possible names: 'lon', 'longitude', 'x', 'i' and #' 'nav_lon'. It is NULL by default. #'@param target_crs A character string indicating the target 'Coordinate #' Reference System'. #'@param region A logical value indicating if we want a dimension for the #' regions in the resulting mask array. It is FALSE by default. #'@param check_valid A logical value that when it is TRUE it uses the function #' 'sf::st_make_valid' applied to the shapefile and to the coordinates. #'@param find_min_dist A logical value indicating if we want to look for the #' nearest coordinate between the shapefile region and the reference grid when #' there is no intersection between the shapefile and the reference grid. It is #' FALSE by default. #'@param max_dist A numeric value indicating the maximum distance is accepted #' to the closest gridpoint when there is no intersection between the shapefile #' and the reference grid. #'@param savefile A logical value indicating wether to save the mask array into #' a NetCDF format file (TRUE) or to return an array (FALSE). It is FALSE by #' default. This functionality is not developed yet. #'@param ... Arguments passed on to 's2_options' in function 'st_intersection'. #' See 's2 package'. #' #'@return A multidimensional array containing a mask array with longitude and #'latitude dimensions. If 'region' is TRUE, there will be a dimension for #'the region. #' #'@examples #'# Exmple (1): NUTS #'shp_file <- paste0('/esarchive/shapefiles/NUTS3/NUTS_RG_60M_2021_4326.shp/', #' 'NUTS_RG_60M_2021_4326.shp') #'# shp_file <- paste0('/esarchive/scratch/cdelgado/focus_outputs/Shapefiles/', #'# 'tza_admbnda_adm1/tza_admbnda_adm1_20181019.shp') #'# ref_grid <- paste0('/esarchive/exp/ecmwf/system5c3s/monthly_mean/', #'# 'tas_f6h/tas_20170201.nc') #'# ref_grid <- paste0('/esarchive/exp/ecmwf/s2s-monthly_ensfor/weekly_mean/', #'# 'tas_f6h/tas_20191212.nc') #'ref_grid <- paste0('/esarchive/recon/ecmwf/era5land/monthly_mean/', #' 'tas_f1h/tas_201006.nc') #'# ref_grid <- list(lon = seq(10, 40, 0.5), lat = seq(40, 85, 0.5)) #' #'NUTS.id <- paste0("FI1D", c(1:3, 5, 7:9)) #'NUTS.name <- list(FI = c('Lappi', 'Kainuu'), SI = c('Pomurska', 'Podravska')) #'mask1 <- ShapeToMask(shp_file, ref_grid, reg_ids = NUTS.id) #'mask2 <- ShapeToMask(shp_file = shp_file, ref_grid = ref_grid, #' reg_names = NUTS.name) #' #'# Exmple (2): GADM #'shp_file <- "/esarchive/shapefiles/gadm_country_mask/gadm_country_ISO3166.shp" #'ref_grid <- paste0('/esarchive/exp/ecmwf/s2s-monthly_ensfor/weekly_mean/', #' 'tas_f6h/tas_20191212.nc') #'GADM.id <- c("ESP", "ITA") #'GADM.name <- c("Spain", "Italy") #'mask1 <- ShapeToMask(shp_file = shp_file, ref_grid = ref_grid, #' reg_ids = GADM.id, shp_system = "GADM") #'mask2 <- ShapeToMask(shp_file = shp_file, ref_grid = ref_grid, #' reg_names = GADM.name, shp_system = "GADM") #'@import easyNCDF #'@import sf #'@export ShapeToMask <- function(shp_file, ref_grid = NULL, shp_system = "NUTS", shp_system_name = NULL, reg_ids = NULL, reg_names = NULL, reg_level = 3, lat_dim = NULL, lon_dim = NULL, savefile = FALSE, region = FALSE, target_crs = NULL, check_valid = FALSE, find_min_dist = FALSE, max_dist = 50, ...) { # NOTE: One region is one number; need to have the option to combine them? # TODO: Suppress warnings? # TODO: Substitute packages # TODO: Substitute loop for each region with multiApply # TODO: Add initial checks # Step 1: Load the shapefile shp <- sf::st_read(shp_file) # class sf if (!is.null(target_crs)) { transformed_shapefile <- st_transform(shp, crs = target_crs) shp <- transformed_shapefile } if (all(is.null(reg_ids), is.null(reg_names))) { stop("Either provide parameter 'reg_ids' or 'reg_names'.") } else if (!is.null(reg_ids)) { ## Method 1: Directly use IDs if (!is.null(shp_system_name)) { if (shp_system_name %in% names(shp)) { shp <- subset(shp, get(shp_system_name) %in% reg_ids) } else { stop("Shape system name not found in shapefile names.") } } else if (shp_system == "NUTS") { shp <- subset(shp, NUTS_ID %in% reg_ids) shp_system_name <- "NUTS_ID" } else if (shp_system == "ADM") { shp <- subset(shp, ADM1_PCODE %in% reg_ids) shp_system_name <- "ADM1_PCODE" } else if (shp_system == "GADM") { shp <- subset(shp, ISO %in% reg_ids) shp_system_name <- "ISO" } else { stop("shp_system ", shp_system, " is not defined yet.") } if (!is.null(reg_names)) { warning("Only use 'reg_ids' to get the shape region. 'reg_names' is not used.") } } else if (!is.null(reg_names)) { shp_system_name <- NULL ## Method 2: Use country code & region name for (cntr_i in 1:length(reg_names)) { if (shp_system == "NUTS") { tmp <- subset(shp, CNTR_CODE == names(reg_names)[cntr_i]) tmp <- subset(tmp, NUTS_NAME %in% reg_names[[cntr_i]]) shp_system_name <- NUTS_NAME } else if (shp_system == "ADM") { tmp <- subset(shp, ADM0_EN == names(reg_names)[cntr_i]) tmp <- subset(tmp, ADM1_EN %in% reg_names[[cntr_i]]) shp_system_name <- ADM1_EN } else if (shp_system == "GADM") { tmp <- subset(shp, Name %in% reg_names) shp_system_name <- Name } if (cntr_i == 1) { shp_tmp <- tmp } else { shp_tmp <- rbind(shp_tmp, tmp) } } if (shp_system == "NUTS") { shp <- subset(shp_tmp, LEVL_CODE == reg_level) } else if (shp_system == "ADM" | shp_system == "GADM") { shp <- shp_tmp } } # Step 2: Use the reference file to get lat and lon if (all(tools::file_ext(ref_grid) == 'nc')) { if (!file.exists(ref_grid)) { stop("ref_grid file does not exist.") } else { ## Method 1: ref_grid is a netCDF file if (is.null(lat_dim) | is.null(lon_dim)) { var_names <- easyNCDF::NcReadVarNames(ref_grid) lat_dim <- var_names[which(var_names %in% s2dv:::.KnownLatNames())] lon_dim <- var_names[which(var_names %in% s2dv:::.KnownLonNames())] } latlon <- NcToArray(ref_grid, vars_to_read = c(lat_dim, lon_dim)) lat <- NcToArray(ref_grid, vars_to_read = lat_dim)[1, ] lon <- NcToArray(ref_grid, vars_to_read = lon_dim)[1, ] } } else if (is.list(ref_grid)) { ## Method 2: ref_grid is a list of lon and lat if (length(ref_grid) != 2) { stop("If 'ref_grid' is a list, it must have two items for longitude and latitude.") } if (is.null(lat_dim) | is.null(lon_dim)) { # NOTE: the names come from s2dv:::.KnownLonNames and .KnownLatNames lon_known_names <- c(s2dv:::.KnownLonNames(), 'lons') lat_known_names <- c(s2dv:::.KnownLatNames(), 'lats') lon_dim <- lon_known_names[which(lon_known_names %in% names(ref_grid))] lat_dim <- lat_known_names[which(lat_known_names %in% names(ref_grid))] if (identical(lon_dim, character(0)) | identical(lat_dim, character(0))) { stop("longitude and latitude names are not recognized in 'ref_grid'. Please specify 'lon_dim' and 'lat_dim'.") } } lat <- ref_grid[[lat_dim]] lon <- ref_grid[[lon_dim]] } else { stop("Parameter 'ref_grid' must be either a netCDF file or a list of lon and lat.") } ## Create data frame & sp class for ref grid ref.df <- data.frame(data = 0, lon = rep(lon, times = length(lat)), lat = rep(lat, each = length(lon))) coord <- as.matrix(data.frame(x = ref.df$lon, y = ref.df$lat)) xy.sfg <- st_multipoint(coord) xy.sfc <- st_sfc(xy.sfg) # Assign crs of original shapefile if (!is.null(target_crs)) { st_crs(xy.sfc) <- sf::st_crs(target_crs) #initial_crs # asign crs of original shapefile xy.sfc <- sf::st_transform(xy.sfc, st_crs(shp)) } else { st_crs(xy.sfc) <- sf::st_crs(shp) } # Step 3: Create mask ## Create mask array with 0; 1, 2, etc. will be filled in for each shp region mask <- array(0, dim = c(length(lon), length(lat))) names(dim(mask)) <- c(lon_dim, lat_dim) if (region) { mask <- array(0, dim = c(nrow(shp), length(lon), length(lat))) names(dim(mask)) <- c('region', lon_dim, lat_dim) } if (check_valid) { xy.sfc <- st_make_valid(xy.sfc) shp <- st_make_valid(shp) } ## Loop through each shp region for (shp_i in 1:nrow(shp)) { # NOTE: Don't know it's a problem in st_intersection or st_coordinates, tmp_coords may # not be identical as lon/lat. E.g., (29, 65) may be (29 - -3.552714e-15, 65). shp_pol <- sf::st_intersection(xy.sfc, shp[shp_i, ]) # shp_pol <- sf::st_intersection(xy.sfc, shp[shp_i, ], ...) tmp_coords <- st_coordinates(shp_pol)[, 1:2] if (length(tmp_coords) == 0) { dim(tmp_coords) <- NULL } else if (is.null(dim(tmp_coords))) { tmp_coords <- array(tmp_coords, dim = c(1, length(tmp_coords))) } if (!is.null(dim(tmp_coords))) { # polygon_instersection for (ii in 1:nrow(tmp_coords)) { # pt_x <- which(lon == tmp_coords[ii, 1]) # pt_y <- which.min(abs(lat - tmp_coords[ii, 2])) if (length(dim(mask)) == 2) { # min(abs(lon - tmp_coords[ii, 1])) # min(abs(lat - tmp_coords[ii, 2])) mask[which.min(abs(lon - tmp_coords[ii, 1])), which.min(abs(lat - tmp_coords[ii, 2]))] <- shp_i } else { mask[shp_i, which.min(abs(lon - tmp_coords[ii, 1])), which.min(abs(lat - tmp_coords[ii, 2]))] <- 1 } } } else if (find_min_dist) { x.centroid.shpi <- sf::st_coordinates(sf::st_centroid(shp[shp_i,]))[,1] y.centroid.shpi <- sf::st_coordinates(sf::st_centroid(shp[shp_i,]))[,2] dist <- sqrt((xy.sfg[,1] - x.centroid.shpi)**2 + (xy.sfg[,2] - y.centroid.shpi)**2) tmp_coords <- array(xy.sfg[which(dist == min(dist, na.rm = TRUE)),], dim = c(1,2)) colnames(tmp_coords) <- c('X', 'Y') if (max(dist) <= max_dist & (any(round(lat,2) == round(tmp_coords[1,2],2)) & any(round(lon,2) == round(tmp_coords[1,1],2))) ) { if (length(dim(mask)) == 2) { mask[which.min(abs(lon - tmp_coords[, 1])), which.min(abs(lat - tmp_coords[, 2]))] <- shp_i } else { mask[shp_i, which.min(abs(lon - tmp_coords[, 1])), which.min(abs(lat - tmp_coords[, 2]))] <- 1 } warning(paste0('The reference grid has no intersection with region ', ifelse(is.character(shp_system_name), shp[shp_i,][[shp_system_name]], paste0('n° ', shp_i)), ' from the shapefile; the provided grid cell is at a distance of ', dist[which(dist == min(dist, na.rm = TRUE))], ' to the centroid of the region (units are: ° or meters depending on the crs of the shapefile).')) } else { warning(paste0('The reference grid has no intersection with region ', ifelse(is.character(shp_system_name), shp[shp_i,][[shp_system_name]], paste0('n° ', shp_i)))) } } } # Step 4: Add attributes # attr(mask, lon_dim) <- lon # attr(mask, lat_dim) <- lat # if (shp_system == "NUTS") { # attr(mask, "index") <- as.list(shp$NUTS_ID) # } else if (shp_system == "ADM") { # attr(mask, "index") <- as.list(shp$ADM1_PCODE) # } else if (shp_system == "GADM") { # attr(mask, "index") <- as.list(shp$ISO) # } # names(attr(mask, "index")) <- 1:nrow(shp) # ## Return all the info from shp # attr(mask, "shapefile") <- attributes(shp) # Step 5: Save the file or return the array if (!savefile) { return(mask) } else { warning("This functionality is not developed yet.") # TODO # ArrayToNc() } }