ShapeToMask.R 14 KB
Newer Older
Eva Rifà's avatar
Eva Rifà committed
#'Convert Shapefile to Mask Array
#' 
#'This function reads a shapefile (.shp) containing information about polygonal 
#'regions. It then transfers the shapefile data into an array and subsets the 
#'output based on requested region names or IDs. The accepted shapefile 
#'databases are 'NUTS', 'LAU', and 'GADM', each with its own unique format. 
#'However, the function can use other shapefiles databases with specifying the 
#'categories names with the parameter 'shp.system.name'. 
#'
#'To ensure accurate comparison with the shapefile, the function loads a 
#'reference dataset that provides longitude and latitude information. By 
#'intersecting each subset of the shapefile with the reference coordinates, the 
#'function selects only the desired regions. The final step involves creating a 
#'mask array. Depending on the chosen option, the mask array is either returned 
#'as the function's output or saved into a NetCDF format in the specified 
#'directory.
#'
#'Note: Modules GDAL, PROJ and GEOS are required.
#' 
#'@param shp.file A character string indicating the shp file path.
#'@param ref.grid A character string indicating the path to the reference 
#'  data. Either (1) a netCDF file or (2) a list of lon and lat to provide the 
#'  reference grid points. It is NULL by default.
#'@param shp.system A character string containing the Shapefile System Database 
#'  name. The accepted systems are: 'NUTS', 'LAU', and 'GADM'. It is set to 
#'  'NUTS' by default.
#'@param shp.system.name A character string indicating the column name of the 
#'  column in where the specified 'reg.ids' will be taken.
#'@param reg.ids A character string indicating the unique ID in shapefile. 
#'  It is NULL by default.
#'@param reg.names A named list of character string vectors indicating the 
#'  country and the region name. The name of the list stands for the country 
#'  name code and the vector character strings indicate the region name for 
#'  each country. It is NULL by default.
#'@param reg.level An integer number from 1 to 3 indicating the 'NUTS' dataset 
#'  level. For other datasets this parameter is not used. One mask can only have 
#'  a unique level. It is set to 3 by default.
#'@param lat_dim A character string indicating the latitudinal dimension. If it 
#'  is NULL, the latitudinal name will be searched using an internal function 
#'  with the following possible names: 'lat', 'latitude', 'y', 'j' and 
#'  'nav_lat'. It is NULL by default.
#'@param lon_dim A character string indicating the longitudinal dimension. If it 
#'  is NULL, the longitudinal name will be searched using an internal function 
#'  with the following possible names: 'lon', 'longitude', 'x', 'i' and 
#'  'nav_lon'. It is NULL by default.
#'@param target_crs A character string indicating the target 'Coordinate 
#'  Reference System'.
#'@param region A logical value indicating if we want a dimension for the 
#'  regions in the resulting mask array. It is FALSE by default.
#'@param check_valid A logical value that when it is TRUE it uses the function 
#'  'sf::st_make_valid' applied to the shapefile and to the coordinates.
#'@param max_dist A numeric value indicating the maximum distance is accepted 
#'  to the closest gridpoint when there is no intersection between the shapefile 
#'  and the reference grid. 
#'@param savefile A logical value indicating wether to save the mask array into 
#'  a NetCDF format file (TRUE) or to return an array (FALSE). It is FALSE by 
#'  default. This functionality is not developed yet.
#'@param ... Arguments passed on to 's2_options' in function 'st_intersection'.  
#'  See 's2 package'.
#' 
#'@return A multidimensional array containing a mask array with longitude and 
#'latitude dimensions. If 'region' is TRUE, there will be a dimension for 
#'the region. 
#' 
#'@examples 
#'# Exmple (1): NUTS
#'shp.file <- paste0('/esarchive/shapefiles/NUTS3/NUTS_RG_60M_2021_4326.shp/', 
#'                   'NUTS_RG_60M_2021_4326.shp')
#'# shp.file <- paste0('/esarchive/scratch/cdelgado/focus_outputs/Shapefiles/',
#'#                    'tza_admbnda_adm1/tza_admbnda_adm1_20181019.shp')
#'# ref.grid <- paste0('/esarchive/exp/ecmwf/system5c3s/monthly_mean/',
#'#                    'tas_f6h/tas_20170201.nc')
#'# ref.grid <- paste0('/esarchive/exp/ecmwf/s2s-monthly_ensfor/weekly_mean/',
#'#                    'tas_f6h/tas_20191212.nc')
#'ref.grid <- paste0('/esarchive/recon/ecmwf/era5land/monthly_mean/',
#'                   'tas_f1h/tas_201006.nc')
#'# ref.grid <- list(lon = seq(10, 40, 0.5), lat = seq(40, 85, 0.5))
#'
#'NUTS.id <- paste0("FI1D", c(1:3, 5, 7:9))  
#'NUTS.name <- list(FI = c('Lappi', 'Kainuu'), SI = c('Pomurska', 'Podravska'))
#'mask1 <- ShapeToMask(shp.file, ref.grid, reg.ids = NUTS.id)
#'mask2 <- ShapeToMask(shp.file = shp.file, ref.grid = ref.grid, 
#'                  reg.names = NUTS.name)
#'
#'# Exmple (2): GADM
#'shp.file <- "/esarchive/shapefiles/gadm_country_mask/gadm_country_ISO3166.shp"
#'ref.grid <- paste0('/esarchive/exp/ecmwf/s2s-monthly_ensfor/weekly_mean/',
#'                   'tas_f6h/tas_20191212.nc')
#'GADM.id <- c("ESP", "ITA")
#'GADM.name <- c("Spain", "Italy")
#'mask1 <- ShapeToMask(shp.file = shp.file, ref.grid = ref.grid, 
#'                  reg.ids = GADM.id, shp.system = "GADM")
#'mask2 <- ShapeToMask(shp.file = shp.file, ref.grid = ref.grid, 
#'                  reg.names = GADM.name, shp.system = "GADM")
#'@import easyNCDF
#'@import sf
#'@export
ShapeToMask <- function(shp.file, ref.grid = NULL, 
                        shp.system = "NUTS", shp.system.name = NULL, 
                        reg.ids = NULL, reg.names = NULL,
                        reg.level = 3, lat_dim = NULL, lon_dim = NULL, 
                        savefile = FALSE, region = FALSE, target_crs = NULL, 
                        check_valid = FALSE, max_dist = 99999999, ...) {

  # NOTE: One region is one number; need to have the option to combine them?
  # TODO: Suppress warnings?
  # TODO: Substitute packages

  # Step 1: Load the shapefile
  shp <- sf::st_read(shp.file)  # class sf
  if (!is.null(target_crs)) {
    transformed_shapefile <- st_transform(shp, crs = target_crs)
    shp <- transformed_shapefile
  }
 
  if (all(is.null(reg.ids), is.null(reg.names))) {
    stop("Either provide parameter 'reg.ids' or 'reg.names'.")
  } else if (!is.null(reg.ids)) {
    ## Method 1: Directly use IDs
    if (!is.null(shp.system.name)) {
      if (shp.system.name %in% names(shp)) {
        shp <- subset(shp, get(shp.system.name) %in% reg.ids)
      } else {
        stop("Shape system name not found in shapefile names.")
      }
    } else if (shp.system == "NUTS") {
      shp <- subset(shp, NUTS_ID %in% reg.ids)
      shp.system.name <- NUTS_ID
    } else if (shp.system == "ADM") {
      shp <- subset(shp, ADM1_PCODE %in% reg.ids)
      shp.system.name <- ADM1_PCODE
    } else if (shp.system == "GADM") {
      shp <- subset(shp, ISO %in% reg.ids)
      shp.system.name <- ISO
    } else {
      stop("shp.system ", shp.system, " is not defined yet.")
    }
    if (!is.null(reg.names)) {
      warning("Only use 'reg.ids' to get the shape region. 'reg.names' is not used.")
    }
  
  } else if (!is.null(reg.names)) {
    shp.system.name <- NULL
    ## Method 2: Use country code & region name
    for (cntr_i in 1:length(reg.names)) {
      if (shp.system == "NUTS") {
        tmp <- subset(shp, CNTR_CODE == names(reg.names)[cntr_i])
        tmp <- subset(tmp, NUTS_NAME %in% reg.names[[cntr_i]])
        shp.system.name <- NUTS_NAME
      } else if (shp.system == "ADM") {
        tmp <- subset(shp, ADM0_EN == names(reg.names)[cntr_i])
        tmp <- subset(tmp, ADM1_EN %in% reg.names[[cntr_i]])
        shp.system.name <- ADM1_EN
      } else if (shp.system == "GADM") {
        tmp <- subset(shp, Name %in% reg.names)
        shp.system.name <- Name
      }
      if (cntr_i == 1) {
        shp_tmp <- tmp
      } else {
        shp_tmp <- rbind(shp_tmp, tmp)
      }
    }
    if (shp.system == "NUTS") {
      shp <- subset(shp_tmp, LEVL_CODE == reg.level)
    } else if (shp.system == "ADM" | shp.system == "GADM") {
      shp <- shp_tmp
    }
  }
  
  # Step 2: Use the reference file to get lat and lon
 
  if (all(tools::file_ext(ref.grid) == 'nc')) {
    if (!file.exists(ref.grid)) {
      stop("ref.grid file does not exist.")
    } else {
    
      ## Method 1: ref.grid is a netCDF file
      if (is.null(lat_dim) | is.null(lon_dim)) {
        var_names <- easyNCDF::NcReadVarNames(ref.grid)
        lat_dim <- var_names[which(var_names %in% s2dv:::.KnownLatNames())]
        lon_dim <- var_names[which(var_names %in% s2dv:::.KnownLonNames())]
      }
      latlon <- NcToArray(ref.grid, vars_to_read = c(lat_dim, lon_dim))
      lat <- NcToArray(ref.grid, vars_to_read = lat_dim)[1, ]
      lon <- NcToArray(ref.grid, vars_to_read = lon_dim)[1, ]
    }
  } else if (is.list(ref.grid)) {
    ## Method 2: ref.grid is a list of lon and lat
    if (length(ref.grid) != 2) {
      stop("If 'ref.grid' is a list, it must have two items for longitude and latitude.")
    }
    if (is.null(lat_dim) | is.null(lon_dim)) {
      # NOTE: the names come from s2dv:::.KnownLonNames and .KnownLatNames
      lon_known_names <- c(s2dv:::.KnownLonNames(), 'lons')
      lat_known_names <- c(s2dv:::.KnownLatNames(), 'lats')
      lon_dim <- lon_known_names[which(lon_known_names %in% names(ref.grid))]
      lat_dim <- lat_known_names[which(lat_known_names %in% names(ref.grid))]

      if (identical(lon_dim, character(0)) | identical(lat_dim, character(0))) {
        stop("longitude and latitude names are not recognized in 'ref.grid'. Please specify 'lon_dim' and 'lat_dim'.")
      }
    }
    lat <- ref.grid[[lat_dim]]
    lon <- ref.grid[[lon_dim]]

  } else {
    stop("Parameter 'ref.grid' must be either a netCDF file or a list of lon and lat.")
  }

  ## Create data frame & sp class for ref grid
  ref.df <- data.frame(data = 0,
                       lon = rep(lon, times = length(lat)),
                       lat = rep(lat, each = length(lon)))
  coord <- as.matrix(data.frame(x = ref.df$lon, y = ref.df$lat))
  
  xy.sfg <- st_multipoint(coord)
  xy.sfc <- st_sfc(xy.sfg)
  # Assign crs of original shapefile
  if (!is.null(target_crs)) {
    st_crs(xy.sfc) <- sf::st_crs(target_crs) #initial_crs # asign crs of original shapefile
    xy.sfc <- sf::st_transform(xy.sfc, st_crs(shp)) 
  } else {
    st_crs(xy.sfc) <- sf::st_crs(shp)
  }

  # Step 3: Create mask 
  ## Create mask array with 0; 1, 2, etc. will be filled in for each shp region
  mask <- array(0, dim = c(length(lon), length(lat)))
  names(dim(mask)) <- c(lon_dim, lat_dim)
  if (region) {
    mask <- array(0, dim = c(nrow(shp), length(lon), length(lat)))
    names(dim(mask)) <- c('region', lon_dim, lat_dim)
  }
  
  if (check_valid) {
    xy.sfc <- st_make_valid(xy.sfc)
    shp <- st_make_valid(shp)
  }

  ## Loop through each shp region
  for (shp_i in 1:nrow(shp)) {
    # NOTE: Don't know it's a problem in st_intersection or st_coordinates, tmp_coords may 
    #       not be identical as lon/lat. E.g., (29, 65) may be (29 - -3.552714e-15, 65).
    shp_pol <- sf::st_intersection(xy.sfc, shp[shp_i, ])
    # shp_pol <- sf::st_intersection(xy.sfc, shp[shp_i, ], ...)
    tmp_coords <- st_coordinates(shp_pol)[, 1:2]
    if (length(tmp_coords) == 0) {
      dim(tmp_coords) <- NULL
    } else if (is.null(dim(tmp_coords))) {
      tmp_coords <- array(tmp_coords, dim = c(1, length(tmp_coords)))
    }
    if (!is.null(dim(tmp_coords))) {

      # polygon_instersection
      for (ii in 1:nrow(tmp_coords)) {
        # pt_x <- which(lon == tmp_coords[ii, 1])
        # pt_y <- which.min(abs(lat - tmp_coords[ii, 2]))
        if (length(dim(mask)) == 2) {
          # min(abs(lon - tmp_coords[ii, 1]))
          # min(abs(lat - tmp_coords[ii, 2]))
          mask[which.min(abs(lon - tmp_coords[ii, 1])),
               which.min(abs(lat - tmp_coords[ii, 2]))] <- shp_i
        } else {
          mask[shp_i, which.min(abs(lon - tmp_coords[ii, 1])),
               which.min(abs(lat - tmp_coords[ii, 2]))] <- 1
        }
      }
    } else {
      x.centroid.shpi <- sf::st_coordinates(sf::st_centroid(shp[shp_i,]))[,1]
      y.centroid.shpi <- sf::st_coordinates(sf::st_centroid(shp[shp_i,]))[,2]
      dist <- sqrt((xy.sfg[,1] - x.centroid.shpi)**2 + (xy.sfg[,2] - y.centroid.shpi)**2)
      tmp_coords <- array(xy.sfg[which(dist == min(dist, na.rm = TRUE)),], dim = c(1,2))
      colnames(tmp_coords) <- c('X', 'Y')
      if (max(dist) <= max_dist & (any(round(lat,2) == round(tmp_coords[1,2],2)) & 
                                   any(round(lon,2) == round(tmp_coords[1,1],2))) ) {
        if (length(dim(mask)) == 2) {
          mask[which.min(abs(lon - tmp_coords[, 1])),
              which.min(abs(lat - tmp_coords[, 2]))] <- shp_i
        } else {
          mask[shp_i, which.min(abs(lon - tmp_coords[, 1])),
              which.min(abs(lat - tmp_coords[, 2]))] <- 1
        }
        warning(paste0('The reference grid has no intersection with region ', 
                ifelse(is.character(shp.system.name), shp[shp_i,][[shp.system.name]], paste0('n° ', shp_i)), 
                ' from the shapefile; the provided grid cell is at a distance of ', dist[which(dist == min(dist, na.rm = TRUE))], 
                ' to the centroid of the region (units are: ° or meters depending on the crs of the shapefile).'))
      } else {
        warning(paste0('The reference grid has no intersection with region ', 
                ifelse(is.character(shp.system.name), shp[shp_i,][[shp.system.name]], paste0('n° ', shp_i))))
      }
    }
  }
  
  # Step 4: Add attributes
  # attr(mask, lon_dim) <- lon
  # attr(mask, lat_dim) <- lat
  # if (shp.system == "NUTS") {
  #   attr(mask, "index") <- as.list(shp$NUTS_ID)
  # } else if (shp.system == "ADM") {
  #   attr(mask, "index") <- as.list(shp$ADM1_PCODE)
  # } else if (shp.system == "GADM") {
  #   attr(mask, "index") <- as.list(shp$ISO)
  # }
  # names(attr(mask, "index")) <- 1:nrow(shp)

  # ## Return all the info from shp
  # attr(mask, "shapefile") <- attributes(shp)
  
  # Step 5: Save the file or return the array 
  if (!savefile) {
    return(mask)
  } else {
    warning("This functionality is not developed yet.")
    # TODO
    # ArrayToNc()
  }
}