PlotForecastPDF.R

#'Plot one or multiple ensemble forecast pdfs for the same event
#'
#'@author Llorenç Lledó \email{llledo@bsc.es}
#'@description This function plots the probability distribution function of 
#'several ensemble forecasts. Separate panels are used to plot forecasts valid 
#'or initialized at different times or by different models or even at different 
#'locations. Probabilities for tercile categories are computed, plotted in 
#'colors and annotated. An asterisk marks the tercile with higher probabilities. 
#'Probabilities for extreme categories (above P90 and below P10) can also be 
#'included as hatched areas. Individual ensemble members can be plotted as 
#'jittered points. The observed value is optionally shown as a diamond.
#'
#'@param fcst A dataframe or array containing all the ensember members for each 
#'  forecast. If \code{'fcst'} is an array, it should have two labelled 
#'  dimensions, and one of them should be \code{'members'}. If \code{'fcsts'} is 
#'  a data.frame, each column shoul be a separate forecast, with the rows beeing 
#'  the different ensemble members.
#'@param tercile.limits An array or vector with P33 and P66 values that define 
#'  the tercile categories for each panel. Use an array of dimensions 
#'  (nforecasts,2) to  define different terciles for each forecast panel, or a 
#'  vector with two elements to reuse the same tercile limits for all forecast 
#'  panels.
#'@param extreme.limits (optional) An array or vector with P10 and P90 values 
#'  that define the extreme categories for each panel. Use an array of 
#'  (nforecasts,2) to define different extreme limits for each forecast panel, 
#'  or a vector with two elements to reuse the same tercile limits for all 
#'  forecast panels. (Default: extreme categories are not shown).
#'@param obs (optional) A vector providing the observed values for each forecast 
#'  panel or a single value that will be reused for all forecast panels. 
#'  (Default: observation is not shown).
#'@param plotfile (optional) A filename (pdf, png...) where the plot will be 
#'  saved. (Default: the plot is not saved).
#'@param title A string with the plot title.
#'@param var.name A string with the variable name and units.
#'@param fcst.names (optional) An array of strings with the titles of each 
#'  individual forecast.
#'@param add.ensmemb Either to add the ensemble members \code{'above'} (default) 
#'  or \code{'below'} the pdf, or not (\code{'no'}).
#'@param color.set A selection of predefined color sets: use \code{'ggplot'} 
#'  (default) for blue/green/red, \code{'s2s4e'} for blue/grey/orange, 
#'  \code{'hydro'} for yellow/gray/blue (suitable for precipitation and 
#'  inflows) or the \code{"vitigeoss"} color set.
#'@param memb_dim A character string indicating the name of the member 
#'  dimension.
#'
#'@return A ggplot object containing the plot.
#'
#'@importFrom data.table data.table
#'@importFrom data.table CJ
#'@importFrom data.table setkey
#'@import ggplot2
#'@importFrom reshape2 melt
#'@importFrom plyr .
#'@importFrom plyr dlply
#'@importFrom s2dv InsertDim
#'@examples
#'fcsts <- data.frame(fcst1 = rnorm(10), fcst2 = rnorm(10, 0.5, 1.2))
#'PlotForecastPDF(fcsts,c(-1,1))
#'@export
PlotForecastPDF <- function(fcst, tercile.limits, extreme.limits = NULL, obs = NULL, 
                            plotfile = NULL, title = "Set a title", var.name = "Varname (units)", 
                            fcst.names = NULL, add.ensmemb = c("above", "below", "no"), 
                            color.set = c("ggplot", "s2s4e", "hydro", "vitigeoss"), 
                            memb_dim = 'member') {
    value <- init <- extremes <- x <- ymin <- ymax <- tercile <- NULL
    y <- xend <- yend <- yjitter <- MLT <- lab.pos <- NULL
    ggColorHue <- function(n) {
      hues <- seq(15, 375, length = n + 1)
      hcl(h = hues, l = 65, c = 100)[1:n]
    }
    #------------------------
    # Define color sets
    #------------------------
    color.set <- match.arg(color.set)
    if (color.set == "s2s4e") {
        colorFill <- c("#FF764D", "#b5b5b5", "#33BFD1") # AN, N, BN fill colors
        colorHatch <- c("indianred3", "deepskyblue3") # AP90, BP10 line colors
        colorMember <- c("#ffff7f")
        colorObs <- "purple"
        colorLab <- c("red", "blue") # AP90, BP10 text colors
    } else if (color.set == "hydro") {
        colorFill <- c("#41CBC9", "#b5b5b5", "#FFAB38")
        colorHatch <- c("deepskyblue3", "darkorange1")
        colorMember <- c("#ffff7f")
        colorObs <- "purple"
        colorLab <- c("blue", "darkorange3")
    } else if (color.set == "ggplot") {
        colorFill <- ggColorHue(3)
        colorHatch <- c("indianred3", "deepskyblue3")
        colorMember <- c("#ffff7f")
        colorObs <- "purple"
        colorLab <- c("red", "blue")
    } else if (color.set == "vitigeoss") {
        colorFill <- rev(c("#007be2", "#acb2b5", "#f40000"))
        colorHatch <- rev(c("#211b79", "#ae0003"))
        colorMember <- c("#ffff7f")
        colorObs <- "purple"
        colorLab <- colorHatch
    } else {
        stop("Parameter 'color.set' should be one of ggplot/s2s4e/hydro")
    }
    #------------------------
    # Check input arguments
    #------------------------
    add.ensmemb <- match.arg(add.ensmemb)
    #------------------------
    # Check fcst type and convert to data.frame if needed
    #------------------------
    if (is.array(fcst)) {
        if (!memb_dim %in% names(dim(fcst)) | length(dim(fcst)) != 2) {
            stop("Parameter 'fcst' should be a two-dimensional array with labelled dimensions and one of them should be for member. The name of this dimension can be adjusted with 'memb_dim'.")
        }
        dim.members <- which(names(dim(fcst)) == memb_dim)
        if (dim.members == 1) {
            fcst.df <- data.frame(fcst)
        } else {
            fcst.df <- data.frame(t(fcst))
        }
    } else if (is.data.frame(fcst)) {
        fcst.df <- fcst
    } else {
        stop("Parameter 'fcst' should be an array or a data.frame")
    }
    npanels <- ncol(fcst.df)
    #------------------------
    # Check observations
    #------------------------
    if (!is.null(obs)) {
        if (!is.vector(obs)){
            stop("Observations should be a vector")
        } else if (!length(obs) %in% c(1, npanels)) {
            stop("The number of observations should equal one or the number of forecasts")
        }
    }
    #------------------------
    # Check tercile limits
    #------------------------
    if (is.vector(tercile.limits)) {
        if (length(tercile.limits) != 2) {
            stop("Provide two tercile limits")
        } 
        tercile.limits <- InsertDim(tercile.limits, 1, npanels, name = "new")
    } else if (is.array(tercile.limits)) {
        if (length(dim(tercile.limits)) == 2) {
            if (dim(tercile.limits)[2] != 2) {
                stop("Provide two tercile limits for each panel")
            }
            if (dim(tercile.limits)[1] != npanels) {
                stop("The number of tercile limits does not match the number of forecasts provided")
            }
        } else {
            stop("Tercile limits should have two dimensions")
        } 
    } else {
        stop("Tercile limits should be a vector of length two or an array of dimension (nfcsts,2)")
    }
    # check consistency of tercile limits
    if (any(tercile.limits[, 1] >= tercile.limits[, 2])) {
        stop("Inconsistent tercile limits")
    }
    #------------------------
    # Check extreme limits
    #------------------------
    if (!is.null(extreme.limits)) {
        if (is.vector(extreme.limits)) {
            if (length(extreme.limits) != 2) {
                stop("Provide two extreme limits")
            } 
            extreme.limits <- InsertDim(extreme.limits, 1, npanels, name = "new")
        } else if (is.array(extreme.limits)) {
            if (length(dim(extreme.limits)) == 2) {
                if (dim(extreme.limits)[2] != 2) {
                    stop("Provide two extreme limits for each panel")
                }
                if (dim(extreme.limits)[1] != npanels) {
                    stop("The number of extreme limits does not match the number of forecasts provided")
                }
            } else {
                stop("extreme limits should have two dimensions")
            }
        } else {
            stop("Extreme limits should be a vector of length two or an array of dimensions (nfcsts,2)")
        }
        # Check that extreme limits are consistent with tercile limits 
        if (any(extreme.limits[, 1] >= tercile.limits[, 1])) {
            stop("Inconsistent lower extreme limits")
        }
        if (any(extreme.limits[, 2] <= tercile.limits[, 2])) {
            stop("Inconsistent higher extreme limits")
        }
    }
    #------------------------
    # Set proper fcst names
    #------------------------
    if (!is.null(fcst.names)) {
        if (length(fcst.names) != dim(fcst.df)[2]) {
            stop("Parameter 'fcst.names' should be an array with as many names as distinct forecasts provided")
        }
        colnames(fcst.df) <- factor(fcst.names, levels = fcst.names)
    }
    #------------------------
    # Produce a first plot with the pdf for each init in a panel
    #------------------------
    melt.df <- reshape2::melt(fcst.df, variable.name = "init", id.vars = NULL)
    plot <- ggplot(melt.df, aes(x = value)) + 
            geom_density(alpha = 1, na.rm = T) + 
            coord_flip() + facet_wrap(~init, strip.position = "top", nrow = 1) +
            xlim(range(c(obs, density(melt.df$value, na.rm = T)$x)))
    ggp <- ggplot_build(plot)
    #------------------------
    # Gather the coordinates of the plots together with init and corresponding
    # terciles
    #------------------------
    tmp.df <- ggp$data[[1]][, c("x", "ymin", "ymax", "PANEL")]
    if (!is.null(ggp$layout$layout)) {
        tmp.df$init <- ggp$layout$layout$init[as.numeric(tmp.df$PANEL)]
    } else if (!is.null(ggp$layout$panel_layout)) {
        tmp.df$init <- ggp$layout$panel_layout$init[as.numeric(tmp.df$PANEL)]
    } else {
        stop("Cannot find PANELS in ggp object")
    }
    tmp.df$tercile <- factor(ifelse(tmp.df$x < tercile.limits[tmp.df$PANEL, 1], 
        "Below normal", ifelse(tmp.df$x < tercile.limits[tmp.df$PANEL, 2], 
        "Normal", "Above normal")), 
        levels = c("Above normal", "Normal", "Below normal"))
    #------------------------
    # Get the height and width of a panel
    #------------------------
    pan.width <- diff(range(tmp.df$x))
    pan.height <- max(tmp.df$ymax)
    magic.ratio <- 9 * pan.height/pan.width
    #------------------------
    # Compute hatch coordinates for extremes
    #------------------------
    if (!is.null(extreme.limits)) {
        tmp.df$extremes <- factor(ifelse(tmp.df$x < extreme.limits[tmp.df$PANEL, 1], 
            "Below P10", ifelse(tmp.df$x < extreme.limits[tmp.df$PANEL, 2], 
            "Normal", "Above P90")), 
            levels = c("Above P90", "Normal", "Below P10"))
        hatch.ls <- dlply(tmp.df, .(init, extremes), function(x) {
            # close the polygon
            tmp.df2 <- data.frame(x = c(x$x, max(x$x), min(x$x)), y = c(x$ymax, 0, 
                0))
            # compute the hatches for this polygon
            hatches <- .polygon.fullhatch(tmp.df2$x, tmp.df2$y, angle = 60, density = 10, 
                width.units = pan.width, height.units = pan.height)
            # add bottom segment
            end1 <- data.frame(x = x$x[1], y = x$ymax[1], xend = x$x[1], yend = 0)
            # add top segment
            end2 <- data.frame(x = x$x[length(x$x)], y = x$ymax[length(x$x)], xend = x$x[length(x$x)], 
                yend = 0)
            return(rbind(hatches, end1, end2))
        })
        attr <- attr(hatch.ls, "split_labels")
        for (i in 1:length(hatch.ls)) {
            hatch.ls[[i]] <- cbind(hatch.ls[[i]], attr[i, ], row.names = NULL)
        }
        hatch.df <- do.call("rbind", hatch.ls)
        # Compute max y for each extreme category
        max.ls <- dlply(tmp.df, .(init, extremes), function(x) {
            data.frame(y = min(0.85 * pan.height, max(x$ymax)))
        })
        attr <- attr(max.ls, "split_labels")
        for (i in 1:length(max.ls)) {
            max.ls[[i]] <- cbind(max.ls[[i]], attr[i, ], row.names = NULL)
        }
        max.df <- do.call("rbind", max.ls)
    }
    #------------------------
    # Compute jitter space for ensemble members
    #------------------------
    if (add.ensmemb != "no") {
        jitter.df <- reshape2::melt(data.frame(dlply(melt.df, .(init), function(x) {
            .jitter.ensmemb(sort(x$value, na.last = T), pan.width / 100)
        }), check.names = F), value.name = "yjitter", variable.name = "init", id.vars = NULL)
        jitter.df$x <- reshape2::melt(data.frame(dlply(melt.df, .(init), function(x) {
            sort(x$value, na.last = T)
        })), value.name = "x", id.vars = NULL)$x
    }
    #------------------------
    # Get y coordinates for observed x values, using a cool data.table feature: merge
    # to nearest value
    #------------------------
    if (!is.null(obs)) {
        tmp.dt <- data.table(tmp.df, key = c("init", "x"))
        obs.dt <- data.table(init = factor(colnames(fcst.df), levels = colnames(fcst.df)), 
            value = rep(obs, dim(fcst.df)[2]))
        setkey(obs.dt, init, value)
        obs.xy <- tmp.dt[obs.dt, roll = "nearest"]
    }
    #------------------------
    # Fill each pdf with different colors for the terciles
    #------------------------
    plot <- plot + 
        geom_ribbon(data = tmp.df, 
                    aes(x = x, ymin = ymin, ymax = ymax, fill = tercile),
                    alpha = 0.7)
    #------------------------
    # Add hatches for extremes
    #------------------------
    if (!is.null(extreme.limits)) {
        if (nrow(hatch.df[hatch.df$extremes != "Normal", ]) == 0) {
            warning("The provided extreme categories are outside the plot bounds. The extremes will not be drawn.")
            extreme.limits <- NULL
        } else {
            plot <- plot + 
                geom_segment(data = hatch.df[hatch.df$extremes != "Normal", ],
                             aes(x = x, y = y, 
                                 xend = xend, yend = yend, color = extremes))
        }
    }
    #------------------------
    # Add obs line
    #------------------------
    if (!is.null(obs)) {
        plot <- plot + 
            geom_vline(data = obs.dt, 
                       aes(xintercept = value),
                       linetype = "dashed", color = colorObs)
    }
    #------------------------
    # Add ensemble members
    #------------------------
    if (add.ensmemb == "below") {
        plot <- plot + 
            # this adds a grey box for ensmembers
            geom_rect(aes(xmin = -Inf, xmax = Inf, 
                          ymin = -Inf, ymax = -pan.height / 10), 
                      fill = "gray95", color = "black", width = 0.2) + 
            # this adds the ensemble members
            geom_point(data = jitter.df,
                       aes(x = x, 
                           y = -pan.height / 10 - magic.ratio * yjitter, 
                           shape = "Ensemble members"),
                       color = "black", fill = colorMember, alpha = 1)
            
    } else if (add.ensmemb == "above") {
        plot <- plot + 
            geom_point(data = jitter.df,
                       aes(x = x, 
                           y = 0.7 * magic.ratio * yjitter, 
                           shape = "Ensemble members"),
                       color = "black", fill = colorMember, alpha = 1)
            
    }
    #------------------------
    # Add obs diamond
    #------------------------
    if (!is.null(obs)) {
        plot <- plot + 
            # this adds the obs diamond
            geom_point(data = obs.xy,
                       aes(x = x, y = ymax, size = "Observation"),
                       shape = 23, color = "black", fill = colorObs)
    }
    #------------------------
    # Compute probability for each tercile and identify MLT
    #------------------------
    tmp.dt <- data.table(tmp.df)
    pct <- tmp.dt[, .(pct = integrate(approxfun(x, ymax), lower = min(x), upper = max(x))$value), 
        by = .(init, tercile)]
    # include potentially missing groups
    pct <- merge(pct, CJ(init = factor(levels(pct$init), levels = levels(pct$init)), 
                         tercile = factor(c("Below normal", "Normal", "Above normal"), 
                         levels = c("Above normal", "Normal", "Below normal"))),
                 by = c("init", "tercile"), all.y = T)
    pct[is.na(pct),"pct"] <- 0
    tot <- pct[, .(tot = sum(pct)), by = init]
    pct <- merge(pct, tot, by = "init")
    pct$pct <- round(100 * pct$pct / pct$tot, 0)
    pct$MLT <- pct[, .(MLT = pct == max(pct)), by = init]$MLT
    pct <- pct[order(init, tercile)]
    pct$lab.pos <- as.vector(apply(tercile.limits, 1, function(x) {c(max(x), mean(x), min(x))}))
    #------------------------
    # Compute probability for extremes
    #------------------------
    if (!is.null(extreme.limits)) {
        pct2 <- tmp.dt[, .(pct = integrate(approxfun(x, ymax), lower = min(x), upper = max(x))$value), 
            by = .(init, extremes)]
        # include potentially missing groups
        pct2 <- merge(pct2, CJ(init = factor(levels(pct2$init), levels = levels(pct2$init)), 
                               extremes = factor(c("Below P10", "Normal", "Above P90"),
                               levels = c("Above P90", "Normal", "Below P10"))),
                      by = c("init", "extremes"), all.y=T)
        pct2[is.na(pct),"pct"] <- 0
        tot2 <- pct2[, .(tot = sum(pct)), by = init]
        pct2 <- merge(pct2, tot2, by = "init")
        pct2$pct <- round(100 * pct2$pct / pct2$tot, 0)
        pct2 <- pct2[order(init, extremes)]
        pct2$lab.pos <-  as.vector(apply(extreme.limits, 1, function(x) {c(x[2], NA, x[1])}))
        pct2 <- merge(pct2, max.df, by = c("init", "extremes"), all.x = T)
    }
    #------------------------
    # Add probability labels for terciles
    #------------------------
    if (add.ensmemb == "above") {
        labpos <- -0.2 * pan.height
        vjust <- 0
    } else {
        labpos <- 0
        vjust <- -0.5
    }
    plot <- plot + 
        geom_text(data = pct,
                  aes(x = lab.pos, y = labpos, label = paste0(pct, "%"),
                      hjust = as.integer(tercile) * -1.5 + 3.5), 
                  vjust = vjust, angle = -90, size = 3.2) + 
        geom_text(data = pct[MLT == T, ],
                  aes(x = lab.pos, y = labpos, label = "*",
                      hjust = as.integer(tercile) * -3.5 + 9),
                  vjust = 0.1, angle = -90, size = 7, color = "black")
    #------------------------
    # Add probability labels for extremes
    #------------------------
    if (!is.null(extreme.limits)) {
        plot <- plot + 
            geom_text(data = pct2[extremes != "Normal", ],
                      aes(x = lab.pos, y = 0.9 * y, label = paste0(pct, "%"),
                          hjust = as.integer(extremes) * -1.5 + 3.5),
                      vjust = -0.5, angle = -90, size = 3.2, 
                      color = rep(colorLab, dim(fcst.df)[2]))
    }
    #------------------------
    # Finish all theme and legend details
    #------------------------
    plot <- plot + 
        theme_minimal() + 
        scale_fill_manual(name = "Probability of\nterciles", 
                          values = colorFill, drop = F) + 
        scale_color_manual(name = "Probability of\nextremes", 
                           values = colorHatch) + 
        scale_shape_manual(name = "Ensemble\nmembers", 
                           values = c(21)) + 
        scale_size_manual(name = "Observation", 
                          values = c(3)) + 
        labs(x = var.name, 
             y = "Probability density\n(total area=1)",
             title = title) +
        theme(axis.text.x = element_blank(),
              panel.grid.minor.x = element_blank(), 
              legend.key.size = unit(0.3, "in"),
              panel.border = element_rect(fill = NA, color = "black"),
              strip.background = element_rect(colour = "black", fill = "gray80"), 
              panel.spacing = unit(0.2, "in"), 
              panel.grid.major.x = element_line(color = "grey93")) + 
        guides(fill = guide_legend(order = 1), 
               color = guide_legend(order = 2), 
               shape = guide_legend(order = 3, label = F),
               size = guide_legend(order = 4, label = F))
    #------------------------
    # Save to plotfile if needed, and return plot
    #------------------------
    if (!is.null(plotfile)) {
        ggsave(plotfile, plot)
    }
    return(plot)
}
.jitter.ensmemb <- function(x, thr = 0.1) {
    # Idea: start with first level. Loop all points, and if distance to last point in
    # the level is more than a threshold, include the point to the level.  Otherwise
    # keep the point for another round.  Do one round in each direction to avoid
    # uggly patterns.
    if (is.unsorted(x, na.rm = T)) {
        stop("Provide a sorted array!")
    }
    lev <- x * 0
    lev[is.na(lev)] <- -1
    level <- 1
    while (any(lev == 0)) {
        last <- -1/0
        for (i in 1:length(x)) {
            if (lev[i] != 0) {
                next
            }
            if (x[i] - last > thr) {
                lev[i] <- level
                last <- x[i]
            }
        }
        level <- level + 1
        last <- 1/0
        for (i in seq(length(x), 1, -1)) {
            if (lev[i] != 0) {
                next
            }
            if (last - x[i] > thr) {
                lev[i] <- level
                last <- x[i]
            }
        }
        level <- level + 1
    }
    lev[lev == -1] <- NA
    return(lev * thr * sqrt(3)/2)
}
.polygon.onehatch <- function(x, y, x0, y0, xd, yd, fillOddEven = F) {
    halfplane <- as.integer(xd * (y - y0) - yd * (x - x0) <= 0)
    cross <- halfplane[-1L] - halfplane[-length(halfplane)]
    does.cross <- cross != 0
    if (!any(does.cross)) {
        return()
    }
    x1 <- x[-length(x)][does.cross]
    y1 <- y[-length(y)][does.cross]
    x2 <- x[-1L][does.cross]
    y2 <- y[-1L][does.cross]
    t <- (((x1 - x0) * (y2 - y1) - (y1 - y0) * (x2 - x1))/(xd * (y2 - y1) - yd * 
        (x2 - x1)))
    o <- order(t)
    tsort <- t[o]
    crossings <- cumsum(cross[does.cross][o])
    if (fillOddEven) {
        crossings <- crossings%%2
    }
    drawline <- crossings != 0
    lx <- x0 + xd * tsort
    ly <- y0 + yd * tsort
    lx1 <- lx[-length(lx)][drawline]
    ly1 <- ly[-length(ly)][drawline]
    lx2 <- lx[-1L][drawline]
    ly2 <- ly[-1L][drawline]
    return(data.frame(x = lx1, y = ly1, xend = lx2, yend = ly2))
}
.polygon.fullhatch <- function(x, y, density, angle, width.units, height.units, inches = c(5, 
    1)) {
    x <- c(x, x[1L])
    y <- c(y, y[1L])
    angle <- angle%%180
    upi <- c(width.units, height.units)/inches
    if (upi[1L] < 0) {
        angle <- 180 - angle
    }
    if (upi[2L] < 0) {
        angle <- 180 - angle
    }
    upi <- abs(upi)
    xd <- cos(angle/180 * pi) * upi[1L]
    yd <- sin(angle/180 * pi) * upi[2L]
    hatch.ls <- list()
    i <- 1
    if (angle < 45 || angle > 135) {
        if (angle < 45) {
            first.x <- max(x)
            last.x <- min(x)
        } else {
            first.x <- min(x)
            last.x <- max(x)
        }
        y.shift <- upi[2L]/density/abs(cos(angle/180 * pi))
        x0 <- 0
        y0 <- floor((min(y) - first.x * yd/xd)/y.shift) * y.shift
        y.end <- max(y) - last.x * yd/xd
        while (y0 < y.end) {
            hatch.ls[[i]] <- .polygon.onehatch(x, y, x0, y0, xd, yd)
            i <- i + 1
            y0 <- y0 + y.shift
        }
    } else {
        if (angle < 90) {
            first.y <- max(y)
            last.y <- min(y)
        } else {
            first.y <- min(y)
            last.y <- max(y)
        }
        x.shift <- upi[1L]/density/abs(sin(angle/180 * pi))
        x0 <- floor((min(x) - first.y * xd/yd)/x.shift) * x.shift
        y0 <- 0
        x.end <- max(x) - last.y * xd/yd
        while (x0 < x.end) {
            hatch.ls[[i]] <- .polygon.onehatch(x, y, x0, y0, xd, yd)
            i <- i + 1
            x0 <- x0 + x.shift
        }
    }
    return(do.call("rbind", hatch.ls))
}