Newer
Older
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
vars_to_transform <- NULL
picked_vars_to_transform <- which(names(picked_vars[[i]]) %in% transform_vars)
if (length(picked_vars_to_transform) > 0) {
picked_vars_to_transform <- names(picked_vars[[i]])[picked_vars_to_transform]
vars_to_transform <- c(vars_to_transform, picked_vars[[i]][picked_vars_to_transform])
if (any(picked_vars_to_transform %in% names(picked_vars_ordered[[i]]))) {
picked_vars_ordered_to_transform <- picked_vars_to_transform[which(picked_vars_to_transform %in% names(picked_vars_ordered[[i]]))]
vars_to_transform[picked_vars_ordered_to_transform] <- picked_vars_ordered[[i]][picked_vars_ordered_to_transform]
}
}
picked_common_vars_to_transform <- which(names(picked_common_vars) %in% transform_vars)
if (length(picked_common_vars_to_transform) > 0) {
picked_common_vars_to_transform <- names(picked_common_vars)[picked_common_vars_to_transform]
vars_to_transform <- c(vars_to_transform, picked_common_vars[picked_common_vars_to_transform])
if (any(picked_common_vars_to_transform %in% names(picked_common_vars_ordered))) {
picked_common_vars_ordered_to_transform <- picked_common_vars_to_transform[which(picked_common_vars_to_transform %in% names(picked_common_vars_ordered))]
vars_to_transform[picked_common_vars_ordered_to_transform] <- picked_common_vars_ordered[picked_common_vars_ordered_to_transform]
}
}
work_piece[['vars_to_transform']] <- vars_to_transform
}
# Send flag to load metadata
if (load_file_metadata) {
work_piece[['save_metadata_in']] <- paste0(metadata_folder, '/', metadata_file_counter)
}
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
work_pieces <- c(work_pieces, list(work_piece))
}
}
j <- j + 1
}
}
}
#print("N")
if (debug) {
print("-> WORK PIECES BUILT")
}
# Calculate the progress %s that will be displayed and assign them to
# the appropriate work pieces.
if (length(work_pieces) / num_procs >= 2 && !silent) {
if (length(work_pieces) / num_procs < 10) {
amount <- 100 / ceiling(length(work_pieces) / num_procs)
reps <- ceiling(length(work_pieces) / num_procs)
} else {
amount <- 10
reps <- 10
}
progress_steps <- rep(amount, reps)
if (length(work_pieces) < (reps + 1)) {
selected_pieces <- length(work_pieces)
progress_steps <- c(sum(head(progress_steps, reps)),
tail(progress_steps, reps))
} else {
selected_pieces <- round(seq(1, length(work_pieces),
length.out = reps + 1))[-1]
}
progress_steps <- paste0(' + ', round(progress_steps, 2), '%')
progress_message <- 'Progress: 0%'
} else {
progress_message <- ''
selected_pieces <- NULL
}
piece_counter <- 1
step_counter <- 1
work_pieces <- lapply(work_pieces,
function (x) {
if (piece_counter %in% selected_pieces) {
wp <- c(x, list(progress_amount = progress_steps[step_counter]))
step_counter <<- step_counter + 1
} else {
wp <- x
}
piece_counter <<- piece_counter + 1
wp
})
if (!silent) {
.message("If the size of the requested data is close to or above the free shared RAM memory, R may crash.")
.message("If the size of the requested data is close to or above the half of the free RAM memory, R may crash.")
.message(paste0("Will now proceed to read and process ", length(work_pieces), " data files:"))
if (length(work_pieces) < 30) {
lapply(work_pieces, function (x) .message(x[['file_path']], indent = 2))
} else {
.message("The list of files is long. You can check it after Start() finishes in the output '$Files'.", indent = 2, exdent = 5)
}
}
# Build the cluster of processes that will do the work and dispatch work pieces.
# The function .LoadDataFile is applied to each work piece. This function will
# open the data file, regrid if needed, subset, apply the mask,
# compute and apply the weights if needed,
# disable extreme values and store in the shared memory matrix.
#print("O")
if (!silent) {
.message("Loading... This may take several minutes...")
if (progress_message != '') {
.message(progress_message, appendLF = FALSE)
}
}
if (num_procs == 1) {
found_files <- lapply(work_pieces, .LoadDataFile,
shared_matrix_pointer = shared_matrix_pointer,
file_data_reader = file_data_reader,
transform = transform,
transform_params = transform_params,
silent = silent, debug = debug)
} else {
cluster <- makeCluster(num_procs, outfile = "")
# Send the heavy work to the workers
work_errors <- try({
found_files <- clusterApplyLB(cluster, work_pieces, .LoadDataFile,
shared_matrix_pointer = shared_matrix_pointer,
file_data_reader = file_data_reader,
transform = transform,
transform_params = transform_params,
silent = silent, debug = debug)
})
stopCluster(cluster)
}
if (!silent) {
if (progress_message != '') {
.message("\n", tag = '')
}
}
#print("P")
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
# NOTE: If merge_across_dims = TRUE, there might be additional NAs due to
# unequal inner_dim ('time') length across file_dim ('file_date').
# If merge_across_dims_narm = TRUE, add additional lines to remove these NAs.
# TODO: It currently assumes there is only one '_across'. Add a for loop for the more-than-one case.
if (merge_across_dims_narm) {
# Get the length of these two dimensions in final_dims
length_inner_across_store_dims <- final_dims[across_inner_dim]
length_file_across_store_dims <- final_dims[across_file_dim]
# Create a logical array for merge_across_dims
logi_array <- array(rep(FALSE,
length_file_across_store_dims * length_inner_across_store_dims),
dim = c(length_inner_across_store_dims, length_file_across_store_dims))
for (i in 1:length_file_across_store_dims) { #1:4
logi_array[1:length_inner_across_dim[[i]], i] <- TRUE
}
# First, get the data array with final_dims dimension
data_array_final_dims <- array(bigmemory::as.matrix(data_array), dim = final_dims)
# Change the NA derived from additional spaces to -9999, then remove these -9999
func_remove_blank <- function(data_array, logi_array) {
# dim(data_array) = [time, file_date]
# dim(logi_array) = [time, file_date]
# Change the blank spaces from NA to -9999
data_array[which(!logi_array)] <- -9999
return(data_array)
}
data_array_final_dims <- multiApply::Apply(data_array_final_dims,
target_dims = c(across_inner_dim, across_file_dim), #c('time', 'file_date')
output_dims = c(across_inner_dim, across_file_dim),
fun = func_remove_blank,
logi_array = logi_array)$output1
## reorder back to the correct dim
tmp <- match(names(final_dims), names(dim(data_array_final_dims)))
aho
committed
data_array_final_dims <- .aperm2(data_array_final_dims, tmp)
data_array_tmp <- data_array_final_dims[data_array_final_dims != -9999] # become a vector
data_array <- array(data_array_tmp, dim = final_dims_fake)
} else { # merge_across_dims_narm = F (old version)
data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake)
}
# NOTE: If split_multiselected_dims + merge_across_dims, the dimension order may change above.
# To get the user-required dim order, we need to reorder the array again.
if (split_multiselected_dims & merge_across_dims) {
if (inner_dim_pos_in_split_dims != 1) {
correct_order <- match(names(final_dims_fake_output), names(final_dims_fake))
data_array <- .aperm2(data_array, correct_order)
}
}
# Load metadata and remove the metadata folder
if (!is.null(metadata_dims)) {
loaded_metadata_files <- list.files(metadata_folder)
loaded_metadata <- lapply(paste0(metadata_folder, '/', loaded_metadata_files), readRDS)
unlink(metadata_folder, recursive = TRUE)
return_metadata <- vector('list', length = prod(dim(array_of_metadata_flags)[metadata_dims]))
return_metadata[as.numeric(loaded_metadata_files)] <- loaded_metadata
dim(return_metadata) <- dim(array_of_metadata_flags[metadata_dims])
attr(data_array, 'Variables') <- return_metadata
# TODO: Try to infer data type from loaded_metadata
# as.integer(data_array)
}
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
failed_pieces <- work_pieces[which(unlist(found_files))]
for (failed_piece in failed_pieces) {
array_of_not_found_files <- do.call('[<-',
c(list(array_of_not_found_files),
as.list(failed_piece[['file_indices_in_array_of_files']]),
list(value = TRUE)))
}
if (any(array_of_not_found_files)) {
for (i in 1:prod(dim(array_of_files_to_load))) {
if (is.na(array_of_not_found_files[i])) {
array_of_files_to_load[i] <- NA
} else {
if (array_of_not_found_files[i]) {
array_of_not_found_files[i] <- array_of_files_to_load[i]
array_of_files_to_load[i] <- NA
} else {
array_of_not_found_files[i] <- NA
}
}
}
} else {
array_of_not_found_files <- NULL
}
# Replace the vars and common vars by the transformed vars and common vars
for (i in 1:length(dat)) {
if (length(names(transformed_vars[[i]])) > 0) {
picked_vars[[i]][names(transformed_vars[[i]])] <- transformed_vars[[i]]
} else if (length(names(picked_vars_ordered[[i]])) > 0) {
picked_vars[[i]][names(picked_vars_ordered[[i]])] <- picked_vars_ordered[[i]]
}
}
if (length(names(transformed_common_vars)) > 0) {
picked_common_vars[names(transformed_common_vars)] <- transformed_common_vars
} else if (length(names(picked_common_vars_ordered)) > 0) {
picked_common_vars[names(picked_common_vars_ordered)] <- picked_common_vars_ordered
}
if (debug) {
print("-> THE TRANSFORMED VARS:")
print(str(transformed_vars))
print("-> THE PICKED VARS:")
print(str(picked_vars))
}
file_selectors <- NULL
for (i in 1:length(dat)) {
file_selectors[[dat[[i]][['name']]]] <- dat[[i]][['selectors']][which(names(dat[[i]][['selectors']]) %in% found_file_dims[[i]])]
}
if (retrieve) {
if (!silent) {
.message("Successfully retrieved data.")
}
var_backup <- attr(data_array, 'Variables')[[1]]
attr(data_array, 'Variables') <- NULL
attributes(data_array) <- c(attributes(data_array),
list(Variables = c(list(common = c(picked_common_vars, var_backup)),
picked_vars),
Files = array_of_files_to_load,
NotFoundFiles = array_of_not_found_files,
FileSelectors = file_selectors,
PatternDim = found_pattern_dim)
attr(data_array, 'class') <- c('startR_array', attr(data_array, 'class'))
} else {
if (!silent) {
.message("Successfully discovered data dimensions.")
}
start_call <- match.call()
start_call[[i]] <- eval.parent(start_call[[i]])
start_call[['retrieve']] <- TRUE
attributes(start_call) <- c(attributes(start_call),
Nicolau Manubens
committed
list(Dimensions = final_dims_fake,
Variables = c(list(common = picked_common_vars), picked_vars),
ExpectedFiles = array_of_files_to_load,
PatternDim = found_pattern_dim,
MergedDims = if (merge_across_dims) {
inner_dims_across_files
} else {
NULL
},
SplitDims = if (split_multiselected_dims) {
all_split_dims
} else {
NULL
})
attr(start_call, 'class') <- c('startR_cube', attr(start_call, 'class'))
}
# This function is responsible for loading the data of a single work piece
# and storing it in the shared matrix.
#
# It reads the file named by work_piece[['file_path']] with 'file_data_reader',
# optionally applies 'transform' followed by a second round of subsetting,
# and then writes the resulting values into the big.matrix attached from
# 'shared_matrix_pointer'.
#
# Parameters:
#   work_piece: list describing the piece of work. The elements read in the
#     visible code are 'store_position', 'first_round_indices',
#     'second_round_indices', 'file_path', 'file_selectors',
#     'vars_to_transform', 'store_dims' (debug printing only),
#     'save_metadata_in' and 'progress_amount'.
#   shared_matrix_pointer: passed to bigmemory::attach.big.matrix() to obtain
#     the matrix the data is written into.
#   file_data_reader: function called as
#     file_data_reader(file_path, NULL, file_selectors, first_round_indices,
#     synonims). A NULL result means nothing is stored for this piece.
#   synonims: forwarded unchanged to 'file_data_reader'.
#   transform: NULL, or a function invoked through do.call() with the
#     arguments 'data_array', 'variables' and 'file_selectors' plus
#     'transform_params'. Its result must have a 'data_array' element.
#   transform_params: list of extra arguments appended to the transform call.
#   silent: when FALSE, the piece's 'progress_amount' (if any) is emitted
#     with message().
#   debug: when TRUE, prints diagnostics for the piece whose first six store
#     indices are all 1.
#
# Returns: TRUE if 'file_data_reader' returned NULL, FALSE otherwise. The
#   caller selects the pieces that returned TRUE as failed pieces.
#
# NOTE(review): this text contains interleaved non-code residue (bare numbers,
# an author name, "committed") that looks like it comes from a web blame/diff
# view, and a few statements appear to be missing (see the notes below).
# Confirm against the upstream file before relying on this copy.
.LoadDataFile <- function(work_piece, shared_matrix_pointer,
file_data_reader, synonims,
transform, transform_params,
silent = FALSE, debug = FALSE) {
# suppressPackageStartupMessages({library(bigmemory)})
### TODO: Specify dependencies as parameter
# suppressPackageStartupMessages({library(ncdf4)})
#print("1")
# Unpack the position in the store and both rounds of indices.
store_indices <- as.list(work_piece[['store_position']])
first_round_indices <- work_piece[['first_round_indices']]
second_round_indices <- work_piece[['second_round_indices']]
#print("2")
# Read the requested subset from the file using the first round of indices.
file_to_open <- work_piece[['file_path']]
sub_array <- file_data_reader(file_to_open, NULL,
work_piece[['file_selectors']],
first_round_indices, synonims)
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
# Debug output is printed only for the piece whose first six store indices
# are all 1, so that it appears once rather than for every work piece.
if (debug) {
if (all(unlist(store_indices[1:6]) == 1)) {
print("-> LOADING A WORK PIECE")
print("-> STRUCTURE OF READ UNTRANSFORMED DATA:")
print(str(sub_array))
print("-> STRUCTURE OF VARIABLES TO TRANSFORM:")
print(str(work_piece[['vars_to_transform']]))
print("-> COMMON ARRAY DIMENSIONS:")
print(str(work_piece[['store_dims']]))
}
}
# Everything below is skipped when the reader returned NULL.
if (!is.null(sub_array)) {
# Apply data transformation once we have the data arrays.
if (!is.null(transform)) {
if (debug) {
if (all(unlist(store_indices[1:6]) == 1)) {
print("-> PROCEEDING TO TRANSFORM ARRAY")
print("-> DIMENSIONS OF ARRAY RIGHT BEFORE TRANSFORMING:")
print(dim(sub_array))
}
}
# The transform receives the data, the variables to transform and the file
# selectors, followed by the user-supplied 'transform_params'.
sub_array <- do.call(transform, c(list(data_array = sub_array,
variables = work_piece[['vars_to_transform']],
file_selectors = work_piece[['file_selectors']]),
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
transform_params))
if (debug) {
if (all(unlist(store_indices[1:6]) == 1)) {
print("-> STRUCTURE OF ARRAY AND VARIABLES RIGHT AFTER TRANSFORMING:")
print(str(sub_array))
print("-> DIMENSIONS OF ARRAY RIGHT AFTER TRANSFORMING:")
print(dim(sub_array$data_array))
}
}
# Keep only the transformed data; the rest of the transform result is dropped.
sub_array <- sub_array$data_array
# Subset with second round of indices
# Only the dimensions that have non-NULL second round indices are cropped.
dims_to_crop <- which(!sapply(second_round_indices, is.null))
if (length(dims_to_crop) > 0) {
dimnames_to_crop <- names(second_round_indices)[dims_to_crop]
sub_array <- Subset(sub_array, dimnames_to_crop,
second_round_indices[dimnames_to_crop])
}
if (debug) {
if (all(unlist(store_indices[1:6]) == 1)) {
print("-> STRUCTURE OF ARRAY AND VARIABLES RIGHT AFTER SUBSETTING WITH 2nd ROUND INDICES:")
print(str(sub_array))
}
}
}
# Metadata attached to the array; saved at the end if the work piece asks for it.
metadata <- attr(sub_array, 'variables')
Nicolau Manubens
committed
# lapply() over the names returns an unnamed list, so the names are backed up
# here and restored afterwards.
names_bk <- names(store_indices)
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
# Build, for each stored dimension, the indices to write to:
#  - dimensions not in 'first_round_indices' keep their store position;
#  - dimensions with NULL second round indices span the whole extent of
#    'sub_array' along that dimension;
#  - otherwise the span is 1 to the number of second round indices (both
#    branches of the is.numeric() test currently yield the same value).
store_indices <- lapply(names(store_indices),
function (x) {
if (!(x %in% names(first_round_indices))) {
store_indices[[x]]
} else if (is.null(second_round_indices[[x]])) {
1:dim(sub_array)[x]
} else {
if (is.numeric(second_round_indices[[x]])) {
## TODO: Review carefully this line. Inner indices are all
## aligned to the left-most positions. If dataset A has longitudes
## 1, 2, 3, 4 but dataset B has only longitudes 3 and 4, then
## they will be stored as follows:
## 1, 2, 3, 4
## 3, 4, NA, NA
##x - min(x) + 1
1:length(second_round_indices[[x]])
} else {
1:length(second_round_indices[[x]])
}
}
})
Nicolau Manubens
committed
names(store_indices) <- names_bk
# NOTE(review): in this text the prints below are unconditional and the two
# closing braces after them have no visible opening brace. Presumably an
# enclosing debug condition was lost in extraction -- confirm against upstream.
print("-> STRUCTURE OF FIRST ROUND INDICES FOR THIS WORK PIECE:")
print(str(first_round_indices))
print("-> STRUCTURE OF SECOND ROUND INDICES FOR THIS WORK PIECE:")
print(str(second_round_indices))
print("-> STRUCTURE OF STORE INDICES FOR THIS WORK PIECE:")
print(str(store_indices))
}
}
Nicolau Manubens
committed
store_indices <- lapply(store_indices, as.integer)
Nicolau Manubens
committed
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
# NOTE(review): 'store_dims' is used below but is not assigned anywhere in the
# visible text of this function. Presumably it is taken from
# work_piece[['store_dims']] -- confirm against upstream.
# split the storage work of the loaded subset in parts
# The split is done along the largest dimension of 'sub_array', so there
# cannot be more parts than store indices along that dimension.
largest_dim_name <- names(dim(sub_array))[which.max(dim(sub_array))]
max_parts <- length(store_indices[[largest_dim_name]])
# Indexing a data file of N MB with expand.grid takes 30*N MB
# The peak ram of Start is, minimum, 2 * total data to load from all files
# due to inefficiencies in other regions of the code
# The more parts we split the indexing done below in, the lower
# the memory footprint of the indexing and the faster it is.
# But more than 10 indexing iterations (parts) for each MB processed
# makes the iteration slower (tested empirically on BSC workstations).
# The size estimate assumes 8 bytes per element.
subset_size_in_mb <- prod(dim(sub_array)) * 8 / 1024 / 1024
best_n_parts <- ceiling(subset_size_in_mb * 10)
# We want to set n_parts to a greater value than the one that would
# result in a memory footprint (of the subset indexing code below) equal
# to 2 * total data to load from all files.
# s = subset size in MB
# p = number of parts to break it in
# T = total size of data to load
# then, s / p * 30 = 2 * T
# then, p = s * 15 / T
min_n_parts <- ceiling(prod(dim(sub_array)) * 15 / prod(store_dims))
# Make sure we pick n_parts much greater than the minimum calculated
n_parts <- min_n_parts * 10
if (n_parts > best_n_parts) {
n_parts <- best_n_parts
}
# Boundary checks
if (n_parts < 1) {
n_parts <- 1
}
if (n_parts > max_parts) {
n_parts <- max_parts
}
if (n_parts > 1) {
# Splits 1:length into n groups with cut() and returns, for each group, the
# positions that fall in it.
make_parts <- function(length, n) {
clusters <- cut(1:length, n, labels = FALSE)
lapply(1:n, function(y) which(clusters == y))
}
# 'part_indices' are positions along the largest dimension of 'sub_array';
# 'parts' are the matching store indices for each part.
part_indices <- make_parts(max_parts, n_parts)
parts <- lapply(part_indices,
function(x) {
store_indices[[largest_dim_name]][x]
})
} else {
part_indices <- list(1:max_parts)
parts <- store_indices[largest_dim_name]
}
Nicolau Manubens
committed
# do the storage work
# weights[i] is the product of the first i - 1 store dimensions, i.e. the
# step in the linear (vector) index for a unit step along dimension i.
weights <- sapply(1:length(store_dims),
function(i) prod(c(1, store_dims)[1:i]))
# Start by selecting everything (TRUE) along every dimension of 'sub_array';
# only the largest dimension is narrowed inside the loop.
part_indices_in_sub_array <- as.list(rep(TRUE, length(dim(sub_array))))
names(part_indices_in_sub_array) <- names(dim(sub_array))
data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer)
for (i in 1:n_parts) {
store_indices[[largest_dim_name]] <- parts[[i]]
# Converting array indices to vector indices
matrix_indices <- do.call("expand.grid", store_indices)
# Given a matrix where each row is a set of array indices of an element
# the vector indices are computed
matrix_indices <- 1 + colSums(t(matrix_indices - 1) * weights)
# Write the slice of 'sub_array' for this part into the computed positions.
part_indices_in_sub_array[[largest_dim_name]] <- part_indices[[i]]
data_array[matrix_indices] <- as.vector(do.call('[',
c(list(x = sub_array),
part_indices_in_sub_array)))
}
# Drop the local reference to the attached matrix and run the garbage collector.
rm(data_array)
gc()
# Save the metadata only if the work piece carries a destination path.
if (!is.null(work_piece[['save_metadata_in']])) {
saveRDS(metadata, file = work_piece[['save_metadata_in']])
}
}
# Emit this piece's progress increment, if it has one.
if (!is.null(work_piece[['progress_amount']]) && !silent) {
message(work_piece[['progress_amount']], appendLF = FALSE)
}
# TRUE when the reader returned NULL, FALSE otherwise.
is.null(sub_array)
}