Newer
Older
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
start_call[['retrieve']] <- TRUE
attributes(start_call) <- c(attributes(start_call),
list(Dimensions = final_dims_fake,
Variables = c(list(common = picked_common_vars), picked_vars),
ExpectedFiles = array_of_files_to_load,
FileSelectors = file_selectors,
PatternDim = found_pattern_dim,
MergedDims = if (merge_across_dims) {
inner_dims_across_files
} else {
NULL
},
SplitDims = if (split_multiselected_dims) {
all_split_dims
} else {
NULL
})
)
attr(start_call, 'class') <- c('startR_cube', attr(start_call, 'class'))
start_call
}
}
# This function is the responsible for loading the data of each work
# piece.
.LoadDataFile <- function(work_piece, shared_matrix_pointer,
file_data_reader, synonims,
transform, transform_params, transform_crop_domain = NULL,
nperez
committed
#warning(attr(shared_matrix_pointer, 'description')$sharedName)
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
# suppressPackageStartupMessages({library(bigmemory)})
### TODO: Specify dependencies as parameter
# suppressPackageStartupMessages({library(ncdf4)})
#print("1")
store_indices <- as.list(work_piece[['store_position']])
first_round_indices <- work_piece[['first_round_indices']]
second_round_indices <- work_piece[['second_round_indices']]
#print("2")
file_to_open <- work_piece[['file_path']]
sub_array <- file_data_reader(file_to_open, NULL,
work_piece[['file_selectors']],
first_round_indices, synonims)
if (debug) {
if (all(unlist(store_indices[1:6]) == 1)) {
print("-> LOADING A WORK PIECE")
print("-> STRUCTURE OF READ UNTRANSFORMED DATA:")
print(str(sub_array))
print("-> STRUCTURE OF VARIABLES TO TRANSFORM:")
print(str(work_piece[['vars_to_transform']]))
print("-> COMMON ARRAY DIMENSIONS:")
print(str(work_piece[['store_dims']]))
}
}
if (!is.null(sub_array)) {
# Apply data transformation once we have the data arrays.
if (!is.null(transform)) {
if (debug) {
if (all(unlist(store_indices[1:6]) == 1)) {
print("-> PROCEEDING TO TRANSFORM ARRAY")
print("-> DIMENSIONS OF ARRAY RIGHT BEFORE TRANSFORMING:")
print(dim(sub_array))
}
}
sub_array <- do.call(transform, c(list(data_array = sub_array,
variables = work_piece[['vars_to_transform']],
file_selectors = work_piece[['file_selectors']],
crop_domain = transform_crop_domain),
transform_params))
if (debug) {
if (all(unlist(store_indices[1:6]) == 1)) {
print("-> STRUCTURE OF ARRAY AND VARIABLES RIGHT AFTER TRANSFORMING:")
print(str(sub_array))
print("-> DIMENSIONS OF ARRAY RIGHT AFTER TRANSFORMING:")
print(dim(sub_array$data_array))
}
}
sub_array <- sub_array$data_array
# Subset with second round of indices
dims_to_crop <- which(!sapply(second_round_indices, is.null))
if (length(dims_to_crop) > 0) {
dimnames_to_crop <- names(second_round_indices)[dims_to_crop]
sub_array <- ClimProjDiags::Subset(sub_array, dimnames_to_crop,
second_round_indices[dimnames_to_crop])
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
}
if (debug) {
if (all(unlist(store_indices[1:6]) == 1)) {
print("-> STRUCTURE OF ARRAY AND VARIABLES RIGHT AFTER SUBSETTING WITH 2nd ROUND INDICES:")
print(str(sub_array))
}
}
}
metadata <- attr(sub_array, 'variables')
names_bk <- names(store_indices)
store_indices <- lapply(names(store_indices),
function (x) {
if (!(x %in% names(first_round_indices))) {
store_indices[[x]]
} else if (is.null(second_round_indices[[x]])) {
1:dim(sub_array)[x]
} else {
if (is.numeric(second_round_indices[[x]])) {
## TODO: Review carefully this line. Inner indices are all
## aligned to the left-most positions. If dataset A has longitudes
## 1, 2, 3, 4 but dataset B has only longitudes 3 and 4, then
## they will be stored as follows:
## 1, 2, 3, 4
## 3, 4, NA, NA
##x - min(x) + 1
1:length(second_round_indices[[x]])
} else {
1:length(second_round_indices[[x]])
}
}
})
names(store_indices) <- names_bk
if (debug) {
if (all(unlist(store_indices) == 1)) {
print("-> STRUCTURE OF FIRST ROUND INDICES FOR THIS WORK PIECE:")
print(str(first_round_indices))
print("-> STRUCTURE OF SECOND ROUND INDICES FOR THIS WORK PIECE:")
print(str(second_round_indices))
print("-> STRUCTURE OF STORE INDICES FOR THIS WORK PIECE:")
print(str(store_indices))
}
}
store_indices <- lapply(store_indices, as.integer)
store_dims <- work_piece[['store_dims']]
# split the storage work of the loaded subset in parts
largest_dim_name <- names(dim(sub_array))[which.max(dim(sub_array))]
max_parts <- length(store_indices[[largest_dim_name]])
# Indexing a data file of N MB with expand.grid takes 30*N MB
# The peak ram of Start is, minimum, 2 * total data to load from all files
# due to inefficiencies in other regions of the code
# The more parts we split the indexing done below in, the lower
# the memory footprint of the indexing and the fast.
# But more than 10 indexing iterations (parts) for each MB processed
# makes the iteration slower (tested empirically on BSC workstations).
subset_size_in_mb <- prod(dim(sub_array)) * 8 / 1024 / 1024
best_n_parts <- ceiling(subset_size_in_mb * 10)
# We want to set n_parts to a greater value than the one that would
# result in a memory footprint (of the subset indexing code below) equal
# to 2 * total data to load from all files.
# s = subset size in MB
# p = number of parts to break it in
# T = total size of data to load
# then, s / p * 30 = 2 * T
# then, p = s * 15 / T
min_n_parts <- ceiling(prod(dim(sub_array)) * 15 / prod(store_dims))
# Make sure we pick n_parts much greater than the minimum calculated
n_parts <- min_n_parts * 10
if (n_parts > best_n_parts) {
n_parts <- best_n_parts
}
# Boundary checks
if (n_parts < 1) {
n_parts <- 1
}
if (n_parts > max_parts) {
n_parts <- max_parts
}
if (n_parts > 1) {
make_parts <- function(length, n) {
clusters <- cut(1:length, n, labels = FALSE)
lapply(1:n, function(y) which(clusters == y))
}
part_indices <- make_parts(max_parts, n_parts)
parts <- lapply(part_indices,
function(x) {
store_indices[[largest_dim_name]][x]
})
} else {
part_indices <- list(1:max_parts)
parts <- store_indices[largest_dim_name]
}
# do the storage work
weights <- sapply(1:length(store_dims),
function(i) prod(c(1, store_dims)[1:i]))
part_indices_in_sub_array <- as.list(rep(TRUE, length(dim(sub_array))))
names(part_indices_in_sub_array) <- names(dim(sub_array))
data_array <- bigmemory::attach.big.matrix(shared_matrix_pointer)
for (i in 1:n_parts) {
store_indices[[largest_dim_name]] <- parts[[i]]
# Converting array indices to vector indices
matrix_indices <- do.call("expand.grid", store_indices)
# Given a matrix where each row is a set of array indices of an element
# the vector indices are computed
matrix_indices <- 1 + colSums(t(matrix_indices - 1) * weights)
part_indices_in_sub_array[[largest_dim_name]] <- part_indices[[i]]
data_array[matrix_indices] <- as.vector(do.call('[',
c(list(x = sub_array),
part_indices_in_sub_array)))
}
rm(data_array)
gc()
if (!is.null(work_piece[['save_metadata_in']])) {
saveRDS(metadata, file = work_piece[['save_metadata_in']])
}
}
if (!is.null(work_piece[['progress_amount']]) && !silent) {
message(work_piece[['progress_amount']], appendLF = FALSE)
}
is.null(sub_array)
}