Newer
Older
which(transformed_subset_var >= min(unlist(sub_sub_array_of_values)) &
transformed_subset_var <= max(unlist(sub_sub_array_of_values)))
# if it's 1st chunk & the first sri is not included, include it.
if (chunks[[inner_dim]]["chunk"] == 1 &
!(sub_array_of_sri_complete[1] %in% sub_array_of_sri)) {
sub_array_of_sri <- c(sub_array_of_sri_complete[1], sub_array_of_sri)
}
# if it's last chunk & the last sri is not included, include it.
if (chunks[[inner_dim]]["chunk"] == chunks[[inner_dim]]["n_chunks"] &
!(tail(sub_array_of_sri_complete, 1) %in% sub_array_of_sri)) {
sub_array_of_sri <- c(sub_array_of_sri, tail(sub_array_of_sri_complete, 1))
}
# Check if sub_array_of_sri perfectly connects to the previous sri.
# If not, include the previous sri.
#NOTE 1: don't know if the transform for the previous sri is
# correct or not.
#NOTE 2: If crop = T, sub_array_of_sri always starts from 1.
# Don't know if the cropping will miss some sri or not.
if (sub_array_of_sri[1] != 1) {
if (!is.null(previous_sub_sub_array_of_values)) {
# if decreasing = F
if (transformed_subset_var[1] < transformed_subset_var[2]) {
previous_sri <- max(which(transformed_subset_var <= previous_sub_sub_array_of_values))
} else {
# if decreasing = T
previous_sri <- max(which(transformed_subset_var >= previous_sub_sub_array_of_values))
if (previous_sri + 1 != sub_array_of_sri[1]) {
sub_array_of_sri <- (previous_sri + 1):sub_array_of_sri[length(sub_array_of_sri)]
}
} else { # is vector
tmp <- which(transformed_subset_var >= min(sub_sub_array_of_values) &
transformed_subset_var <= max(sub_sub_array_of_values))
# Ensure tmp and sub_array_of_sri are both ascending or descending
if (is.unsorted(tmp) != is.unsorted(sub_array_of_sri)) {
tmp <- rev(tmp)
}
# Include first or last sri if tmp doesn't have it. It's only for
# "vectors" because vectors look for the closest value.
#NOTE: The condition here is not correct. The criterion should be
# 'vector' instead of indices.
if (chunks[[inner_dim]]["chunk"] == 1) {
sub_array_of_sri <- unique(c(sub_array_of_sri[1], tmp))
} else if (chunks[[inner_dim]]["chunk"] ==
chunks[[inner_dim]]["n_chunks"]) { # last chunk
sub_array_of_sri <- unique(c(tmp, sub_array_of_sri[length(sub_array_of_sri)]))
} else {
sub_array_of_sri <- tmp
}
# Check if sub_array_of_sri perfectly connects to the previous sri.
# If not, include the previous sri.
#NOTE 1: don't know if the transform for the previous sri is
# correct or not.
#NOTE 2: If crop = T, sub_array_of_sri always starts from 1.
# Don't know if the cropping will miss some sri or not.
if (sub_array_of_sri[1] != 1) {
if (!is.null(previous_sub_sub_array_of_values)) {
# if decreasing = F
if (transformed_subset_var[1] < transformed_subset_var[2]) {
previous_sri <- max(which(transformed_subset_var <= previous_sub_sub_array_of_values))
} else {
# if decreasing = T
previous_sri <- max(which(transformed_subset_var >= previous_sub_sub_array_of_values))
}
if (previous_sri + 1 != which(sub_array_of_sri[1] == sub_array_of_sri_complete)) {
sub_array_of_sri <- (previous_sri + 1):sub_array_of_sri[length(sub_array_of_sri)]
ordered_sri <- sub_array_of_sri
sub_array_of_sri <- transformed_subset_var_unorder[sub_array_of_sri]
###########################old##################################
# if (chunks[[inner_dim]]["n_chunks"] > 1) {
# tmp <- which(transformed_subset_var >= min(sub_sub_array_of_values) &
# transformed_subset_var <= max(sub_sub_array_of_values))
# sub_array_of_sri <- sub_array_of_sri[tmp]
# }
################################################################
# In this case, the tvi are not defined and the 'transformed_subset_var'
# will be taken instead of the var transformed before in the code.
if (debug) {
if (inner_dim %in% dims_to_check) {
print("-> FIRST INDEX:")
# print(first_index)
print("NOTE: Check function generate_sub_array_of_fri() in zzz.R")
# print(last_index)
print("NOTE: Check function generate_sub_array_of_fri() in zzz.R")
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
print("-> STRUCTURE OF FIRST ROUND INDICES:")
print(str(sub_array_of_fri))
print("-> STRUCTURE OF SECOND ROUND INDICES:")
print(str(sub_array_of_sri))
print("-> STRUCTURE OF TRANSFORMED VARIABLE INDICES:")
print(str(tvi))
}
}
### # If the selectors are expressed after transformation
### } else {
###if (debug) {
###if (inner_dim %in% dims_to_check) {
###print("-> SELECTORS REQUESTED AFTER TRANSFORM.")
###}
###}
### if (goes_across_prime_meridian) {
### sub_array_of_indices <- c(sub_array_of_indices[[1]]:m,
### 1:sub_array_of_indices[[2]])
### }
### first_index <- min(unlist(sub_array_of_indices))
### last_index <- max(unlist(sub_array_of_indices))
### first_index_before_transform <- max(transform_indices(first_index, m, n) - beta, 1)
### last_index_before_transform <- min(transform_indices(last_index, m, n) + beta, n)
### sub_array_of_fri <- first_index_before_transform:last_index_before_transform
### n_of_extra_cells <- round(beta / n * m)
### if (is.list(sub_array_of_indices) && (length(sub_array_of_indices) > 1)) {
### sub_array_of_sri <- 1:(last_index - first_index + 1)
### if (is.null(tvi)) {
### tvi <- sub_array_of_sri + first_index - 1
### }
### } else {
### sub_array_of_sri <- sub_array_of_indices - first_index + 1
### if (is.null(tvi)) {
### tvi <- sub_array_of_indices
### }
### }
### sub_array_of_sri <- sub_array_of_sri + n_of_extra_cells
sri <- do.call('[[<-', c(list(x = sri), as.list(selector_store_position),
list(value = sub_array_of_sri)))
} else { # !with_transform
sub_array_of_fri <- generate_sub_array_of_fri(
with_transform, goes_across_prime_meridian, sub_array_of_indices, n, beta,
is_circular_dim)
# Reorder sub_array_of_fri if reordering function is used.
# Until here it was the index in the assigned order (e.g., in [-180, 180] if
# CircularSort(-180, 180) is used); here it is changed to the index in the
# original order.
if (!is.null(var_unorder_indices)) {
if (is.null(ordered_fri)) {
ordered_fri <- sub_array_of_fri
}
sub_array_of_fri <- var_unorder_indices[sub_array_of_fri]
}
fri <- do.call('[[<-', c(list(x = fri), as.list(selector_store_position),
list(value = sub_array_of_fri)))
if (!is.null(file_dim)) {
taken_chunks[selector_store_position[[file_dim]]] <- TRUE
} else {
taken_chunks <- TRUE
}
}
} else {
# The inner dim goes across a file dim (e.g., time_across = 'sdate')
if (debug) {
if (inner_dim %in% dims_to_check) {
print("-> THE INNER DIMENSION GOES ACROSS A FILE DIMENSION.")
}
}
# If "<inner_dim>_across = <file_dim> + merge_across_dims = FALSE + chunk over <inner_dim>", return error because this instance is not logically correct.
if (chunks[[inner_dim]]["n_chunks"] > 1 & inner_dim %in% inner_dims_across_files) {
stop("Chunk over dimension '", inner_dim, "' is not allowed because '",
inner_dim, "' is across '",
names(inner_dims_across_files)[which(inner_dim %in% inner_dims_across_files)], "'.")
}
if (inner_dim %in% names(dim(sub_array_of_selectors))) {
if (is.null(var_with_selectors_name)) {
if (!largest_dims_length | (largest_dims_length & length(unique(inner_dim_lengths)) <= 1)) { #old code
maximal_indice <- data_dims[inner_dim] * chunk_amount
} else { # files have different length of inner dim
maximal_indice <- sum(inner_dim_lengths)
}
any(na.omit(unlist(sub_array_of_selectors)) > maximal_indice)) {
stop("Provided indices out of range for dimension '", inner_dim, "' ",
"for dataset '", dat[[i]][['name']], "' (accepted range: 1 to ",
maximal_indice, ").")
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
}
} else {
if (inner_dim %in% names(dim(sub_array_of_values))) {
# NOTE: Put across-inner-dim at the 1st position.
# POSSIBLE PROB!! Only the inner dim is reorganized; the remaining dims may not be in the same order as sub_array_of_selectors below.
inner_dim_pos_in_sub_array <- which(names(dim(sub_array_of_values)) == inner_dim)
if (inner_dim_pos_in_sub_array != 1) {
new_sub_array_order <- (1:length(dim(sub_array_of_values)))[-inner_dim_pos_in_sub_array]
new_sub_array_order <- c(inner_dim_pos_in_sub_array, new_sub_array_order)
sub_array_of_values <- .aperm2(sub_array_of_values, new_sub_array_order)
}
}
}
# NOTE: Put across-inner-dim at the 1st position.
# POSSIBLE PROB!! Only the inner dim is reorganized; the remaining dims may not be in the same order as sub_array_of_values above.
inner_dim_pos_in_sub_array <- which(names(dim(sub_array_of_selectors)) == inner_dim)
if (inner_dim_pos_in_sub_array != 1) {
new_sub_array_order <- (1:length(dim(sub_array_of_selectors)))[-inner_dim_pos_in_sub_array]
new_sub_array_order <- c(inner_dim_pos_in_sub_array, new_sub_array_order)
sub_array_of_selectors <- .aperm2(sub_array_of_selectors, new_sub_array_order)
}
sub_array_of_indices <- selector_checker(sub_array_of_selectors, sub_array_of_values,
tolerance = tolerance_params[[inner_dim]])
# It is needed to expand the indices here, otherwise for
# values(list(date1, date2)) only 2 values are picked.
if (is.list(sub_array_of_indices)) {
sub_array_of_indices <- sub_array_of_indices[[1]]:sub_array_of_indices[[2]]
}
sub_array_of_indices <- sub_array_of_indices[chunk_indices(length(sub_array_of_indices),
chunks[[inner_dim]]['chunk'],
chunks[[inner_dim]]['n_chunks'],
inner_dim)]
sub_array_is_list <- FALSE
if (is.list(sub_array_of_indices)) {
sub_array_is_list <- TRUE
sub_array_of_indices <- unlist(sub_array_of_indices)
}
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
# "indices_chunk" records which file each element of
# sub_array_of_indices falls in; "transformed_indices"
# holds the index of each element within its own file.
if (!largest_dims_length |
(largest_dims_length & length(unique(inner_dim_lengths)) <= 1)) {
# old code; all the files have the same length of inner_dim
if (is.null(var_with_selectors_name)) {
indices_chunk <- floor((sub_array_of_indices - 1) / data_dims[inner_dim]) + 1
transformed_indices <- ((sub_array_of_indices - 1) %% data_dims[inner_dim]) + 1
} else {
indices_chunk <- floor((sub_array_of_indices - 1) / var_full_dims[inner_dim]) + 1
transformed_indices <- ((sub_array_of_indices - 1) %% var_full_dims[inner_dim]) + 1
}
} else { # files have different inner dim length
indices_chunk <- c()
for (item in 1:length(inner_dim_lengths)) {
tmp <- which(sub_array_of_indices <= cumsum(inner_dim_lengths)[item])
indices_chunk <- c(indices_chunk, rep(item, length(tmp) - length(indices_chunk)))
}
sub_array_of_indices_by_file <- split(sub_array_of_indices, indices_chunk)
for (item in 2:length(sub_array_of_indices_by_file)) {
sub_array_of_indices_by_file[[item]] <- sub_array_of_indices_by_file[[item]] - cumsum(inner_dim_lengths)[item - 1]
}
transformed_indices <- unlist(sub_array_of_indices_by_file, use.names = FALSE)
if (sub_array_is_list) {
sub_array_of_indices <- as.list(sub_array_of_indices)
}
if (debug) {
if (inner_dim %in% dims_to_check) {
print("-> GOING TO ITERATE ALONG CHUNKS.")
}
}
for (chunk in 1:chunk_amount) {
if (!is.null(names(selector_store_position))) {
selector_store_position[file_dim] <- chunk
} else {
selector_store_position <- chunk
}
sub_array_of_indices <- transformed_indices[which(indices_chunk == chunk)]
sub_array_of_indices <- transformed_indices[which(indices_chunk == chunk)]
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
if (with_transform) {
# If the provided selectors are expressed in the world
# before transformation
if (!aiat) {
first_index <- min(unlist(sub_array_of_indices))
last_index <- max(unlist(sub_array_of_indices))
sub_array_of_fri <- max(c(first_index - beta, 1)):min(c(last_index + beta, n))
sub_array_of_sri <- transform_indices(unlist(sub_array_of_indices) - first_index + 1, n, m)
if (is.list(sub_array_of_indices)) {
if (length(sub_array_of_sri) > 1) {
sub_array_of_sri <- sub_array_of_sri[[1]]:sub_array_of_sri[[2]]
}
}
##TODO: TRANSFORM SUBSET VARIABLE AS ABOVE, TO COMPUTE SRI
# If the selectors are expressed after transformation
} else {
first_index <- min(unlist(sub_array_of_indices))
last_index <- max(unlist(sub_array_of_indices))
first_index_before_transform <- max(transform_indices(first_index, m, n) - beta, 1)
last_index_before_transform <- min(transform_indices(last_index, m, n) + beta, n)
sub_array_of_fri <- first_index_before_transform:last_index_before_transform
if (is.list(sub_array_of_indices) && (length(sub_array_of_indices) > 1)) {
sub_array_of_sri <- 1:(last_index - first_index + 1) +
round(beta / n * m)
} else {
sub_array_of_sri <- sub_array_of_indices - first_index + 1 +
round(beta / n * m)
}
##TODO: FILL IN TVI
}
sri <- do.call('[[<-', c(list(x = sri), as.list(selector_store_position),
list(value = sub_array_of_sri)))
if (length(sub_array_of_sri) > 0) {
taken_chunks[chunk] <- TRUE
}
} else {
sub_array_of_fri <- sub_array_of_indices
if (length(sub_array_of_fri) > 0) {
taken_chunks[chunk] <- TRUE
}
}
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
if (!is.null(var_unorder_indices)) {
ordered_fri <- sub_array_of_fri
sub_array_of_fri <- var_unorder_indices[sub_array_of_fri]
}
fri <- do.call('[[<-', c(list(x = fri), as.list(selector_store_position),
list(value = sub_array_of_fri)))
}
if (debug) {
if (inner_dim %in% dims_to_check) {
print("-> FINISHED ITERATING ALONG CHUNKS")
}
}
} else {
stop("Provided array of indices for dimension '", inner_dim, "', ",
"which goes across the file dimension '", file_dim, "', but ",
"the provided array does not have the dimension '", inner_dim,
"', which is mandatory.")
}
}
}
}
if (debug) {
if (inner_dim %in% dims_to_check) {
print("-> PROCEEDING TO CROP VARIABLES")
}
}
#if ((length(selector_array) == 1) && (selector_array %in% c('all', 'first', 'last'))) {
#if (!is.null(var_with_selectors_name) || (is.null(var_with_selectors_name) && is.character(selector_array) &&
# (length(selector_array) == 1) && (selector_array %in% c('all', 'first', 'last')))) {
empty_chunks <- which(!taken_chunks)
if (length(empty_chunks) >= length(taken_chunks)) {
stop("Selectors do not match any of the possible values for the dimension '", inner_dim, "'.")
}
if (length(empty_chunks) > 0) {
# # Get the first group of chunks to remove, and remove them.
# # E.g., from c(1, 2, 4, 5, 6, 8, 9) remove only 1 and 2
# dist <- abs(rev(empty_chunks) - c(rev(empty_chunks)[1] - 1, head(rev(empty_chunks), length(rev(empty_chunks)) - 1)))
# if (all(dist == 1)) {
# start_chunks_to_remove <- NULL
# } else {
# first_chunk_to_remove <- tail(which(dist > 1), 1)
# start_chunks_to_remove <- rev(rev(empty_chunks)[first_chunk_to_remove:length(empty_chunks)])
# }
# # Get the last group of chunks to remove, and remove them.
# # E.g., from c(1, 2, 4, 5, 6, 8, 9) remove only 8 and 9
# dist <- abs(empty_chunks - c(empty_chunks[1] - 1, head(empty_chunks, length(empty_chunks) - 1)))
# if (all(dist == 1)) {
# first_chunk_to_remove <- 1
# } else {
# first_chunk_to_remove <- tail(which(dist > 1), 1)
# }
# end_chunks_to_remove <- empty_chunks[first_chunk_to_remove:length(empty_chunks)]
# chunks_to_keep <- which(!((1:length(taken_chunks)) %in% c(start_chunks_to_remove, end_chunks_to_remove)))
chunks_to_keep <- which(taken_chunks)
dims_to_crop[[file_dim]] <- c(dims_to_crop[[file_dim]], list(chunks_to_keep))
# found_indices <- Subset(found_indices, file_dim, chunks_to_keep)
# # Crop dataset variables file dims.
# for (picked_var in names(picked_vars[[i]])) {
# if (file_dim %in% names(dim(picked_vars[[i]][[picked_var]]))) {
# picked_vars[[i]][[picked_var]] <- Subset(picked_vars[[i]][[picked_var]], file_dim, chunks_to_keep)
# }
# }
}
#}
dat[[i]][['selectors']][[inner_dim]] <- list(fri = fri, sri = sri)
# Crop dataset variables inner dims.
# Crop common variables inner dims.
types_of_var_to_crop <- 'picked'
if (with_transform) {
types_of_var_to_crop <- c(types_of_var_to_crop, 'transformed')
}
if (!is.null(dim_reorder_params[[inner_dim]])) {
types_of_var_to_crop <- c(types_of_var_to_crop, 'reordered')
}
for (type_of_var_to_crop in types_of_var_to_crop) {
if (type_of_var_to_crop == 'transformed') {
if (is.null(tvi)) {
if (!is.null(dim_reorder_params[[inner_dim]])) {
crop_indices <- unique(unlist(ordered_sri))
} else {
crop_indices <- unique(unlist(sri))
}
} else {
crop_indices <- unique(unlist(tvi))
}
vars_to_crop <- transformed_vars[[i]]
common_vars_to_crop <- transformed_common_vars
} else if (type_of_var_to_crop == 'reordered') {
crop_indices <- unique(unlist(ordered_fri))
vars_to_crop <- picked_vars_ordered[[i]]
common_vars_to_crop <- picked_common_vars_ordered
} else {
#TODO: If fri has different indices in each list, the crop_indices should be
# separated for each list. Otherwise, picked_common_vars later will be wrong.
crop_indices <- unique(unlist(fri))
vars_to_crop <- picked_vars[[i]]
common_vars_to_crop <- picked_common_vars
}
for (var_to_crop in names(vars_to_crop)) {
if (inner_dim %in% names(dim(vars_to_crop[[var_to_crop]]))) {
if (!is.null(crop_indices)) {
if (type_of_var_to_crop == 'transformed') {
if (!aiat) {
if (!(length(selector_array) == 1 &
selector_array %in% c('all', 'first', 'last'))) {
vars_to_crop[[var_to_crop]] <- Subset(transformed_subset_var, inner_dim, crop_indices)
aho
committed
} else {
vars_to_crop[[var_to_crop]] <-
Subset(transformed_var_with_selectors, inner_dim, crop_indices)
} else {
vars_to_crop[[var_to_crop]] <- Subset(vars_to_crop[[var_to_crop]], inner_dim, crop_indices)
}
} else {
vars_to_crop[[var_to_crop]] <- Subset(vars_to_crop[[var_to_crop]], inner_dim, crop_indices)
}
}
}
}
if (i == length(dat)) {
for (common_var_to_crop in names(common_vars_to_crop)) {
if (inner_dim %in% names(dim(common_vars_to_crop[[common_var_to_crop]]))) {
if (type_of_var_to_crop == 'transformed' & !aiat) {
if (!(length(selector_array) == 1 &
selector_array %in% c('all', 'first', 'last'))) {
common_vars_to_crop[[common_var_to_crop]] <-
Subset(transformed_subset_var, inner_dim, crop_indices)
aho
committed
} else {
common_vars_to_crop[[common_var_to_crop]] <-
Subset(transformed_var_with_selectors, inner_dim, crop_indices)
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
} else { #old code
common_vars_to_crop[[common_var_to_crop]] <- Subset(common_vars_to_crop[[common_var_to_crop]], inner_dim, crop_indices)
}
}
}
}
if (type_of_var_to_crop == 'transformed') {
if (!is.null(vars_to_crop)) {
transformed_vars[[i]] <- vars_to_crop
}
if (i == length(dat)) {
transformed_common_vars <- common_vars_to_crop
}
} else if (type_of_var_to_crop == 'reordered') {
if (!is.null(vars_to_crop)) {
picked_vars_ordered[[i]] <- vars_to_crop
}
if (i == length(dat)) {
picked_common_vars_ordered <- common_vars_to_crop
}
} else {
if (!is.null(vars_to_crop)) {
picked_vars[[i]] <- vars_to_crop
}
if (i == length(dat)) {
#NOTE: To avoid redundant run
if (inner_dim %in% names(common_vars_to_crop)) {
picked_common_vars <- common_vars_to_crop
}
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
}
}
}
#}
}
# After the selectors have been picked (using the original variables),
# the variables are transformed. At that point, the original selectors
# for the transformed variables are also kept in the variable original_selectors.
#print("L")
}
}
}
# if (!is.null(transformed_common_vars)) {
# picked_common_vars[names(transformed_common_vars)] <- transformed_common_vars
# }
# Remove the trailing chunks, if any.
for (file_dim in names(dims_to_crop)) {
# indices_to_keep <- min(sapply(dims_to_crop[[file_dim]], min)):max(sapply(dims_to_crop[[file_dim]], max))
## TODO: Merge indices in dims_to_crop with some advanced mechanism?
indices_to_keep <- unique(unlist(dims_to_crop[[file_dim]]))
array_of_files_to_load <- Subset(array_of_files_to_load, file_dim, indices_to_keep)
array_of_not_found_files <- Subset(array_of_not_found_files, file_dim, indices_to_keep)
for (i in 1:length(dat)) {
# Crop selectors
for (selector_dim in names(dat[[i]][['selectors']])) {
if (selector_dim == file_dim) {
for (j in 1:length(dat[[i]][['selectors']][[selector_dim]][['fri']])) {
dat[[i]][['selectors']][[selector_dim]][['fri']][[j]] <- dat[[i]][['selectors']][[selector_dim]][['fri']][[j]][indices_to_keep]
}
for (j in 1:length(dat[[i]][['selectors']][[selector_dim]][['sri']])) {
dat[[i]][['selectors']][[selector_dim]][['sri']][[j]] <- dat[[i]][['selectors']][[selector_dim]][['sri']][[j]][indices_to_keep]
}
}
if (file_dim %in% names(dim(dat[[i]][['selectors']][[selector_dim]][['fri']]))) {
dat[[i]][['selectors']][[selector_dim]][['fri']] <- Subset(dat[[i]][['selectors']][[selector_dim]][['fri']], file_dim, indices_to_keep)
dat[[i]][['selectors']][[selector_dim]][['sri']] <- Subset(dat[[i]][['selectors']][[selector_dim]][['sri']], file_dim, indices_to_keep)
}
}
# Crop dataset variables file dims.
for (picked_var in names(picked_vars[[i]])) {
if (file_dim %in% names(dim(picked_vars[[i]][[picked_var]]))) {
picked_vars[[i]][[picked_var]] <- Subset(picked_vars[[i]][[picked_var]], file_dim, indices_to_keep)
}
}
for (transformed_var in names(transformed_vars[[i]])) {
if (file_dim %in% names(dim(transformed_vars[[i]][[transformed_var]]))) {
transformed_vars[[i]][[transformed_var]] <- Subset(transformed_vars[[i]][[transformed_var]], file_dim, indices_to_keep)
}
}
}
# Crop common variables file dims.
for (picked_common_var in names(picked_common_vars)) {
if (file_dim %in% names(dim(picked_common_vars[[picked_common_var]]))) {
picked_common_vars[[picked_common_var]] <- Subset(picked_common_vars[[picked_common_var]], file_dim, indices_to_keep)
}
}
for (transformed_common_var in names(transformed_common_vars)) {
if (file_dim %in% names(dim(transformed_common_vars[[transformed_common_var]]))) {
transformed_common_vars[[transformed_common_var]] <- Subset(transformed_common_vars[[transformed_common_var]], file_dim, indices_to_keep)
}
}
}
# Calculate the size of the final array.
total_inner_dims <- NULL
for (i in 1:length(dat)) {
if (dataset_has_files[i]) {
inner_dims <- expected_inner_dims[[i]]
inner_dims <- sapply(inner_dims,
function(x) {
if (!all(sapply(dat[[i]][['selectors']][[x]][['sri']], is.null))) {
max(sapply(dat[[i]][['selectors']][[x]][['sri']], length))
} else {
if (length(var_params[[x]]) > 0) {
if (var_params[[x]] %in% names(transformed_vars[[i]])) {
length(transformed_vars[[i]][[var_params[[x]]]])
} else if (var_params[[x]] %in% names(transformed_common_vars)) {
length(transformed_common_vars[[var_params[[x]]]])
} else {
max(sapply(dat[[i]][['selectors']][[x]][['fri']], length))
}
} else {
max(sapply(dat[[i]][['selectors']][[x]][['fri']], length))
}
}
})
names(inner_dims) <- expected_inner_dims[[i]]
if (is.null(total_inner_dims)) {
total_inner_dims <- inner_dims
} else {
new_dims <- .MergeArrayDims(total_inner_dims, inner_dims)
total_inner_dims <- new_dims[[3]]
}
}
}
new_dims <- .MergeArrayDims(dim(array_of_files_to_load), total_inner_dims)
final_dims <- new_dims[[3]][dim_names]
# final_dims_fake is the vector of final dimensions after having merged the
# 'across' file dimensions with the respective 'across' inner dimensions, and
# after having broken into multiple dimensions those dimensions for which
# multidimensional selectors have been provided.
# final_dims will be used for collocation of data, whereas final_dims_fake
# will be used for shaping the final array to be returned to the user.
final_dims_fake <- final_dims
if (merge_across_dims) {
final_dims_fake <- dims_merge(inner_dims_across_files, final_dims_fake)
#=========================================================================
# Find the dimension to split if split_multiselected_dims = TRUE.
# If there is no dimension able to be split, change split_multiselected_dims to FALSE.
tmp <- dims_split(dim_params, final_dims_fake)
final_dims_fake <- tmp[[1]]
# all_split_dims is a list containing all the split dims
all_split_dims <- tmp[[2]]
if (is.null(all_split_dims)) {
split_multiselected_dims <- FALSE
.warning(paste0("Not found any dimensions able to be split. The parameter ",
"'split_multiselected_dims' is changed to FALSE."))
}
#======================================================================
# If only merge_across_dims and merge_across_dims_narm and no split_multiselected_dims,
# the length of inner across dim (e.g., time) needs to be adjusted. Sum up the actual length
# without potential NAs.
if (merge_across_dims) {
across_inner_dim <- inner_dims_across_files[[1]] #TODO: more than one?
# Get the length of each inner_dim ('time') along each file_dim ('file_date')
length_inner_across_dim <- lapply(dat[[i]][['selectors']][[across_inner_dim]][['fri']], length)
if (merge_across_dims_narm & !split_multiselected_dims) {
final_dims_fake <- merge_narm_dims(final_dims_fake, across_inner_dim, length_inner_across_dim)
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
}
}
if (!silent) {
.message("Detected dimension sizes:")
longest_dim_len <- max(sapply(names(final_dims_fake), nchar))
longest_size_len <- max(sapply(paste0(final_dims_fake, ''), nchar))
sapply(names(final_dims_fake),
function(x) {
message(paste0("* ", paste(rep(' ', longest_dim_len - nchar(x)), collapse = ''),
x, ": ", paste(rep(' ', longest_size_len - nchar(paste0(final_dims_fake[x], ''))), collapse = ''),
final_dims_fake[x]))
})
bytes <- prod(c(final_dims_fake, 8))
dim_sizes <- paste(final_dims_fake, collapse = ' x ')
if (retrieve) {
.message(paste("Total size of requested data:"))
} else {
.message(paste("Total size of involved data:"))
}
.message(paste(dim_sizes, " x 8 bytes =",
format(structure(bytes, class = "object_size"), units = "auto")),
indent = 2)
}
# NOTE: If split_multiselected_dims + merge_across_dims, the dim order may need to be changed.
# The inner_dim needs to be the first dim among split dims.
# TODO: Cannot control whether the remaining dims are in the same order or not...
# We assume users specify the across inner dim and file dim in the same order.
if (split_multiselected_dims & merge_across_dims) {
# TODO: More than one split?
inner_dim_pos_in_split_dims <- which(names(all_split_dims[[1]]) == inner_dims_across_files)
# if inner_dim is not the first, change!
if (inner_dim_pos_in_split_dims != 1) {
# Save the current final_dims_fake for reordering it back later
tmp <- reorder_split_dims(all_split_dims[[1]], inner_dim_pos_in_split_dims, final_dims_fake)
final_dims_fake <- tmp[[1]]
all_split_dims[[1]] <- tmp[[2]]
# The following several lines will only run if retrieve = TRUE
if (retrieve) {
########## CREATING THE SHARED MATRIX AND DISPATCHING WORK PIECES ###########
# TODO: try performance of storing all in cols instead of rows
# Create the shared memory array, and a pointer to it, to be sent
# to the work pieces.
if (is.null(ObjectBigmemory)) {
data_array <- bigmemory::big.matrix(nrow = prod(final_dims), ncol = 1)
} else {
data_array <- bigmemory::big.matrix(nrow = prod(final_dims), ncol = 1,
backingfile = ObjectBigmemory)
}
if (is.null(ObjectBigmemory)) {
name_bigmemory_obj <- attr(shared_matrix_pointer, 'description')$sharedName
} else {
name_bigmemory_obj <- attr(shared_matrix_pointer, 'description')$filename
}
#warning(paste("SharedName:", attr(shared_matrix_pointer, 'description')$sharedName))
#warning(paste("Filename:", attr(shared_matrix_pointer, 'description')$filename))
nperez
committed
#if (!is.null(ObjectBigmemory)) {
# attr(shared_matrix_pointer, 'description')$sharedName <- ObjectBigmemory
#}
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
if (is.null(num_procs)) {
num_procs <- future::availableCores()
}
# Creating a shared tmp folder to store metadata from each chunk
array_of_metadata_flags <- array(FALSE, dim = dim(array_of_files_to_load))
if (!is.null(metadata_dims)) {
metadata_indices_to_load <- as.list(rep(1, length(dim(array_of_files_to_load))))
names(metadata_indices_to_load) <- names(dim(array_of_files_to_load))
metadata_indices_to_load[metadata_dims] <- as.list(rep(TRUE, length(metadata_dims)))
array_of_metadata_flags <- do.call('[<-', c(list(array_of_metadata_flags), metadata_indices_to_load,
list(value = rep(TRUE, prod(dim(array_of_files_to_load)[metadata_dims])))))
}
metadata_file_counter <- 0
metadata_folder <- tempfile('metadata')
dir.create(metadata_folder)
# Build the work pieces, each with:
# - file path
# - total size (dims) of store array
# - start position in store array
# - file selectors (to provide extra info. useful e.g. to select variable)
# - indices to take from file
work_pieces <- list()
for (i in 1:length(dat)) {
if (dataset_has_files[i]) {
# metadata_file_counter may be changed by the following function
work_pieces <- build_work_pieces(
work_pieces = work_pieces, i = i, selectors = dat[[i]][['selectors']],
file_dims = found_file_dims[[i]],
inner_dims = expected_inner_dims[[i]], final_dims = final_dims,
found_pattern_dim = found_pattern_dim,
inner_dims_across_files = inner_dims_across_files,
array_of_files_to_load = array_of_files_to_load,
array_of_not_found_files = array_of_not_found_files,
array_of_metadata_flags = array_of_metadata_flags,
metadata_file_counter = metadata_file_counter,
depending_file_dims = depending_file_dims, transform = transform,
transform_vars = transform_vars, picked_vars = picked_vars[[i]],
picked_vars_ordered = picked_vars_ordered[[i]],
picked_common_vars = picked_common_vars,
picked_common_vars_ordered = picked_common_vars_ordered,
metadata_folder = metadata_folder, debug = debug)
}
}
#print("N")
if (debug) {
print("-> WORK PIECES BUILT")
}
# Calculate the progress %s that will be displayed and assign them to
# the appropriate work pieces.
work_pieces <- retrieve_progress_message(work_pieces, num_procs, silent)
# NOTE: In .LoadDataFile(), metadata is saved in metadata_folder/1 or /2 etc. Before here,
# the path name is created in work_pieces but the path hasn't been built yet.
if (num_procs == 1) {
found_files <- lapply(work_pieces, .LoadDataFile,
shared_matrix_pointer = shared_matrix_pointer,
file_data_reader = file_data_reader,
synonims = synonims,
transform = transform,
transform_params = transform_params,
transform_crop_domain = transform_crop_domain,
silent = silent, debug = debug)
} else {
cluster <- parallel::makeCluster(num_procs, outfile = "")
# Send the heavy work to the workers
work_errors <- try({
found_files <- parallel::clusterApplyLB(cluster, work_pieces, .LoadDataFile,
shared_matrix_pointer = shared_matrix_pointer,
file_data_reader = file_data_reader,
synonims = synonims,
transform = transform,
transform_params = transform_params,
transform_crop_domain = transform_crop_domain,
silent = silent, debug = debug)
})
parallel::stopCluster(cluster)
}
if (!silent) {
# if (progress_message != '')
if (length(work_pieces) / num_procs >= 2 && !silent) {
.message("\n", tag = '')
}
}
#print("P")
# NOTE: If merge_across_dims = TRUE, there might be additional NAs due to
# unequal inner_dim ('time') length across file_dim ('file_date').
# If merge_across_dims_narm = TRUE, add additional lines to remove these NAs.
# TODO: Now it assumes that only one '_across'. Add a for loop for more-than-one case.
if (merge_across_dims & (split_multiselected_dims | merge_across_dims_narm)) {
if (!merge_across_dims_narm) {
data_array_tmp <- array(bigmemory::as.matrix(data_array), dim = final_dims)
} else {
data_array_tmp <- remove_additional_na_from_merge(
inner_dims_across_files, final_dims, across_inner_dim,
length_inner_across_dim, data_array)
if (length(data_array_tmp) != prod(final_dims_fake)) {
stop(paste0("After reshaping, the data do not fit into the expected output dimension. ",
"Check if the reshaping parameters are used correctly."))
#NOTE: When one file contains values for dicrete dimensions, rearrange the
# chunks (i.e., work_piece) is necessary.
data_array_tmp <- rebuild_array_merge_split(
data_array_tmp, indices_chunk, all_split_dims, final_dims_fake,
across_inner_dim, length_inner_across_dim)
}
data_array <- array(data_array_tmp, dim = final_dims_fake)
} else { # ! (merge_across_dims + split_multiselected_dims) (old version)
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
data_array <- array(bigmemory::as.matrix(data_array), dim = final_dims_fake)
}
# NOTE: If split_multiselected_dims + merge_across_dims, the dimension order may change above.
# To get the user-required dim order, we need to reorder the array again.
if (split_multiselected_dims & merge_across_dims) {
if (inner_dim_pos_in_split_dims != 1) {
correct_order <- match(names(final_dims_fake_output), names(final_dims_fake))
data_array <- .aperm2(data_array, correct_order)
}
}
gc()
# Load metadata and remove the metadata folder
if (!is.null(metadata_dims)) {
loaded_metadata_files <- list.files(metadata_folder)
if (!identical(loaded_metadata_files, character(0))) { # old code
loaded_metadata <- lapply(paste0(metadata_folder, '/', loaded_metadata_files), readRDS)
} else {
loaded_metadata <- NULL
}
unlink(metadata_folder, recursive = TRUE)
# Create a list of metadata of the variable (e.g., tas)
return_metadata <- create_metadata_list(array_of_metadata_flags, metadata_dims, pattern_dims,
loaded_metadata_files, loaded_metadata, dat_names,
dataset_has_files)
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
# TODO: Try to infer data type from loaded_metadata
# as.integer(data_array)
}
failed_pieces <- work_pieces[which(unlist(found_files))]
for (failed_piece in failed_pieces) {
array_of_not_found_files <- do.call('[<-',
c(list(array_of_not_found_files),
as.list(failed_piece[['file_indices_in_array_of_files']]),
list(value = TRUE)))
}
if (any(array_of_not_found_files)) {
for (i in 1:prod(dim(array_of_files_to_load))) {
if (is.na(array_of_not_found_files[i])) {
array_of_files_to_load[i] <- NA
} else {
if (array_of_not_found_files[i]) {
array_of_not_found_files[i] <- array_of_files_to_load[i]
array_of_files_to_load[i] <- NA
} else {
array_of_not_found_files[i] <- NA
}
}
}
} else {
array_of_not_found_files <- NULL
}
} # End if (retrieve)
# Change final_dims_fake back because retrieve = FALSE will use it for attributes later
if (exists("final_dims_fake_output")) {
final_dims_fake <- final_dims_fake_output
}
# Replace the vars and common vars by the transformed vars and common vars
for (i in 1:length(dat)) {
if (length(names(transformed_vars[[i]])) > 0) {
picked_vars[[i]][names(transformed_vars[[i]])] <- transformed_vars[[i]]
} else if (length(names(picked_vars_ordered[[i]])) > 0) {
picked_vars[[i]][names(picked_vars_ordered[[i]])] <- picked_vars_ordered[[i]]
}
}
if (length(names(transformed_common_vars)) > 0) {
picked_common_vars[names(transformed_common_vars)] <- transformed_common_vars
} else if (length(names(picked_common_vars_ordered)) > 0) {
picked_common_vars[names(picked_common_vars_ordered)] <- picked_common_vars_ordered
}
if (debug) {
print("-> THE TRANSFORMED VARS:")
print(str(transformed_vars))
print("-> THE PICKED VARS:")
print(str(picked_vars))
}
file_selectors <- NULL
for (i in 1:length(dat)) {
file_selectors[[dat[[i]][['name']]]] <- dat[[i]][['selectors']][which(names(dat[[i]][['selectors']]) %in% found_file_dims[[i]])]
}
if (retrieve) {
if (!silent) {
.message("Successfully retrieved data.")
}
if (all(sapply(return_metadata, is.null))) {
# We don't have metadata of the variable (e.g., tas). The returned metadata list only
# contains those are specified in argument "return_vars".
Variables_list <- c(list(common = picked_common_vars), picked_vars)
.warning(paste0("Metadata cannot be retrieved. The reason may be the ",
"non-existence of the first file. Use parameter 'metadata_dims'",
" to assign to file dimensions along which to return metadata, ",
"or check the existence of the first file."))
} else {
# Add the metadata of the variable (e.g., tas) into the list of picked_vars or
# picked_common_vars.
Variables_list <- combine_metadata_picked_vars(
return_metadata, picked_vars, picked_common_vars,
metadata_dims, pattern_dims, length(dat))
Files = array_of_files_to_load,
NotFoundFiles = array_of_not_found_files,
FileSelectors = file_selectors,
ObjectBigmemory = name_bigmemory_obj) #attr(shared_matrix_pointer, 'description')$sharedName)
)
attr(data_array, 'class') <- c('startR_array', attr(data_array, 'class'))
data_array
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
if (!silent) {
.message("Successfully discovered data dimensions.")
}
start_call <- match.call()
for (i in 2:length(start_call)) {
if (class(start_call[[i]]) %in% c('name', 'call')) {
start_call[[i]] <- eval.parent(start_call[[i]])
}
}
start_call[['retrieve']] <- TRUE
attributes(start_call) <- c(attributes(start_call),
list(Dimensions = final_dims_fake,
Variables = c(list(common = picked_common_vars), picked_vars),
ExpectedFiles = array_of_files_to_load,
FileSelectors = file_selectors,
PatternDim = found_pattern_dim,
MergedDims = if (merge_across_dims) {
inner_dims_across_files
} else {
NULL
},
SplitDims = if (split_multiselected_dims) {
all_split_dims
} else {
NULL
})
)
attr(start_call, 'class') <- c('startR_cube', attr(start_call, 'class'))
start_call
}
}
# This function is responsible for loading the data of each work
# piece.
.LoadDataFile <- function(work_piece, shared_matrix_pointer,
file_data_reader, synonims,
transform, transform_params, transform_crop_domain = NULL,
nperez
committed
#warning(attr(shared_matrix_pointer, 'description')$sharedName)
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
# suppressPackageStartupMessages({library(bigmemory)})
### TODO: Specify dependencies as parameter
# suppressPackageStartupMessages({library(ncdf4)})
#print("1")
store_indices <- as.list(work_piece[['store_position']])
first_round_indices <- work_piece[['first_round_indices']]
second_round_indices <- work_piece[['second_round_indices']]
#print("2")
file_to_open <- work_piece[['file_path']]
sub_array <- file_data_reader(file_to_open, NULL,
work_piece[['file_selectors']],
first_round_indices, synonims)
if (debug) {
if (all(unlist(store_indices[1:6]) == 1)) {
print("-> LOADING A WORK PIECE")
print("-> STRUCTURE OF READ UNTRANSFORMED DATA:")
print(str(sub_array))
print("-> STRUCTURE OF VARIABLES TO TRANSFORM:")
print(str(work_piece[['vars_to_transform']]))
print("-> COMMON ARRAY DIMENSIONS:")
print(str(work_piece[['store_dims']]))
}
}
if (!is.null(sub_array)) {
# Apply data transformation once we have the data arrays.
if (!is.null(transform)) {
if (debug) {
if (all(unlist(store_indices[1:6]) == 1)) {
print("-> PROCEEDING TO TRANSFORM ARRAY")
print("-> DIMENSIONS OF ARRAY RIGHT BEFORE TRANSFORMING:")
print(dim(sub_array))
}
}
sub_array <- do.call(transform, c(list(data_array = sub_array,
variables = work_piece[['vars_to_transform']],
file_selectors = work_piece[['file_selectors']],
crop_domain = transform_crop_domain),
transform_params))
if (debug) {
if (all(unlist(store_indices[1:6]) == 1)) {
print("-> STRUCTURE OF ARRAY AND VARIABLES RIGHT AFTER TRANSFORMING:")
print(str(sub_array))
print("-> DIMENSIONS OF ARRAY RIGHT AFTER TRANSFORMING:")
print(dim(sub_array$data_array))
}
}
sub_array <- sub_array$data_array
# Subset with second round of indices