Open
Description
base::file.info()
is sometimes a bottleneck in targets
pipelines with many files (c.f. ropensci/targets#1403). On a slow nfs drive in a shared RHEL9 cluster, I noticed performance improvements with a custom C implementation that uses fts. Is this an appealing enhancement for ps
, either in ps::ps_fs_stat()
or elsewhere in the package? If not, I will just implement it for targets
. Reprex:
directory <- "files"
if (!file.exists(directory)) {
dir.create(directory)
}
files <- seq_len(5e4)
random_data <- function() {
out <- list()
rows <- sample(seq(from = 800, to = 1200), size = 1)
for (name in paste0("x", seq_len(32L))) {
out[[name]] <- runif(rows)
}
as.data.frame(out)
}
temp <- lapply(files, \(file) {
if (!(file %% 100)) print(file)
saveRDS(
object = random_data(),
file = file.path(directory, paste0("test-data-", file, ".rds")),
compress = FALSE
)
})
paths <- list.files("files", full.names = TRUE)
system.time(out_base <- file.info(paths, extra_cols = FALSE))
#> user system elapsed
#> 0.033 0.345 102.515
file_info_fts <- inline::cfunction(
sig = c(directory = "character"),
includes = c(
"#include <R.h>",
"#include <Rinternals.h>",
"#include <fts.h>",
"#include <sys/stat.h>",
"#include <string.h>",
"#include <errno.h>"
),
body = "const char *path_argv[] = {CHAR(STRING_ELT(directory, 0)), NULL};
FTS *fts = fts_open((char* const*) path_argv, FTS_LOGICAL, NULL);
if (!fts) {
Rf_error(\"fts_open() failed: %s\", strerror(errno));
}
int capacity = 2048;
int count = 0;
FTSENT *entry;
SEXP path;
SEXP size;
SEXP mtime;
PROTECT_INDEX index_path;
PROTECT_INDEX index_size;
PROTECT_INDEX index_mtime;
PROTECT_WITH_INDEX(path = allocVector(STRSXP, capacity), &index_path);
PROTECT_WITH_INDEX(size = allocVector(REALSXP, capacity), &index_size);
PROTECT_WITH_INDEX(mtime = allocVector(REALSXP, capacity), &index_mtime);
while ((entry = fts_read(fts)) != NULL) {
R_CheckUserInterrupt();
if (entry->fts_info == FTS_F) {
if (count == capacity) {
capacity *= 2;
REPROTECT(path = Rf_xlengthgets(path, capacity), index_path);
REPROTECT(size = Rf_xlengthgets(size, capacity), index_size);
REPROTECT(mtime = Rf_xlengthgets(mtime, capacity), index_mtime);
}
SET_STRING_ELT(path, count, mkChar(entry->fts_path));
REAL(size)[count] = (double) entry->fts_statp->st_size;
REAL(mtime)[count] = (double) entry->fts_statp->st_mtime;
count++;
}
}
fts_close(fts);
REPROTECT(path = Rf_xlengthgets(path, count), index_path);
REPROTECT(size = Rf_xlengthgets(size, count), index_size);
REPROTECT(mtime = Rf_xlengthgets(mtime, count), index_mtime);
SEXP result = PROTECT(allocVector(VECSXP, 3));
SEXP names = PROTECT(allocVector(STRSXP, 3));
SET_STRING_ELT(names, 0, mkChar(\"path\"));
SET_STRING_ELT(names, 1, mkChar(\"size\"));
SET_STRING_ELT(names, 2, mkChar(\"mtime\"));
SET_VECTOR_ELT(result, 0, path);
SET_VECTOR_ELT(result, 1, size);
SET_VECTOR_ELT(result, 2, mtime);
setAttrib(result, R_NamesSymbol, names);
UNPROTECT(5);
return result;"
)
system.time(out_fts <- file_info_fts("files"))
#> user system elapsed
#> 0.010 0.100 4.497