1 change: 1 addition & 0 deletions .gitignore
@@ -139,3 +139,4 @@ inst/extdata/filesize_hex_wide.R
inst/extdata/filesize_checks.R
inst/extdata/bigdata.sh
*.parquet
pipeline/
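
The newly ignored `pipeline/` directory is where the crew worker metrics land (see `crew_options_metrics(path = "pipeline/")` in `_targets.R` below). After a run, those logs can be inspected from R; a minimal sketch, assuming autometric's log reader accepts the log directory:

# Sketch: read the process metrics crew wrote under pipeline/ and skim them.
library(autometric)
metrics <- log_read("pipeline/")  # one row per sampled worker-process snapshot
head(metrics)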
4 changes: 3 additions & 1 deletion R/processing.R
@@ -345,7 +345,6 @@ process_narr2 <- function(
#' @importFrom stringi stri_extract_first_regex stri_sub
#' @importFrom terra writeRaster
#' @author Insang Song
#' @keywords Calculation
#' @export
#' @examples
#' \dontrun{
@@ -390,6 +389,9 @@ export_tif <- function(
subdataset = "",
dest = "."
) {
if (!dir.exists(dest)) {
dir.create(dest, recursive = TRUE)
}
redate <- stringi::stri_extract_first_regex(path_in, pat)
redate <- unique(redate)
redate <- stringi::stri_sub(redate, 2, 8)
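
The new `dir.exists()` guard lets `export_tif()` create its destination on demand, so callers no longer need to set up the output directory beforehand. The idiom in isolation (the path below is hypothetical):

# Create the destination directory only when missing; recursive = TRUE also
# creates any missing parent directories, and the guard avoids the warning
# dir.create() emits for a path that already exists.
dest <- "output/tif/2020"
if (!dir.exists(dest)) {
  dir.create(dest, recursive = TRUE)
}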
135 changes: 129 additions & 6 deletions _targets.R
@@ -66,14 +66,15 @@ controller_mlp <- crew.cluster::crew_controller_slurm(
script_lines = scriptlines_mlp
)
)

##### `controller_lgb` uses 100 CPUs for {lightGBM} models.
scriptlines_lgb <- glue::glue(
"#SBATCH --job-name=lgb \
#SBATCH --partition=gpu \
#SBATCH --nodelist=gn040809 \
#SBATCH --ntasks=1 \
#SBATCH --cpus-per-task=32 \
#SBATCH --mem=100G \
#SBATCH --cpus-per-task=25 \
#SBATCH --mem=500G \
#SBATCH --error=slurm/lgb_%j.out \
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK \
export LIGHTGBM_NUM_THREADS=$SLURM_CPUS_PER_TASK \
@@ -87,13 +88,126 @@ scriptlines_lgb <- glue::glue(
)
controller_lgb <- crew.cluster::crew_controller_slurm(
name = "controller_lgb",
workers = 3,
workers = 25,
options_cluster = crew.cluster::crew_options_slurm(
verbose = TRUE,
script_lines = scriptlines_lgb
)
)

scriptlines_backup <- glue::glue(
"#SBATCH --job-name=g_back \
#SBATCH --partition=normal,highmem,geo \
#SBATCH --ntasks=1 \
#SBATCH --cpus-per-task=1 \
#SBATCH --mem=200G \
#SBATCH --error=slurm/backup_%j.out \
module load R \
set -euo pipefail \
{scriptlines_apptainer} exec ",
"--bind {scriptlines_basedir}:/mnt ",
"--bind {scriptlines_basedir}/inst:/inst ",
"--bind {scriptlines_inputdir}:/input ",
"--bind {scriptlines_targetdir}/targets:/opt/_targets ",
"{scriptlines_container} \\"
)

controller_backup <- crew.cluster::crew_controller_slurm(
name = "controller_backup",
workers = 100,
options_cluster = crew.cluster::crew_options_slurm(
verbose = TRUE,
script_lines = scriptlines_backup
),
garbage_collection = TRUE
)

##### `controller_grid` distributes grid covariate targets across single-CPU SLURM workers.
scriptlines_grid <- glue::glue(
"#SBATCH --job-name=grid \
#SBATCH --partition=normal,highmem \
#SBATCH --requeue \
#SBATCH --ntasks=1 \
#SBATCH --cpus-per-task=1 \
#SBATCH --mem=35G \
#SBATCH --error=slurm/grid_%j.out \
module load R \
set -euo pipefail \
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK \
{scriptlines_apptainer} exec --env OMP_NUM_THREADS=$OMP_NUM_THREADS ",
"--bind {scriptlines_basedir}:/mnt ",
"--bind {scriptlines_basedir}/inst:/inst ",
"--bind {scriptlines_inputdir}:/input ",
"--bind /run/munge:/run/munge ",
"--bind /ddn/gs1/tools/slurm/etc/slurm:/ddn/gs1/tools/slurm/etc/slurm ",
"--bind {scriptlines_targetdir}/targets:/opt/_targets ",
"{scriptlines_container} \\"
)


controller_grid <- crew.cluster::crew_controller_slurm(
name = "controller_grid",
workers = 1000,
crashes_max = 5L,
options_cluster = crew.cluster::crew_options_slurm(
verbose = TRUE,
script_lines = scriptlines_grid
),
options_metrics = crew::crew_options_metrics(
path = "pipeline/",
seconds_interval = 1
),
backup = controller_backup,
tasks_max = 1L
)

# #SBATCH --nodelist=cn040301,cn040609,cn030307,cn030309,cn030311 \

scriptlines_big_grid <- glue::glue(
"#SBATCH --job-name=biggrid \
#SBATCH --partition=normal,highmem \
#SBATCH --requeue \
#SBATCH --ntasks=1 \
#SBATCH --cpus-per-task=1 \
#SBATCH --mem=50G \
#SBATCH --error=slurm/bgrid_%j.out \
module load R \
set -euo pipefail \
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK \
{scriptlines_apptainer} exec --env OMP_NUM_THREADS=$OMP_NUM_THREADS ",
"--bind {scriptlines_basedir}:/mnt ",
"--bind {scriptlines_basedir}/inst:/inst ",
"--bind {scriptlines_inputdir}:/input ",
"--bind /run/munge:/run/munge ",
"--bind /ddn/gs1/tools/slurm/etc/slurm:/ddn/gs1/tools/slurm/etc/slurm ",
"--bind {scriptlines_targetdir}/targets:/opt/_targets ",
"{scriptlines_container} \\"
)

controller_big_grid <- crew.cluster::crew_controller_slurm(
name = "controller_big_grid",
workers = 1000,
crashes_max = 5L,
options_cluster = crew.cluster::crew_options_slurm(
verbose = TRUE,
script_lines = scriptlines_big_grid
),
options_metrics = crew::crew_options_metrics(
path = "pipeline/",
seconds_interval = 1
),
backup = controller_backup,
tasks_max = 1L
)


# if (targets::tar_active()) {
# autometric::log_start(
# path = "main_process.txt", # Statistics on the main process go here.
# seconds = 1
# )
# }

############################## STORE ##############################
targets::tar_config_set(store = "/opt/_targets")
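
Each `scriptlines_*` string above is injected verbatim into the worker launch script through `crew_options_slurm(script_lines = ...)`, so the `#SBATCH` directives, `module load R`, and the Apptainer `exec` prefix all run before the R worker starts; `tasks_max = 1L` retires a worker after a single task, and `backup = controller_backup` retries tasks on the high-memory backup workers once they exceed `crashes_max` on the grid controllers. A stripped-down sketch of the pattern, reusing the `scriptlines_apptainer`, `scriptlines_basedir`, and `scriptlines_container` values defined earlier in this file (resource figures here are placeholders):

# Sketch only: a minimal controller in the same style as those above.
scriptlines_sketch <- glue::glue(
  "#SBATCH --job-name=sketch \
  #SBATCH --ntasks=1 \
  #SBATCH --cpus-per-task=1 \
  #SBATCH --mem=8G \
  module load R \
  {scriptlines_apptainer} exec ",
  "--bind {scriptlines_basedir}:/mnt ",
  "{scriptlines_container} \\"
)
controller_sketch <- crew.cluster::crew_controller_slurm(
  name = "controller_sketch",
  workers = 2,
  options_cluster = crew.cluster::crew_options_slurm(
    verbose = TRUE,
    script_lines = scriptlines_sketch
  )
)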

@@ -110,6 +224,7 @@ if (Sys.getenv("BEETHOVEN") == "covariates") {
"sf",
"crew",
"crew.cluster",
"mirai",
"lubridate",
"qs2",
"kernlab",
@@ -128,6 +243,7 @@ if (Sys.getenv("BEETHOVEN") == "covariates") {
"crew",
"crew.cluster",
"lubridate",
"mirai",
"qs2",
"torch",
"parsnip",
@@ -139,14 +255,17 @@ if (Sys.getenv("BEETHOVEN") == "covariates") {
"spatialsample",
"tidymodels",
"brulee",
"workflows"
"workflows",
"h3",
"h3r",
"autometric"
)
}
targets::tar_option_set(
packages = beethoven_packages,
repository = "local",
error = "continue",
memory = "transient",
memory = "auto",
format = "qs",
storage = "worker",
deployment = "worker",
@@ -161,7 +280,10 @@ targets::tar_option_set(
controller_5,
controller_1,
controller_mlp,
controller_lgb
controller_lgb,
controller_grid,
controller_backup,
controller_big_grid
),
resources = targets::tar_resources(
crew = targets::tar_resources_crew(controller = "controller_250")
@@ -194,6 +316,7 @@ if (Sys.getenv("BEETHOVEN") == "covariates") {
target_predict <- list()
}


############################## PIPELINE ##############################
list(
target_critical,
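
With the new controllers registered in `tar_option_set()` above, an individual target opts into one of them through its `resources` argument; anything without an override stays on the default `controller_250`. A sketch with a hypothetical target name and function:

# Sketch: route one target to the grid controller defined in this file.
targets::tar_target(
  grid_covariates_sketch,          # hypothetical target
  calculate_grid_covariates(),     # hypothetical function
  resources = targets::tar_resources(
    crew = targets::tar_resources_crew(controller = "controller_grid")
  )
)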
4 changes: 3 additions & 1 deletion container/container_models.def
@@ -104,14 +104,16 @@ From: nvidia/cuda:11.8.0-devel-ubuntu22.04
Rscript -e "pak::pak('shikokuchuo/nanonext'); pak::pak('shikokuchuo/mirai')"
Rscript -e "devtools::install_version('crew', version = '1.1.2')"
Rscript -e "devtools::install_version('crew.cluster', version = '0.3.7')"
Rscript -e "pak::pak('wlandau/autometric')"
# Rscript -e "pak::pak('wlandau/crew'); pak::pak('wlandau/crew.cluster')"
Rscript -e "install.packages(c('scoringRules', 'future.mirai', 'vetiver', \
'lwgeom', 'doRNG', 'quarto', 'kernlab', 'arrow', 'h3r'))"
Rscript -e "pak::pak('finetune')"
Rscript -e "pak::pak('lme4',upgrade = TRUE)"
Rscript -e "remotes::install_github('crazycapivara/h3-r')"
Rscript -e "pak::pak('ropensci/chopin')"
Rscript -e "pak::pak('NIEHS/beethoven@mm-0701')"
Rscript -e "pak::pak('NIEHS/amadeus')"
Rscript -e "pak::pak('NIEHS/beethoven')"
Rscript -e "pak::pak('r-lib/ps', upgrade = TRUE)"

%environment
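
Because `crew` and `crew.cluster` are pinned with `install_version()` while neighboring packages are installed at their latest versions, a quick check inside the built image can catch an accidental upgrade; a sketch to run in the container's R session:

# Sketch: fail loudly if the pinned orchestration versions drifted.
stopifnot(
  packageVersion("crew") == "1.1.2",
  packageVersion("crew.cluster") == "0.3.7"
)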
2 changes: 1 addition & 1 deletion inst/scripts/run_covariates.sh
@@ -1,7 +1,7 @@
#!/bin/bash

#SBATCH --job-name=covariate
#SBATCH --mail-user=mitchell.manware@nih.gov
#SBATCH --mail-user=kyle.messier@nih.gov
#SBATCH --mail-type=END,FAIL
#SBATCH --partition=geo
#SBATCH --ntasks=1
42 changes: 42 additions & 0 deletions inst/scripts/run_grid.sh
@@ -0,0 +1,42 @@
#!/bin/bash

#SBATCH --job-name=gridh3
#SBATCH --mail-user=kyle.messier@nih.gov
#SBATCH --mail-type=END,FAIL
#SBATCH --partition=normal
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=1G
#SBATCH --error=slurm/gridh3_%j.err
#SBATCH --output=slurm/gridh3_%j.out

############################ CERTIFICATES ############################
# Export CURL_CA_BUNDLE and SSL_CERT_FILE environmental variables to verify
# servers' SSL certificates during download.
export CURL_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
export SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt

############################# PREDICTION COVARIATES #############################
# Set environmental variable to indicate download and covariate
# calculation targets.
export BEETHOVEN=grid

############################### GPU SETUP #############################
# Ensure all allocated GPUs are visible
export CUDA_VISIBLE_DEVICES=$(echo $(seq 0 $((SLURM_GPUS_ON_NODE-1))) | tr ' ' ',')


# Set stack size limit for large merge of TRI covariates.
ulimit -s 20000

# Download and calculate covariates via container_covariates.sif
apptainer exec \
--nv \
--bind $PWD:/mnt \
--bind $PWD/inst:/inst \
--bind /ddn/gs1/group/set/Projects/NRT-AP-Model/input:/input \
--bind /ddn/gs1/group/set/Projects/beethoven/targets:/opt/_targets \
--bind /run/munge:/run/munge \
--bind /ddn/gs1/tools/slurm/etc/slurm:/ddn/gs1/tools/slurm/etc/slurm \
container_covariates.sif \
/usr/local/lib/R/bin/Rscript --no-init-file /mnt/inst/targets/targets_start.R
11 changes: 5 additions & 6 deletions inst/scripts/run_models.sh
@@ -1,12 +1,12 @@
#!/bin/bash

#SBATCH --job-name=model
#SBATCH --mail-user=mitchell.manware@nih.gov
#SBATCH --mail-user=kyle.messier@nih.gov
#SBATCH --mail-type=END,FAIL
#SBATCH --partition=geo
#SBATCH --partition=normal
#SBATCH --ntasks=1
#SBATCH --mem=900G
#SBATCH --cpus-per-task=225
#SBATCH --mem=4G
#SBATCH --cpus-per-task=1
#SBATCH --error=slurm/model_%j.err
#SBATCH --output=slurm/model_%j.out

@@ -18,15 +18,14 @@ export SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt

############################### GPU SETUP #############################
# Ensure all allocated GPUs are visible
export CUDA_VISIBLE_DEVICES=$(echo $(seq 0 $((SLURM_GPUS_ON_NODE-1))) | tr ' ' ',')
# export CUDA_VISIBLE_DEVICES=$(echo $(seq 0 $((SLURM_GPUS_ON_NODE-1))) | tr ' ' ',')

############################# MODELS #############################
# Set environmental variable to indicate CPU-enabled model fitting targets.
export BEETHOVEN=models

# Fit CPU-enabled base learner models via container_models.sif.
apptainer exec \
--nv \
--bind $PWD:/mnt \
--bind $PWD/inst:/inst \
--bind /ddn/gs1/group/set/Projects/NRT-AP-Model/input:/input \
15 changes: 10 additions & 5 deletions inst/scripts/run_predict.sh
@@ -1,12 +1,12 @@
#!/bin/bash

#SBATCH --job-name=predict
#SBATCH --mail-user=mitchell.manware@nih.gov
#SBATCH --job-name=gridh3
#SBATCH --mail-user=kyle.messier@nih.gov
#SBATCH --mail-type=END,FAIL
#SBATCH --partition=geo
#SBATCH --ntasks=1
#SBATCH --mem=900G
#SBATCH --cpus-per-task=225
#SBATCH --cpus-per-task=1
#SBATCH --mem=1G
#SBATCH --error=slurm/predict_%j.err
#SBATCH --output=slurm/predict_%j.out

@@ -16,11 +16,16 @@
export CURL_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
export SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt

############################# COVARIATES #############################
############################# PREDICTION COVARIATES #############################
# Set environmental variable to indicate download and covariate
# calculation targets.
export BEETHOVEN=predict

############################### GPU SETUP #############################
# Ensure all allocated GPUs are visible
export CUDA_VISIBLE_DEVICES=$(echo $(seq 0 $((SLURM_GPUS_ON_NODE-1))) | tr ' ' ',')


# Set stack size limit for large merge of TRI covariates.
ulimit -s 20000
