3 changes: 3 additions & 0 deletions .gitignore
@@ -151,3 +151,6 @@ _html/

# Parsl log files
run_logs/

# Emacs
*~
262 changes: 235 additions & 27 deletions src/kbmod_wf/resource_configs/klone_configuration.py
@@ -18,93 +18,301 @@ def klone_resource_config():
app_cache=True,
checkpoint_mode="task_exit",
checkpoint_files=get_all_checkpoints(
os.path.join("/gscratch/dirac/kbmod/workflow/run_logs", datetime.date.today().isoformat())
os.path.join(os.path.abspath(os.curdir), datetime.date.today().isoformat())
),
run_dir=os.path.join("/gscratch/dirac/kbmod/workflow/run_logs", datetime.date.today().isoformat()),
run_dir=os.path.join(os.path.abspath(os.curdir), datetime.date.today().isoformat()),
retries=1,
executors=[
####################
# Resample resources
####################
HighThroughputExecutor(
label="small_cpu",
label="astro_96gb_8cpus",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-g2",
partition="compute-bigmem",
account="astro",
min_blocks=0,
max_blocks=4, # Low block count for shared resource
init_blocks=0,
parallelism=1,
nodes_per_block=1,
mem_per_node=96, # 96 GB for >100, 48 for < 100
cores_per_node=8,
exclusive=False,
walltime=walltimes["sharded_reproject"],
worker_init="",
),
),
HighThroughputExecutor(
label="astro_48gb_8cpus",
max_workers=1,
provider=SlurmProvider(
partition="compute-bigmem",
account="astro",
min_blocks=0,
max_blocks=4,
max_blocks=4, # Low block count for shared resource
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=1, # perhaps should be 8???
mem_per_node=256, # In GB
mem_per_node=48, # 96 GB for >100, 48 for < 100
cores_per_node=8,
exclusive=False,
walltime=walltimes["compute_bigmem"],
# Command to run before starting worker - i.e. conda activate <special_env>
walltime=walltimes["sharded_reproject"],
worker_init="",
),
),
HighThroughputExecutor(
label="large_mem",
label="esci_96gb_8cpus",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-g2",
partition="gpu-a40",
account="escience",
min_blocks=0,
max_blocks=4, # low block count for shared resources
init_blocks=0,
parallelism=1,
nodes_per_block=1,
mem_per_node=96, # 96 GB for >100, 48 for < 100
cores_per_node=8,
exclusive=False,
walltime=walltimes["sharded_reproject"],
worker_init="",
),
),
HighThroughputExecutor(
label="esci_48gb_8cpus",
max_workers=1,
provider=SlurmProvider(
partition="gpu-a40",
account="escience",
min_blocks=0,
max_blocks=4, # low block count for shared resources
init_blocks=0,
parallelism=1,
nodes_per_block=1,
mem_per_node=48, # 96 GB for >100, 48 for < 100
cores_per_node=8,
exclusive=False,
walltime=walltimes["sharded_reproject"],
worker_init="",
),
),
HighThroughputExecutor(
label="ckpt_96gb_8cpus",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-all",
account="astro",
min_blocks=0,
max_blocks=2,
max_blocks=50, # scale to the size of the GPU blocks, big number for low memory
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=32,
mem_per_node=512,
mem_per_node=96, # 96 GB for >100, 48 for < 100
cores_per_node=8,
exclusive=False,
walltime=walltimes["large_mem"],
# Command to run before starting worker - i.e. conda activate <special_env>
walltime=walltimes["sharded_reproject"],
worker_init="",
),
),
HighThroughputExecutor(
label="sharded_reproject",
label="ckpt_48gb_8cpus",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-g2",
partition="ckpt-all",
account="astro",
min_blocks=0,
max_blocks=2,
max_blocks=50, # scale to the size of the GPU blocks, big number for low memory
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=32,
mem_per_node=128, # ~2-4 GB per core
mem_per_node=48, # 96 GB for >100, 48 for < 100
cores_per_node=8,
exclusive=False,
walltime=walltimes["sharded_reproject"],
worker_init="",
),
),
####################
# Search resources
####################
HighThroughputExecutor(
label="esci_96gb_2cpu_1gpu",
max_workers=1,
provider=SlurmProvider(
partition="gpu-a40",
account="escience",
min_blocks=0,
max_blocks=4, # low block count for shared resource
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=2, # perhaps should be 8???
mem_per_node=96, # 96 GB for >100, 48 for < 100
exclusive=False,
walltime=walltimes["gpu_max"],
worker_init="",
scheduler_options="#SBATCH --gpus=1",
),
),
HighThroughputExecutor(
label="esci_48gb_2cpu_1gpu",
max_workers=1,
provider=SlurmProvider(
partition="gpu-a40",
account="escience",
min_blocks=0,
max_blocks=4, # low block count for shared resource
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=2, # perhaps should be 8???
mem_per_node=48, # 96 GB for >100, 48 for < 100
exclusive=False,
walltime=walltimes["gpu_max"],
worker_init="",
scheduler_options="#SBATCH --gpus=1",
),
),
HighThroughputExecutor(
label="esci_32gb_2cpu_1gpu",
max_workers=1,
provider=SlurmProvider(
partition="gpu-a40",
account="escience",
min_blocks=0,
max_blocks=4, # low block count for shared resource
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=2, # perhaps should be 8???
mem_per_node=32, # 96 GB for >100, 48 for < 100
exclusive=False,
walltime=walltimes["gpu_max"],
worker_init="",
scheduler_options="#SBATCH --gpus=1",
),
),
HighThroughputExecutor(
label="ckpt_96gb_2cpu_1gpu",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-g2",
account="escience",
min_blocks=0,
max_blocks=50, # 20 for 96, 50 for 48
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=2, # perhaps should be 8???
mem_per_node=96, # 96 GB for >100, 48 for < 100
exclusive=False,
walltime=walltimes["gpu_max"],
# Command to run before starting worker - i.e. conda activate <special_env>
worker_init="",
scheduler_options="#SBATCH --gpus=1",
),
),
HighThroughputExecutor(
label="ckpt_48gb_2cpu_1gpu",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-g2",
account="escience",
min_blocks=0,
max_blocks=50, # 20 for 96, 50 for 48
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=2, # perhaps should be 8???
mem_per_node=48, # 96 GB for >100, 48 for < 100
exclusive=False,
walltime=walltimes["gpu_max"],
# Command to run before starting worker - i.e. conda activate <special_env>
worker_init="",
scheduler_options="#SBATCH --gpus=1",
),
),
HighThroughputExecutor(
label="gpu",
label="ckpt_32gb_2cpu_1gpu",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-g2",
account="escience",
min_blocks=0,
max_blocks=2,
max_blocks=50, # 20 for 96, 50 for 48
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=2, # perhaps should be 8???
mem_per_node=512, # In GB
mem_per_node=32, # 96 GB for >100, 48 for < 100
exclusive=False,
walltime=walltimes["gpu_max"],
# Command to run before starting worker - i.e. conda activate <special_env>
worker_init="",
scheduler_options="#SBATCH --gpus=1",
),
),

####################
# Analysis resource
####################
HighThroughputExecutor(
label="local_thread",
provider=LocalProvider(
label="astro_4gb_2cpus",
max_workers=1, # Do we mean max_workers_per_node here?
provider=SlurmProvider(
partition="compute-bigmem", # ckpt-all
account="astro", # astro
min_blocks=0,
max_blocks=12, # low block count for shared resource
init_blocks=0,
max_blocks=1,
parallelism=1,
nodes_per_block=1,
mem_per_node=4,
cores_per_node=2,
exclusive=False,
walltime=walltimes["sharded_reproject"],
# Command to run before starting worker - i.e. conda activate <special_env>
worker_init="",
),
),
HighThroughputExecutor(
label="esci_4gb_2cpus",
max_workers=1, # Do we mean max_workers_per_node here?
provider=SlurmProvider(
partition="gpu-a40", # ckpt-all
account="escience", # astro
min_blocks=0,
max_blocks=12, # low block count for shared resource
init_blocks=0,
parallelism=1,
nodes_per_block=1,
mem_per_node=4,
cores_per_node=2,
exclusive=False,
walltime=walltimes["sharded_reproject"],
# Command to run before starting worker - i.e. conda activate <special_env>
worker_init="",
),
),
HighThroughputExecutor(
label="ckpt_4gb_2cpus",
max_workers=1, # Do we mean max_workers_per_node here?
provider=SlurmProvider(
partition="ckpt-all", # ckpt-all
account="astro", # astro
min_blocks=0,
max_blocks=100, # can leave large at all times
init_blocks=0,
parallelism=1,
nodes_per_block=1,
mem_per_node=4,
cores_per_node=2,
exclusive=False,
walltime=walltimes["sharded_reproject"],
# Command to run before starting worker - i.e. conda activate <special_env>
worker_init="",
),
),
],
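A minimal usage sketch (not part of this diff), assuming the package is importable as `kbmod_wf` and that Parsl is installed: it shows how apps could be pinned to the executor labels defined above. The task bodies and file names are hypothetical placeholders; only the config function and the executor labels come from this file.

import parsl
from parsl import python_app

from kbmod_wf.resource_configs.klone_configuration import klone_resource_config

# Load the Klone Slurm configuration defined in this PR.
parsl.load(klone_resource_config())

@python_app(executors=["astro_96gb_8cpus"])
def resample(shard_path):
    # Hypothetical CPU-only resample/reproject step routed to a compute-bigmem executor.
    return f"resampled {shard_path}"

@python_app(executors=["ckpt_48gb_2cpu_1gpu"])
def search(workunit_path):
    # Hypothetical GPU search step routed to a checkpoint-partition, single-GPU executor.
    return f"searched {workunit_path}"

print(search("wu_000.fits").result())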