This repository will contain the code for the paper "Scalable Pipeline Parallel Training of Neural Networks using Reversibility".
This project depends on Singularity and Slurm.
- Build the [CUDA|ROCm] image
export SINGULARITY_TMPDIR=/tmp
singularity build --fakeroot unlimited_pp_[cuda|rocm].sif environment/unlimited_pp_[cuda|rocm].def

- Enqueue the job
export CONFIG=$PWD/config/env_mycluster.sh # Set your environment configuration
./scripts/exp_training/submit_all.sh # Submit training jobs
./scripts/exp_profiling/submit_all.sh # Submit profiling jobs

- Or run interactively
export PARTITION=your_partition_name
srun -N 1 -p $PARTITION --pty --gres=gpu:8 \
singularity shell [--nv|--rocm] \
--bind /lvs0/rccs-prt/dataset:/dataset \ # Bind dataset directory
./unlimited_pp_[cuda|rocm].sif
Singularity> torchrun --nnodes=1 --nproc_per_node=8 -m main --par-mode=pp --batch-size=1024 --microbatch-size=64 --num-microbatches=16 --exp-mode=profiling

pytorch_unlimited_pp/
├── checkpoints/ # Model checkpoints
├── config/ # Put your environment settings here (See `env_template.sh`)
│ ├── env_common.sh # Common environment settings
│ └── env_template.sh # Template
├── environment/ # Singularity settings
│ ├── unlimited_pp_cuda.def # for CUDA
│ └── unlimited_pp_rocm.def # for ROCm
├── exp/ # Experiment code
│ ├── actv_err/ # Activation error analysis
│ ├── profiling/ # Profiling using PyTorch Profiler
│ └── training/ # Training
├── logs/ # Log files
├── models/ # Model files
├── pipelining/ # Pipelining code derived from PyTorch 2.6.0
├── scripts/ # Experiment scripts
├── src/ # Source code
├── args.py # Argument parser
├── main.py # Template for the experiment
├── README.md # This file
└── LICENSE # License file

pipelining/ contains the pipelining code derived from PyTorch 2.6.0.
You can find the license file in the pipelining/ directory.