
Commit 3851d53

add ENABLE_EXPERT_PARALLEL engine arg for MoE models (#239)
* enable expert parallel arg for moe models
* add ENABLE_EXPERT_PARALLEL to hub config
1 parent c896438 commit 3851d53

File tree

3 files changed

+12
-0
lines changed


.runpod/hub.json

Lines changed: 10 additions & 0 deletions

@@ -929,6 +929,16 @@
       "advanced": true
     }
   },
+  {
+    "key": "ENABLE_EXPERT_PARALLEL",
+    "input": {
+      "name": "Enable Expert Parallel",
+      "type": "boolean",
+      "description": "Enable Expert Parallel for MoE models",
+      "default": false,
+      "advanced": true
+    }
+  },
   {
     "key": "MODEL_REVISION",
     "input": {

docs/configuration.md

Lines changed: 1 addition & 0 deletions

@@ -85,6 +85,7 @@ Complete guide to all environment variables and configuration options for worker
 | `ENFORCE_EAGER` | False | `bool` | Always use eager-mode PyTorch. If False(`0`), will use eager mode and CUDA graph in hybrid for maximal performance and flexibility. |
 | `MAX_SEQ_LEN_TO_CAPTURE` | `8192` | `int` | Maximum context length covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. |
 | `DISABLE_CUSTOM_ALL_REDUCE` | `0` | `int` | Enables or disables custom all reduce. |
+| `ENABLE_EXPERT_PARALLEL` | `False` | `bool` | Enable Expert Parallel for MoE models |

 ## Tokenizer Settings
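Note that the flag is read from the environment as a string, so only the literal value `true` (case-insensitive) enables it. A minimal sketch of that parsing behavior (the `env_bool` helper is hypothetical, not part of this repo; it mirrors the expression used in `src/engine_args.py`):

```python
import os

def env_bool(name: str, default: str = "False") -> bool:
    # Mirrors the parsing style in src/engine_args.py: the flag is on
    # only when the variable equals "true", case-insensitively.
    return os.getenv(name, default).lower() == "true"

os.environ["ENABLE_EXPERT_PARALLEL"] = "True"
print(env_bool("ENABLE_EXPERT_PARALLEL"))  # True

# Caveat: "1" is not recognized as truthy under this scheme.
os.environ["ENABLE_EXPERT_PARALLEL"] = "1"
print(env_bool("ENABLE_EXPERT_PARALLEL"))  # False
```

So deployments should set `ENABLE_EXPERT_PARALLEL=true` rather than `ENABLE_EXPERT_PARALLEL=1`.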

src/engine_args.py

Lines changed: 1 addition & 0 deletions

@@ -80,6 +80,7 @@
     "guided_decoding_backend": os.getenv('GUIDED_DECODING_BACKEND', 'outlines'),
     "speculative_model": os.getenv('SPECULATIVE_MODEL', None),
     "speculative_draft_tensor_parallel_size": int(os.getenv('SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE', 0)) or None,
+    "enable_expert_parallel": bool(os.getenv('ENABLE_EXPERT_PARALLEL', 'False').lower() == 'true'),
     "num_speculative_tokens": int(os.getenv('NUM_SPECULATIVE_TOKENS', 0)) or None,
     "speculative_max_model_len": int(os.getenv('SPECULATIVE_MAX_MODEL_LEN', 0)) or None,
     "speculative_disable_by_batch_size": int(os.getenv('SPECULATIVE_DISABLE_BY_BATCH_SIZE', 0)) or None,
