Commit eec5c69

Authored by: jeffreyjeffreywang, richardliaw, gemini-code-assist[bot], and nrghosh

[docs][data][llm] Introduce docs for serve deployment processor and cross-node parallelism (ray-project#57261)

Signed-off-by: jeffreyjeffreywang <[email protected]>
Signed-off-by: Richard Liaw <[email protected]>
Co-authored-by: jeffreyjeffreywang <[email protected]>
Co-authored-by: Richard Liaw <[email protected]>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Nikhil G <[email protected]>
1 parent 3cd8202 commit eec5c69

2 files changed: +176 additions, -21 deletions

doc/source/data/doc_code/working-with-llms/basic_llm_example.py

Lines changed: 131 additions & 0 deletions
@@ -197,4 +197,135 @@ def create_embedding_processor():

# __embedding_config_example_end__

# __shared_vllm_engine_config_example_start__
import ray
from ray import serve
from ray.data.llm import ServeDeploymentProcessorConfig, build_llm_processor
from ray.serve.llm import (
    LLMConfig,
    ModelLoadingConfig,
    build_llm_deployment,
)
from ray.serve.llm.openai_api_models import CompletionRequest

llm_config = LLMConfig(
    model_loading_config=ModelLoadingConfig(
        model_id="facebook/opt-1.3b",
        model_source="facebook/opt-1.3b",
    ),
    deployment_config=dict(
        name="demo_deployment_config",
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=1,
        ),
    ),
    engine_kwargs=dict(
        enable_prefix_caching=True,
        enable_chunked_prefill=True,
        max_num_batched_tokens=4096,
    ),
)

APP_NAME = "demo_app"
DEPLOYMENT_NAME = "demo_deployment"
override_serve_options = dict(name=DEPLOYMENT_NAME)

llm_app = build_llm_deployment(
    llm_config, override_serve_options=override_serve_options
)
app = serve.run(llm_app, name=APP_NAME)
config = ServeDeploymentProcessorConfig(
    deployment_name=DEPLOYMENT_NAME,
    app_name=APP_NAME,
    dtype_mapping={
        "CompletionRequest": CompletionRequest,
    },
    concurrency=1,
    batch_size=64,
)

processor1 = build_llm_processor(
    config,
    preprocess=lambda row: dict(
        method="completions",
        dtype="CompletionRequest",
        request_kwargs=dict(
            model="facebook/opt-1.3b",
            prompt=f"This is a prompt for {row['id']}",
            stream=False,
        ),
    ),
    postprocess=lambda row: dict(
        prompt=row["choices"][0]["text"],
    ),
)

processor2 = build_llm_processor(
    config,
    preprocess=lambda row: dict(
        method="completions",
        dtype="CompletionRequest",
        request_kwargs=dict(
            model="facebook/opt-1.3b",
            prompt=row["prompt"],
            stream=False,
        ),
    ),
    postprocess=lambda row: row,
)

ds = ray.data.range(10)
ds = processor2(processor1(ds))
print(ds.take_all())
# __shared_vllm_engine_config_example_end__
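
# Illustrative note, not part of the committed example: processor1 and processor2 are built
# from the same ServeDeploymentProcessorConfig, so both stages send requests to the single
# Serve-managed vLLM engine deployed above instead of starting one engine per processor.
# If the app isn't needed after batch processing, it can be torn down with serve.shutdown().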

# __cross_node_parallelism_config_example_start__
config = vLLMEngineProcessorConfig(
    model_source="unsloth/Llama-3.1-8B-Instruct",
    engine_kwargs={
        "enable_chunked_prefill": True,
        "max_num_batched_tokens": 4096,
        "max_model_len": 16384,
        "pipeline_parallel_size": 4,
        "tensor_parallel_size": 4,
        "distributed_executor_backend": "ray",
    },
    batch_size=32,
    concurrency=1,
)
# __cross_node_parallelism_config_example_end__
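
# Illustrative note, not part of the committed example: with pipeline_parallel_size=4 and
# tensor_parallel_size=4, each replica requests 4 * 4 = 16 GPUs, and setting
# distributed_executor_backend="ray" lets those engine workers be scheduled across nodes.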

# __custom_placement_group_strategy_config_example_start__
config = vLLMEngineProcessorConfig(
    model_source="unsloth/Llama-3.1-8B-Instruct",
    engine_kwargs={
        "enable_chunked_prefill": True,
        "max_num_batched_tokens": 4096,
        "max_model_len": 16384,
        "pipeline_parallel_size": 2,
        "tensor_parallel_size": 2,
        "distributed_executor_backend": "ray",
    },
    batch_size=32,
    concurrency=1,
    placement_group_config={
        "bundles": [{"GPU": 1}] * 4,
        "strategy": "STRICT_PACK",
    },
)
# __custom_placement_group_strategy_config_example_end__
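
# Illustrative note, not part of the committed example: the four {"GPU": 1} bundles cover
# the 2 * 2 = 4 engine workers, and STRICT_PACK keeps all of them on a single node; a
# looser strategy such as PACK or SPREAD would allow cross-node placement.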

# __concurrent_config_example_start__
config = vLLMEngineProcessorConfig(
    model_source="unsloth/Llama-3.1-8B-Instruct",
    engine_kwargs={
        "enable_chunked_prefill": True,
        "max_num_batched_tokens": 4096,
        "max_model_len": 16384,
    },
    concurrency=10,
    batch_size=64,
)
# __concurrent_config_example_end__
# __basic_llm_example_end__
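
The sketch below isn't part of the commit; it only illustrates the GPU arithmetic the new docs describe (one GPU per engine worker, tensor_parallel_size * pipeline_parallel_size workers per replica, and concurrency replicas in parallel). The helper name is hypothetical.

# Hypothetical helper, for illustration only.
def estimate_total_gpus(
    tensor_parallel_size: int, pipeline_parallel_size: int, concurrency: int
) -> int:
    # Each replica spans TP * PP single-GPU workers; `concurrency` replicas run in parallel.
    return tensor_parallel_size * pipeline_parallel_size * concurrency


# Cross-node example: TP=4, PP=4, concurrency=1 -> 16 GPUs for the single replica.
assert estimate_total_gpus(4, 4, 1) == 16
# Concurrency example: concurrency=10 with vLLM's default TP=1 and PP=1 -> 10 GPUs in total.
assert estimate_total_gpus(1, 1, 10) == 10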

doc/source/data/working-with-llms.rst

Lines changed: 45 additions & 21 deletions
@@ -205,6 +205,51 @@ You can also make calls to deployed models that have an OpenAI compatible API en
   :start-after: __openai_example_start__
   :end-before: __openai_example_end__

Batch inference with Serve deployments
--------------------------------------

You can configure any :ref:`serve deployment <converting-to-ray-serve-application>` for batch inference. This is particularly useful for multi-turn conversations
because you can share a single vLLM engine across conversations. To achieve this, create an :ref:`LLM serve deployment <serving-llms>` and use
the :class:`ServeDeploymentProcessorConfig <ray.data.llm.ServeDeploymentProcessorConfig>` class to configure the processor.

.. literalinclude:: doc_code/working-with-llms/basic_llm_example.py
   :language: python
   :start-after: __shared_vllm_engine_config_example_start__
   :end-before: __shared_vllm_engine_config_example_end__

Cross-node parallelism
----------------------

Ray Data LLM supports cross-node parallelism, including tensor parallelism and pipeline parallelism.
You can configure the parallelism level through the ``engine_kwargs`` argument in
:class:`vLLMEngineProcessorConfig <ray.data.llm.vLLMEngineProcessorConfig>`. Use ``ray`` as the
distributed executor backend to enable cross-node parallelism.

.. literalinclude:: doc_code/working-with-llms/basic_llm_example.py
   :language: python
   :start-after: __cross_node_parallelism_config_example_start__
   :end-before: __cross_node_parallelism_config_example_end__

In addition, you can customize the placement group strategy to control how Ray places vLLM engine workers across nodes.
While you can specify the degree of tensor and pipeline parallelism, the vLLM engine manages the specific assignment of model ranks to GPUs, so you can't configure it directly through the Ray Data LLM API.

.. literalinclude:: doc_code/working-with-llms/basic_llm_example.py
   :language: python
   :start-after: __custom_placement_group_strategy_config_example_start__
   :end-before: __custom_placement_group_strategy_config_example_end__

Besides cross-node parallelism, you can also horizontally scale the LLM stage across multiple nodes.
Configure the number of replicas with the ``concurrency`` argument in
:class:`vLLMEngineProcessorConfig <ray.data.llm.vLLMEngineProcessorConfig>`.

.. literalinclude:: doc_code/working-with-llms/basic_llm_example.py
   :language: python
   :start-after: __concurrent_config_example_start__
   :end-before: __concurrent_config_example_end__

Usage Data Collection
--------------------------

@@ -227,27 +272,6 @@ to turn it off.
Frequently Asked Questions (FAQs)
--------------------------------------------------

-.. TODO(#55491): Rewrite this section once the restriction is lifted.
-.. TODO(#55405): Cross-node TP in progress.
-.. _cross_node_parallelism:
-
-How to configure LLM stage to parallelize across multiple nodes?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-At the moment, Ray Data LLM doesn't support cross-node parallelism (either
-tensor parallelism or pipeline parallelism).
-
-The processing pipeline is designed to run on a single node. The number of
-GPUs is calculated as the product of the tensor parallel size and the pipeline
-parallel size, and apply
-[`STRICT_PACK` strategy](https://docs.ray.io/en/latest/ray-core/scheduling/placement-group.html#pgroup-strategy)
-to ensure that each replica of the LLM stage is executed on a single node.
-
-Nevertheless, you can still horizontally scale the LLM stage to multiple nodes
-as long as each replica (TP * PP) fits into a single node. The number of
-replicas is configured by the `concurrency` argument in
-:class:`vLLMEngineProcessorConfig <ray.data.llm.vLLMEngineProcessorConfig>`.
-
.. _gpu_memory_management:

GPU Memory Management and CUDA OOM Prevention
