Skip to content

Commit 89e5fda

Browse files
authored
feat(scheduling): support assigning multiple GPUs per worker instance (bentoml#3950)
* feat(scheduling): support assigning multiple GPUs per worker instance Signed-off-by: aarnphm-ec2-dev <[email protected]> * tests: add test cases for fractional GPU Signed-off-by: aarnphm-ec2-dev <[email protected]> * fix(strategy): arithmetic counting Signed-off-by: aarnphm-ec2-dev <[email protected]> * chore(logs): add an error log about the exception Signed-off-by: aarnphm-ec2-dev <[email protected]> * fix(strategy): rounding the assigned resources Signed-off-by: Aaron <[email protected]> * chore: update grammar of the exception message --------- Signed-off-by: aarnphm-ec2-dev <[email protected]> Signed-off-by: Aaron <[email protected]>
1 parent 5122f91 commit 89e5fda

File tree

4 files changed

+109
-32
lines changed

4 files changed

+109
-32
lines changed

src/bentoml/_internal/configuration/v1/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,9 @@
150150
# NOTE: there is a distinction between being unset and None here; if set to 'None'
151151
# in configuration for a specific runner, it will override the global configuration.
152152
s.Optional("resources"): s.Or({s.Optional(str): object}, lambda s: s == "system", None), # type: ignore (incomplete schema typing)
153-
s.Optional("workers_per_resource"): s.And(int, ensure_larger_than_zero),
153+
s.Optional("workers_per_resource"): s.And(
154+
s.Or(int, float), ensure_larger_than_zero
155+
),
154156
s.Optional("logging"): {
155157
s.Optional("access"): {
156158
s.Optional("enabled"): bool,

src/bentoml/_internal/runner/runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ def __getattr__(self, item: str) -> t.Any:
130130

131131
runner_methods: list[RunnerMethod[t.Any, t.Any, t.Any]]
132132
scheduling_strategy: type[Strategy]
133-
workers_per_resource: int = 1
133+
workers_per_resource: int | float = 1
134134
runnable_init_params: dict[str, t.Any] = attr.field(
135135
default=None, converter=attr.converters.default_if_none(factory=dict)
136136
)

src/bentoml/_internal/runner/strategy.py

Lines changed: 50 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
11
from __future__ import annotations
22

33
import abc
4+
import logging
45
import math
56
import typing as t
6-
import logging
77

8+
from ..resource import get_resource, system_resources
89
from .runnable import Runnable
9-
from ..resource import get_resource
10-
from ..resource import system_resources
1110

1211
logger = logging.getLogger(__name__)
1312

@@ -18,8 +17,8 @@ class Strategy(abc.ABC):
1817
def get_worker_count(
1918
cls,
2019
runnable_class: t.Type[Runnable],
21-
resource_request: dict[str, t.Any],
22-
workers_per_resource: int,
20+
resource_request: dict[str, t.Any] | None,
21+
workers_per_resource: int | float,
2322
) -> int:
2423
...
2524

@@ -28,19 +27,15 @@ def get_worker_count(
2827
def get_worker_env(
2928
cls,
3029
runnable_class: t.Type[Runnable],
31-
resource_request: dict[str, t.Any],
32-
workers_per_resource: int,
30+
resource_request: dict[str, t.Any] | None,
31+
workers_per_resource: int | float,
3332
worker_index: int,
3433
) -> dict[str, t.Any]:
3534
"""
36-
Parameters
37-
----------
38-
runnable_class : type[Runnable]
39-
The runnable class to be run.
40-
resource_request : dict[str, Any]
41-
The resource request of the runnable.
42-
worker_index : int
43-
The index of the worker, start from 0.
35+
Args:
36+
runnable_class : The runnable class to be run.
37+
resource_request : The resource request of the runnable.
38+
worker_index : The index of the worker, start from 0.
4439
"""
4540
...
4641

@@ -66,7 +61,7 @@ def get_worker_count(
6661
cls,
6762
runnable_class: t.Type[Runnable],
6863
resource_request: dict[str, t.Any] | None,
69-
workers_per_resource: int,
64+
workers_per_resource: int | float,
7065
) -> int:
7166
if resource_request is None:
7267
resource_request = system_resources()
@@ -78,7 +73,7 @@ def get_worker_count(
7873
and len(nvidia_gpus) > 0
7974
and "nvidia.com/gpu" in runnable_class.SUPPORTED_RESOURCES
8075
):
81-
return len(nvidia_gpus) * workers_per_resource
76+
return math.ceil(len(nvidia_gpus) * workers_per_resource)
8277

8378
# use CPU
8479
cpus = get_resource(resource_request, "cpu")
@@ -90,6 +85,10 @@ def get_worker_count(
9085
)
9186

9287
if runnable_class.SUPPORTS_CPU_MULTI_THREADING:
88+
if isinstance(workers_per_resource, float):
89+
raise ValueError(
90+
"Fractional CPU multi threading support is not yet supported."
91+
)
9392
return workers_per_resource
9493

9594
return math.ceil(cpus) * workers_per_resource
@@ -105,31 +104,53 @@ def get_worker_env(
105104
cls,
106105
runnable_class: t.Type[Runnable],
107106
resource_request: dict[str, t.Any] | None,
108-
workers_per_resource: int,
107+
workers_per_resource: int | float,
109108
worker_index: int,
110109
) -> dict[str, t.Any]:
111110
"""
112-
Parameters
113-
----------
114-
runnable_class : type[Runnable]
115-
The runnable class to be run.
116-
resource_request : dict[str, Any]
117-
The resource request of the runnable.
118-
worker_index : int
119-
The index of the worker, start from 0.
111+
Args:
112+
runnable_class : The runnable class to be run.
113+
resource_request : The resource request of the runnable.
114+
worker_index : The index of the worker, start from 0.
120115
"""
121116
environ: dict[str, t.Any] = {}
122117
if resource_request is None:
123118
resource_request = system_resources()
124-
125119
# use nvidia gpu
126-
nvidia_gpus = get_resource(resource_request, "nvidia.com/gpu")
120+
nvidia_gpus: list[int] | None = get_resource(resource_request, "nvidia.com/gpu")
127121
if (
128122
nvidia_gpus is not None
129123
and len(nvidia_gpus) > 0
130124
and "nvidia.com/gpu" in runnable_class.SUPPORTED_RESOURCES
131125
):
132-
dev = str(nvidia_gpus[worker_index // workers_per_resource])
126+
if isinstance(workers_per_resource, float):
127+
# NOTE: We hit this branch when workers_per_resource is set to
128+
# float, for example 0.5 or 0.25
129+
if workers_per_resource > 1:
130+
raise ValueError(
131+
"Currently, the default strategy doesn't support workers_per_resource > 1. It is recommended that one should implement a custom strategy in this case."
132+
)
133+
# We round the assigned resource here. This means if workers_per_resource=.4
134+
# then it will round down to 2. If workers_per_resource=0.6, then it will also round up to 2.
135+
assigned_resource_per_worker = round(1 / workers_per_resource)
136+
if len(nvidia_gpus) < assigned_resource_per_worker:
137+
logger.warning(
138+
"Failed to allocate %s GPUs for %s (number of available GPUs < assigned workers per resource [%s])",
139+
nvidia_gpus,
140+
worker_index,
141+
assigned_resource_per_worker,
142+
)
143+
raise IndexError(
144+
f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}]."
145+
)
146+
assigned_gpu = nvidia_gpus[
147+
assigned_resource_per_worker
148+
* worker_index : assigned_resource_per_worker
149+
* (worker_index + 1)
150+
]
151+
dev = ",".join(map(str, assigned_gpu))
152+
else:
153+
dev = str(nvidia_gpus[worker_index // workers_per_resource])
133154
environ["CUDA_VISIBLE_DEVICES"] = dev
134155
logger.info(
135156
"Environ for worker %s: set CUDA_VISIBLE_DEVICES to %s",

tests/unit/_internal/runner/test_strategy.py

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
from _pytest.monkeypatch import MonkeyPatch
99

1010
import bentoml
11-
from bentoml._internal.runner import strategy
1211
from bentoml._internal.resource import get_resource
12+
from bentoml._internal.runner import strategy
1313
from bentoml._internal.runner.strategy import DefaultStrategy
1414

1515

@@ -51,6 +51,29 @@ def test_default_gpu_strategy(monkeypatch: MonkeyPatch):
5151
== 4
5252
)
5353

54+
assert (
55+
DefaultStrategy.get_worker_count(GPURunnable, {"nvidia.com/gpu": [2, 7]}, 0.5)
56+
== 1
57+
)
58+
assert (
59+
DefaultStrategy.get_worker_count(
60+
GPURunnable, {"nvidia.com/gpu": [2, 7, 9]}, 0.5
61+
)
62+
== 2
63+
)
64+
assert (
65+
DefaultStrategy.get_worker_count(
66+
GPURunnable, {"nvidia.com/gpu": [2, 7, 8, 9]}, 0.5
67+
)
68+
== 2
69+
)
70+
assert (
71+
DefaultStrategy.get_worker_count(
72+
GPURunnable, {"nvidia.com/gpu": [2, 5, 7, 8, 9]}, 0.4
73+
)
74+
== 2
75+
)
76+
5477
envs = DefaultStrategy.get_worker_env(GPURunnable, {"nvidia.com/gpu": 2}, 1, 0)
5578
assert envs.get("CUDA_VISIBLE_DEVICES") == "0"
5679
envs = DefaultStrategy.get_worker_env(GPURunnable, {"nvidia.com/gpu": 2}, 1, 1)
@@ -69,6 +92,37 @@ def test_default_gpu_strategy(monkeypatch: MonkeyPatch):
6992
envs = DefaultStrategy.get_worker_env(GPURunnable, {"nvidia.com/gpu": [2, 7]}, 2, 2)
7093
assert envs.get("CUDA_VISIBLE_DEVICES") == "7"
7194

95+
envs = DefaultStrategy.get_worker_env(
96+
GPURunnable, {"nvidia.com/gpu": [2, 7]}, 0.5, 0
97+
)
98+
assert envs.get("CUDA_VISIBLE_DEVICES") == "2,7"
99+
100+
envs = DefaultStrategy.get_worker_env(
101+
GPURunnable, {"nvidia.com/gpu": [2, 7, 8, 9]}, 0.5, 0
102+
)
103+
assert envs.get("CUDA_VISIBLE_DEVICES") == "2,7"
104+
envs = DefaultStrategy.get_worker_env(
105+
GPURunnable, {"nvidia.com/gpu": [2, 7, 8, 9]}, 0.5, 1
106+
)
107+
assert envs.get("CUDA_VISIBLE_DEVICES") == "8,9"
108+
envs = DefaultStrategy.get_worker_env(
109+
GPURunnable, {"nvidia.com/gpu": [2, 7, 8, 9]}, 0.25, 0
110+
)
111+
assert envs.get("CUDA_VISIBLE_DEVICES") == "2,7,8,9"
112+
113+
envs = DefaultStrategy.get_worker_env(
114+
GPURunnable, {"nvidia.com/gpu": [2, 6, 7, 8, 9]}, 0.4, 0
115+
)
116+
assert envs.get("CUDA_VISIBLE_DEVICES") == "2,6"
117+
envs = DefaultStrategy.get_worker_env(
118+
GPURunnable, {"nvidia.com/gpu": [2, 6, 7, 8, 9]}, 0.4, 1
119+
)
120+
assert envs.get("CUDA_VISIBLE_DEVICES") == "7,8"
121+
envs = DefaultStrategy.get_worker_env(
122+
GPURunnable, {"nvidia.com/gpu": [2, 6, 7, 8, 9]}, 0.4, 2
123+
)
124+
assert envs.get("CUDA_VISIBLE_DEVICES") == "9"
125+
72126

73127
def test_default_cpu_strategy(monkeypatch: MonkeyPatch):
74128
monkeypatch.setattr(strategy, "get_resource", unvalidated_get_resource)

0 commit comments

Comments
 (0)