Commit f29494e

[Feature] pass policy-factory in mp data collectors
ghstack-source-id: 3af3a995c48e0eb6ce1736a587b565fa1ac758c4
Pull Request resolved: #2859
1 parent 886745d commit f29494e

File tree

6 files changed: +279 −9 lines changed
+122 (new file)
@@ -0,0 +1,122 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Updating MPS weights in multiprocess/distributed data collectors
+================================================================
+
+Overview of the Script
+----------------------
+
+This script demonstrates a multiprocess weight update in TorchRL. It uses a custom
+`MPSRemoteWeightUpdater` class to update the weights of a policy network across multiple workers.
+
+Key Features
+------------
+
+- Multi-Worker Setup: The script creates two worker processes that collect data from a Gym environment
+  ("Pendulum-v1") using a policy network.
+- MPS (Metal Performance Shaders) Device: The policy network is placed on an MPS device.
+- Custom Weight Updater: The `MPSRemoteWeightUpdater` class is used to update the policy weights across workers. This
+  class is necessary because MPS tensors cannot be sent over a pipe due to serialization/pickling issues in PyTorch.
+
+Workaround for MPS Tensor Serialization Issue
+---------------------------------------------
+
+In PyTorch, MPS tensors cannot be serialized or pickled, which means they cannot be sent over a pipe or shared between
+processes. To work around this issue, the `MPSRemoteWeightUpdater` class sends the policy weights on the CPU device
+instead of the MPS device. The local workers then copy the weights from the CPU device to the MPS device.
+
+Script Flow
+-----------
+
+1. Initialize the environment, policy network, and collector.
+2. Update the policy weights using the `MPSRemoteWeightUpdater`.
+3. Collect data from the environment using the policy network.
+4. Zero out the policy weights after a few iterations.
+5. Verify that the updated policy weights are being used by checking the actions generated by the policy network.
+
+"""
+
+import tensordict
+import torch
+from tensordict import TensorDictBase
+from tensordict.nn import TensorDictModule
+from torch import nn
+from torchrl.collectors import MultiSyncDataCollector, RemoteWeightUpdaterBase
+
+from torchrl.envs.libs.gym import GymEnv
+
+
+class MPSRemoteWeightUpdater(RemoteWeightUpdaterBase):
+    def __init__(self, policy_weights, num_workers):
+        # Weights are on the mps device, which cannot be shared across processes
+        self.policy_weights = policy_weights.data
+        self.num_workers = num_workers
+
+    def _sync_weights_with_worker(
+        self, worker_id: int | torch.device, server_weights: TensorDictBase
+    ) -> TensorDictBase:
+        # Send weights on cpu - the local workers will do the cpu->mps copy
+        self.collector.pipes[worker_id].send((server_weights, "update"))
+        val, msg = self.collector.pipes[worker_id].recv()
+        assert msg == "updated"
+        return server_weights
+
+    def _get_server_weights(self) -> TensorDictBase:
+        print((self.policy_weights == 0).all())
+        return self.policy_weights.cpu()
+
+    def _maybe_map_weights(self, server_weights: TensorDictBase) -> TensorDictBase:
+        print((server_weights == 0).all())
+        return server_weights
+
+    def all_worker_ids(self) -> list[int] | list[torch.device]:
+        return list(range(self.num_workers))
+
+
+if __name__ == "__main__":
+    device = "mps"
+
+    def env_maker():
+        return GymEnv("Pendulum-v1", device="cpu")
+
+    def policy_factory(device=device):
+        return TensorDictModule(
+            nn.Linear(3, 1), in_keys=["observation"], out_keys=["action"]
+        ).to(device=device)
+
+    policy = policy_factory()
+    policy_weights = tensordict.from_module(policy)
+
+    collector = MultiSyncDataCollector(
+        create_env_fn=[env_maker, env_maker],
+        policy_factory=policy_factory,
+        total_frames=2000,
+        max_frames_per_traj=50,
+        frames_per_batch=200,
+        init_random_frames=-1,
+        reset_at_each_iter=False,
+        device=device,
+        storing_device="cpu",
+        remote_weights_updater=MPSRemoteWeightUpdater(policy_weights, 2),
+        # use_buffers=False,
+        # cat_results="stack",
+    )
+
+    collector.update_policy_weights_()
+    try:
+        for i, data in enumerate(collector):
+            if i == 2:
+                print(data)
+                assert (data["action"] != 0).any()
+                # zero the policy
+                policy_weights.data.zero_()
+                collector.update_policy_weights_()
+            elif i == 3:
+                assert (data["action"] == 0).all(), data["action"]
+                break
+    finally:
+        collector.shutdown()
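
The handoff described in the comments above is easiest to see in isolation. Below is a minimal, self-contained sketch of the same idea outside TorchRL; the `worker` function and the bare `Pipe` usage are illustrative, not the library's internals, while the ("update"/"updated") message pair mirrors the protocol in the diff. Server weights stay on CPU for transport, and the CPU-to-device copy happens on the worker side; the sketch falls back to CPU when MPS is unavailable.

import torch
from torch import multiprocessing as mp
from torch import nn


def worker(pipe, device):
    # The worker owns its policy; only CPU tensors ever cross the pipe.
    policy = nn.Linear(3, 1).to(device)
    weights, msg = pipe.recv()
    assert msg == "update"
    with torch.no_grad():
        for p, w in zip(policy.parameters(), weights):
            p.copy_(w.to(device))  # cpu -> device copy happens worker-side
    pipe.send("updated")


if __name__ == "__main__":
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    parent_pipe, child_pipe = mp.Pipe()
    proc = mp.Process(target=worker, args=(child_pipe, device))
    proc.start()
    server_policy = nn.Linear(3, 1)  # server-side copy of the weights, kept on CPU
    parent_pipe.send(([p.detach().cpu() for p in server_policy.parameters()], "update"))
    assert parent_pipe.recv() == "updated"
    proc.join()

Keeping only CPU tensors on the pipe is what makes the exchange picklable regardless of the compute device, which is the whole point of the workaround.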

test/test_collector.py

+69 −1
@@ -39,7 +39,11 @@
     prod,
     seed_generator,
 )
-from torchrl.collectors import aSyncDataCollector, SyncDataCollector
+from torchrl.collectors import (
+    aSyncDataCollector,
+    RemoteWeightUpdaterBase,
+    SyncDataCollector,
+)
 from torchrl.collectors.collectors import (
     _Interruptor,
     MultiaSyncDataCollector,
@@ -146,6 +150,7 @@
 PYTHON_3_10 = sys.version_info.major == 3 and sys.version_info.minor == 10
 PYTHON_3_7 = sys.version_info.major == 3 and sys.version_info.minor == 7
 TORCH_VERSION = version.parse(version.parse(torch.__version__).base_version)
+_has_cuda = torch.cuda.is_available()


 class WrappablePolicy(nn.Module):
@@ -3476,6 +3481,69 @@ def __deepcopy_error__(*args, **kwargs):
     raise RuntimeError("deepcopy not allowed")


+class TestPolicyFactory:
+    class MPSRemoteWeightUpdater(RemoteWeightUpdaterBase):
+        def __init__(self, policy_weights, num_workers):
+            # Weights are on the mps device, which cannot be shared
+            self.policy_weights = policy_weights.data
+            self.num_workers = num_workers
+
+        def _sync_weights_with_worker(
+            self, worker_id: int | torch.device, server_weights: TensorDictBase
+        ) -> TensorDictBase:
+            # Send weights on cpu - the local workers will do the cpu->mps copy
+            self.collector.pipes[worker_id].send((server_weights, "update"))
+            val, msg = self.collector.pipes[worker_id].recv()
+            assert msg == "updated"
+            return server_weights
+
+        def _get_server_weights(self) -> TensorDictBase:
+            return self.policy_weights.cpu()
+
+        def _maybe_map_weights(self, server_weights: TensorDictBase) -> TensorDictBase:
+            return server_weights
+
+        def all_worker_ids(self) -> list[int] | list[torch.device]:
+            return list(range(self.num_workers))
+
+    @pytest.mark.skipif(not _has_cuda, reason="requires a cuda device other than CPU.")
+    def test_weight_update(self):
+        device = "cuda:0"
+        env_maker = lambda: GymEnv("Pendulum-v1", device="cpu")
+        policy_factory = lambda: TensorDictModule(
+            nn.Linear(3, 1), in_keys=["observation"], out_keys=["action"]
+        ).to(device)
+        policy = policy_factory()
+        policy_weights = TensorDict.from_module(policy)
+
+        collector = MultiSyncDataCollector(
+            create_env_fn=[env_maker, env_maker],
+            policy_factory=policy_factory,
+            total_frames=2000,
+            max_frames_per_traj=50,
+            frames_per_batch=200,
+            init_random_frames=-1,
+            reset_at_each_iter=False,
+            device=device,
+            storing_device="cpu",
+            remote_weights_updater=self.MPSRemoteWeightUpdater(policy_weights, 2),
+        )
+
+        collector.update_policy_weights_()
+        try:
+            for i, data in enumerate(collector):
+                if i == 2:
+                    assert (data["action"] != 0).any()
+                    # zero the policy
+                    policy_weights.data.zero_()
+                    collector.update_policy_weights_()
+                elif i == 3:
+                    assert (data["action"] == 0).all(), data["action"]
+                    break
+        finally:
+            collector.shutdown()
+
+
 if __name__ == "__main__":
     args, unknown = argparse.ArgumentParser().parse_known_args()
     pytest.main([__file__, "--capture", "no", "--exitfirst"] + unknown)
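
To exercise only the new test (assuming a CUDA machine, since the skipif guard above skips it otherwise, and the repository's usual test dependencies), the same pytest.main entry point can be narrowed with a keyword filter; the invocation below is a sketch, not part of the commit:

import pytest

# Select only test_weight_update; -x stops at the first failure.
pytest.main(["test/test_collector.py", "-k", "test_weight_update", "-x"])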

torchrl/collectors/collectors.py

+33 −5
@@ -152,8 +152,28 @@ class DataCollectorBase(IterableDataset, metaclass=abc.ABCMeta):
     trust_policy: bool
     compiled_policy: bool
     cudagraphed_policy: bool
-    local_weights_updater: LocalWeightUpdaterBase | None = None
-    remote_weights_updater: RemoteWeightUpdaterBase | None = None
+    _local_weights_updater: LocalWeightUpdaterBase | None = None
+    _remote_weights_updater: RemoteWeightUpdaterBase | None = None
+
+    @property
+    def local_weights_updater(self) -> LocalWeightUpdaterBase:
+        return self._local_weights_updater
+
+    @local_weights_updater.setter
+    def local_weights_updater(self, value: LocalWeightUpdaterBase | None):
+        if value is not None:
+            value.register_collector(self)
+        self._local_weights_updater = value
+
+    @property
+    def remote_weights_updater(self) -> RemoteWeightUpdaterBase:
+        return self._remote_weights_updater
+
+    @remote_weights_updater.setter
+    def remote_weights_updater(self, value: RemoteWeightUpdaterBase | None):
+        if value is not None:
+            value.register_collector(self)
+        self._remote_weights_updater = value

     def _get_policy_and_device(
         self,
@@ -1515,7 +1535,7 @@ def __repr__(self) -> str:
                 f"\nexploration={self.exploration_type})"
             )
             return string
-        except AttributeError:
+        except Exception:
            return f"{type(self).__name__}(not_init)"
@@ -1831,6 +1851,7 @@ def __init__(
         self.local_weights_updater = local_weights_updater

         self.policy = policy
+        self.policy_factory = policy_factory

         remainder = 0
         if total_frames is None or total_frames < 0:
@@ -2012,6 +2033,10 @@ def _run_processes(self) -> None:
                 env_fun = CloudpickleWrapper(env_fun)

             # Create a policy on the right device
+            policy_factory = self.policy_factory
+            if policy_factory is not None:
+                policy_factory = CloudpickleWrapper(policy_factory)
+
             policy_device = self.policy_device[i]
             storing_device = self.storing_device[i]
             env_device = self.env_device[i]
@@ -2020,13 +2045,14 @@
             # This makes sure that a given set of shared weights for a given device are
             # shared for all policies that rely on that device.
             policy = self.policy
-            policy_weights = self._policy_weights_dict[policy_device]
+            policy_weights = self._policy_weights_dict.get(policy_device)
             if policy is not None and policy_weights is not None:
                 cm = policy_weights.to_module(policy)
             else:
                 cm = contextlib.nullcontext()
             with cm:
                 kwargs = {
+                    "policy_factory": policy_factory,
                     "pipe_parent": pipe_parent,
                     "pipe_child": pipe_child,
                     "queue_out": queue_out,
@@ -3107,6 +3133,7 @@ def _main_async_collector(
     compile_policy: bool = False,
     cudagraph_policy: bool = False,
     no_cuda_sync: bool = False,
+    policy_factory: Callable | None = None,
 ) -> None:
     pipe_parent.close()
     # init variables that will be cleared when closing
@@ -3116,6 +3143,7 @@
         create_env_fn,
         create_env_kwargs=create_env_kwargs,
         policy=policy,
+        policy_factory=policy_factory,
         total_frames=-1,
         max_frames_per_traj=max_frames_per_traj,
         frames_per_batch=frames_per_batch,
@@ -3278,7 +3306,7 @@ def cast_tensor(x, MPS_ERROR=MPS_ERROR):
             continue

         elif msg == "update":
-            inner_collector.update_policy_weights_()
+            inner_collector.update_policy_weights_(policy_weights=data_in)
             pipe_child.send((j, "updated"))
             has_timed_out = False
             continue
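
The CloudpickleWrapper indirection above exists because policy factories are often lambdas or closures (as in the test), which the standard pickler rejects, while the policy itself may live on a device whose tensors cannot be pickled at all. The core pattern is sketched below with plain multiprocessing and illustrative names, not TorchRL internals: the callable crosses the process boundary and the policy is only ever built worker-side.

import multiprocessing as mp

import torch
from torch import nn


def policy_factory():
    # Device resolution happens in whichever process calls the factory.
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    return nn.Linear(3, 1).to(device)


def worker(factory, out_queue):
    policy = factory()  # built locally; no policy tensors were pickled
    out_queue.put(next(policy.parameters()).device.type)


if __name__ == "__main__":
    queue = mp.Queue()
    proc = mp.Process(target=worker, args=(policy_factory, queue))
    proc.start()
    print("worker-side policy device:", queue.get())
    proc.join()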

torchrl/collectors/weight_update.py

+42 −1
@@ -5,8 +5,9 @@
 from __future__ import annotations

 import abc
+import weakref
 from abc import abstractmethod
-from typing import Callable, TypeVar
+from typing import Any, Callable, TypeVar

 import torch
 from tensordict import TensorDictBase
@@ -44,6 +45,25 @@ class LocalWeightUpdaterBase(metaclass=abc.ABCMeta):

     """

+    _collector_wr: Any = None
+
+    def register_collector(self, collector: DataCollectorBase):  # noqa
+        """Register a collector in the updater.
+
+        Once registered, the updater will not accept another collector.
+
+        Args:
+            collector (DataCollectorBase): The collector to register.
+
+        """
+        if self._collector_wr is not None:
+            raise RuntimeError("Cannot register collector twice.")
+        self._collector_wr = weakref.ref(collector)
+
+    @property
+    def collector(self) -> torchrl.collectors.DataCollectorBase:  # noqa
+        return self._collector_wr() if self._collector_wr is not None else None
+
     @abstractmethod
     def _get_server_weights(self) -> TensorDictBase:
         ...
@@ -104,12 +124,33 @@ class RemoteWeightUpdaterBase(metaclass=abc.ABCMeta):

     Methods:
         update_weights: Updates the weights on specified or all remote workers.
+        register_collector: Registers a collector. This should be called automatically by the collector
+            upon registration of the updater.

     .. seealso:: :class:`~torchrl.collectors.LocalWeightsUpdaterBase` and
         :meth:`~torchrl.collectors.DataCollectorBase.update_policy_weights_`.

     """

+    _collector_wr: Any = None
+
+    def register_collector(self, collector: DataCollectorBase):  # noqa
+        """Register a collector in the updater.
+
+        Once registered, the updater will not accept another collector.
+
+        Args:
+            collector (DataCollectorBase): The collector to register.
+
+        """
+        if self._collector_wr is not None:
+            raise RuntimeError("Cannot register collector twice.")
+        self._collector_wr = weakref.ref(collector)
+
+    @property
+    def collector(self) -> DataCollectorBase:
+        return self._collector_wr() if self._collector_wr is not None else None
+
     @abstractmethod
     def _sync_weights_with_worker(
         self, worker_id: int | torch.device, server_weights: TensorDictBase
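
Read together with the collector-side setters in collectors.py, the registration mechanism reduces to a small pattern. The sketch below uses illustrative names rather than the TorchRL classes; it shows the one-shot weakref wiring, why the updater never keeps its collector alive, and why registering the same updater twice raises.

import weakref


class Updater:
    _collector_wr = None

    def register_collector(self, collector):
        if self._collector_wr is not None:
            raise RuntimeError("Cannot register collector twice.")
        # Weak reference: the updater never keeps its collector alive.
        self._collector_wr = weakref.ref(collector)

    @property
    def collector(self):
        return self._collector_wr() if self._collector_wr is not None else None


class Collector:
    _remote_weights_updater = None

    @property
    def remote_weights_updater(self):
        return self._remote_weights_updater

    @remote_weights_updater.setter
    def remote_weights_updater(self, value):
        if value is not None:
            value.register_collector(self)  # back-reference set on assignment
        self._remote_weights_updater = value


collector, updater = Collector(), Updater()
collector.remote_weights_updater = updater
assert updater.collector is collector  # the weakref resolves to the owner
try:
    Collector().remote_weights_updater = updater
except RuntimeError as err:
    print(err)  # Cannot register collector twice.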
