
Commit 06a8c86

Committed on Mar 20, 2025
[Feature] Tokenizer for LLMEnv
ghstack-source-id: 429b8d03baa4ce0201451312f8d17de9d024fca8
Pull Request resolved: #2852

1 parent: 619fec6 · commit: 06a8c86

File tree: 5 files changed, +438, -178 lines

test/test_env.py  (+74, -38)

@@ -14,6 +14,7 @@
 import re
 import string
 from collections import defaultdict
+from contextlib import nullcontext
 from functools import partial
 from sys import platform
 from typing import Any, Optional
@@ -33,7 +34,7 @@
     TensorDictBase,
 )
 from tensordict.nn import TensorDictModuleBase
-from tensordict.tensorclass import NonTensorStack, TensorClass
+from tensordict.tensorclass import NonTensorData, NonTensorStack, TensorClass
 from tensordict.utils import _unravel_key_to_tuple
 from torch import nn

@@ -4630,6 +4631,7 @@ def __next__(self):
             else:
                 return tensors

+    @pytest.mark.skipif(not _has_transformers, reason="test requires transformers")
     @pytest.mark.parametrize(
         "str2str,stack_method",
         [
@@ -4674,22 +4676,36 @@ def test_llm_env(self, str2str, batched, stack_method, device, batch_size):
         else:
             env.check_env_specs(break_when_any_done="both")

+    @pytest.mark.skipif(not _has_transformers, reason="test requires transformers")
+    @pytest.mark.parametrize("tokenizer", [True, False])
     @pytest.mark.parametrize(
-        "str2str,stack_method",
+        "str2str,no_stack,stack_method",
         [
-            [True, None],
-            [False, "as_padded_tensor"],
-            # TODO: a bit experimental, fails with check_env_specs
-            # [False, "as_nested_tensor"],
-            [False, None],
+            [True, True, None],
+            [True, False, None],
+            [False, False, "as_padded_tensor"],
+            [False, False, None],
         ],
     )
     @pytest.mark.parametrize("batched", [True, False])
     @pytest.mark.parametrize("device", [None, "cpu"])
     @pytest.mark.parametrize("batch_size", [0, 4])
     def test_llm_from_dataloader(
-        self, str2str, batched, stack_method, device, batch_size
+        self,
+        str2str,
+        batched,
+        stack_method,
+        device,
+        batch_size,
+        tokenizer,
+        no_stack,
     ):
+        from transformers import AutoTokenizer
+
+        if tokenizer:
+            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        else:
+            tokenizer = None
         if str2str:
             kwargs = {
                 "dataloader": self.DummyDataLoader(batch_size=batch_size),
@@ -4712,7 +4728,8 @@ def test_llm_from_dataloader(
                 "str2str": str2str,
                 "device": device,
                 "has_attention": False,
-                "no_stack": False,
+                "no_stack": no_stack,
+                "tokenizer": tokenizer,
             }
         )
         env = LLMEnv.from_dataloader(**kwargs)
@@ -4725,12 +4742,17 @@
         if batch_size > 0:

             def policy(td):
-                if str2str:
+                if str2str and tokenizer is None:
                     if not td.shape:
-                        td[LLMEnv._DEFAULT_ACTION_STR_KEY] = "<nothing>"
+                        td[LLMEnv._DEFAULT_ACTION_STR_KEY] = NonTensorData(
+                            "<nothing>", device=device
+                        )
                     else:
                         td[LLMEnv._DEFAULT_ACTION_STR_KEY] = NonTensorStack(
-                            *["<nothing>" for _ in range(td.shape[0])]
+                            *[
+                                NonTensorData("<nothing>", device=device)
+                                for _ in range(td.shape[0])
+                            ]
                         )
                 else:
                     td[LLMEnv._DEFAULT_ACTION_TOKENS_KEY] = torch.ones(
@@ -4742,34 +4764,48 @@ def policy(td):
             # Tell the env that we want 3 sub-envs
             r = env.rollout(10, policy, tensordict=TensorDict(batch_size=[3]))
             assert r.ndim == 2
-            if str2str:
+            if str2str and tokenizer is None:
                 assert isinstance(r[0, 0][LLMEnv._DEFAULT_STR_KEY], str)
                 assert isinstance(r[0, 1][LLMEnv._DEFAULT_STR_KEY], str)
-                assert (
-                    r[0, 0][LLMEnv._DEFAULT_STR_KEY]
-                    == r[0, 1][LLMEnv._DEFAULT_STR_KEY][
-                        : -len(r[0, 0][LLMEnv._DEFAULT_ACTION_STR_KEY])
-                    ]
-                )
-                assert (
-                    r[0, 1][LLMEnv._DEFAULT_STR_KEY]
-                    == r[0, 2][LLMEnv._DEFAULT_STR_KEY][
-                        : -len(r[0, 1][LLMEnv._DEFAULT_ACTION_STR_KEY])
-                    ]
-                )
-                assert (
-                    r[-1, 0][LLMEnv._DEFAULT_STR_KEY]
-                    == r[-1, 1][LLMEnv._DEFAULT_STR_KEY][
-                        : -len(r[-1, 0][LLMEnv._DEFAULT_ACTION_STR_KEY])
-                    ]
-                )
-                assert (
-                    r[-1, 1][LLMEnv._DEFAULT_STR_KEY]
-                    == r[-1, 2][LLMEnv._DEFAULT_STR_KEY][
-                        : -len(r[-1, 1][LLMEnv._DEFAULT_ACTION_STR_KEY])
-                    ]
-                )
-            else:
+                should_fail = no_stack
+                if should_fail:
+                    ctx = pytest.raises(AssertionError)
+                else:
+                    ctx = nullcontext()
+                with ctx:
+                    assert (
+                        r[0, 0][LLMEnv._DEFAULT_STR_KEY]
+                        == r[0, 1][LLMEnv._DEFAULT_STR_KEY][
+                            : -len(r[0, 0][LLMEnv._DEFAULT_ACTION_STR_KEY])
+                        ]
+                    ), (
+                        r[0, 0][LLMEnv._DEFAULT_STR_KEY],
+                        r[0, 0][LLMEnv._DEFAULT_ACTION_STR_KEY],
+                        r[0, 0]["next", LLMEnv._DEFAULT_STR_KEY],
+                        r[0, 1][LLMEnv._DEFAULT_STR_KEY],
+                    )
+                with ctx:
+                    assert (
+                        r[0, 1][LLMEnv._DEFAULT_STR_KEY]
+                        == r[0, 2][LLMEnv._DEFAULT_STR_KEY][
+                            : -len(r[0, 1][LLMEnv._DEFAULT_ACTION_STR_KEY])
+                        ]
+                    )
+                with ctx:
+                    assert (
+                        r[-1, 0][LLMEnv._DEFAULT_STR_KEY]
+                        == r[-1, 1][LLMEnv._DEFAULT_STR_KEY][
+                            : -len(r[-1, 0][LLMEnv._DEFAULT_ACTION_STR_KEY])
+                        ]
+                    )
+                with ctx:
+                    assert (
+                        r[-1, 1][LLMEnv._DEFAULT_STR_KEY]
+                        == r[-1, 2][LLMEnv._DEFAULT_STR_KEY][
+                            : -len(r[-1, 1][LLMEnv._DEFAULT_ACTION_STR_KEY])
+                        ]
+                    )
+            elif tokenizer is None:
                 assert (
                     r[0, 0][LLMEnv._DEFAULT_TOKEN_KEY]
                     == r[0, 1][LLMEnv._DEFAULT_TOKEN_KEY][:-1]
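Outside of pytest, the code path exercised by the new `tokenizer` parametrization boils down to the sketch below. It assumes a dataloader that yields batches of strings (the test's DummyDataLoader plays that role; `string_dataloader` here is a placeholder) and reuses the keyword arguments visible in the diff above, with the usual torchrl import path for LLMEnv.

    from transformers import AutoTokenizer
    from torchrl.envs import LLMEnv

    # `string_dataloader` is a placeholder for any dataloader yielding string batches.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    env = LLMEnv.from_dataloader(
        dataloader=string_dataloader,
        tokenizer=tokenizer,   # new in this commit
        str2str=True,          # rollouts then carry both text and tokens
        no_stack=False,
        has_attention=False,
    )
    env.check_env_specs(break_when_any_done="both")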

torchrl/envs/custom/llm.py  (+109, -27)

@@ -8,10 +8,18 @@

 import torch

-from tensordict import NestedKey, TensorDict, TensorDictBase, unravel_key
+from tensordict import (
+    is_leaf_nontensor,
+    NestedKey,
+    TensorDict,
+    TensorDictBase,
+    unravel_key,
+)
 from tensordict.tensorclass import NonTensorData, NonTensorStack
 from tensordict.utils import _zip_strict
 from torch.utils.data import DataLoader
+
+from torchrl._utils import _replace_last
 from torchrl.data.map.hash import SipHash
 from torchrl.data.tensor_specs import (
     Bounded,
@@ -38,7 +46,8 @@ class LLMEnv(EnvBase):

     Users must append a transform to set the "done" condition, which would trigger the loading of the next prompt.

-    Prompts to the language model can be loaded when the environment is ``reset`` if the environment is created via :meth:`~from_dataloader`
+    Prompts to the language model can be loaded when the environment is ``reset`` if the environment is created via
+    :meth:`~from_dataloader`.

     Keyword Args:
         token_key (NestedKey, optional): The key in the tensordict where the tokens are stored (when `str2str=False`).
@@ -145,12 +154,19 @@ def __init__(
             self.full_observation_spec_unbatched = Composite(
                 {
                     self.str_key: NonTensor(
-                        example_data="a string", batched=True, shape=()
+                        example_data="a string",
+                        batched=True,
+                        shape=(),
+                        device=device,
                     )
                 }
             )
             self.full_action_spec_unbatched = Composite(
-                {action_key: NonTensor(example_data="a string", batched=True, shape=())}
+                {
+                    action_key: NonTensor(
+                        example_data="a string", batched=True, shape=(), device=device
+                    )
+                }
             )
         else:
             if vocab_size is None:
@@ -208,27 +224,28 @@ def __init__(
         if not self.assign_done:
             # Use single done
             self.full_done_spec_unbatched = Composite(
-                done=Unbounded(shape=(1,), dtype=torch.bool),
-                terminated=Unbounded(shape=(1,), dtype=torch.bool),
+                done=Unbounded(shape=(1,), dtype=torch.bool, device=device),
+                terminated=Unbounded(shape=(1,), dtype=torch.bool, device=device),
             )
         elif self.str2str:
             raise STR2STR_ERR
         else:
             # Use single done
             self.full_done_spec_unbatched = Composite(
                 tokens_data=Composite(
-                    done=Unbounded(shape=(-1,), dtype=torch.bool),
-                    terminated=Unbounded(shape=(-1,), dtype=torch.bool),
+                    done=Unbounded(shape=(-1,), dtype=torch.bool, device=device),
+                    terminated=Unbounded(shape=(-1,), dtype=torch.bool, device=device),
                 ),
-                done=Unbounded(shape=(1,), dtype=torch.bool),
-                terminated=Unbounded(shape=(1,), dtype=torch.bool),
+                done=Unbounded(shape=(1,), dtype=torch.bool, device=device),
+                terminated=Unbounded(shape=(1,), dtype=torch.bool, device=device),
             )

     @classmethod
     def from_dataloader(
         cls,
         dataloader: DataLoader,
         *,
+        tokenizer: transformers.PretrainedTokenizerBase | None = None,  # noqa
         token_key: NestedKey | None = None,
         str_key: NestedKey | None = None,
         attention_key: NestedKey | None = None,
@@ -257,6 +274,18 @@ def from_dataloader(

         Args:
             dataloader (DataLoader): The dataloader to load data from.
+
+        Keyword Args:
+            tokenizer (transformers.PretrainedTokenizerBase or str, optional): the tokenizer to use. If ``None``,
+                "bert-base-uncased" will be used by default. If a string is provided, it should be the name of a
+                pre-trained tokenizer.
+
+                .. note:: Using the `tokenizer` will append a :class:`~torchrl.envs.Tokenizer` transform to the environment.
+                    If `str2str` is set to `True`, the tokenizer will be called during every iteration and the rollout
+                    will contain both tokens and text data.
+                    If `str2str` is set to `False`, the tokenizer will be called during reset only, and the only
+                    text data in the rollout will be the text sampled from the dataset.
+
             token_key (NestedKey, optional): The key in the tensordict where the tokens are stored (when `str2str=False`).
                 Defaults to ``("tokens_in", "input_ids")``.
             str_key (NestedKey, optional): The key in the tensordict where the string input is stored (when `str2str=True`).
@@ -305,19 +334,54 @@ def from_dataloader(
         Returns:
             LLMEnv: The created LLMEnv instance.
         """
-        from torchrl.envs import DataLoadingPrimer
+        from torchrl.envs import DataLoadingPrimer, Tokenizer
+
+        if str_key is None:
+            str_key = LLMEnv._DEFAULT_STR_KEY
+        if token_key is None:
+            token_key = LLMEnv._DEFAULT_TOKEN_KEY
+        if attention_key is None:
+            attention_key = LLMEnv._DEFAULT_ATTENTION_KEY
+        elif tokenizer is not None and attention_key != _replace_last(
+            token_key, "attention_mask"
+        ):
+            raise ValueError(
+                "When using the Tokenizer, attention key must match `(*token_key[:-1], 'attention_mask')` where "
+                f"`token_key` is a tuple-typed nested key. Got attention_key={attention_key} while expecting "
+                f"{_replace_last(token_key, 'attention_mask')}."
+            )
+
+        if tokenizer is not None:
+            if str2str:
+                # In this case, the tokenizer is appended to the env after each step
+                if action_key is None:
+                    action_key = cls._DEFAULT_ACTION_STR_KEY
+                tokenizer_transform = Tokenizer(
+                    tokenizer=tokenizer,
+                    in_keys=[str_key],
+                    out_keys=[token_key],
+                    # Assume that the tokens are named according to _DEFAULT_ACTION_TOKENS_KEY
+                    in_keys_inv=[action_key],
+                    out_keys_inv=[cls._DEFAULT_ACTION_TOKENS_KEY],
+                    call_before_reset=False,
+                    # We should always see the required entries
+                    missing_tolerance=False,
+                )
+            else:
+                # In this case, the tokenizer acts before reset and that's all
+                tokenizer_transform = Tokenizer(
+                    tokenizer=tokenizer,
+                    in_keys=[str_key],
+                    out_keys=[token_key],
+                    call_before_reset=True,
+                    missing_tolerance=True,
+                )

         if data_keys is None:
             if str2str:
-                if str_key is None:
-                    data_keys = [LLMEnv._DEFAULT_STR_KEY]
-                else:
-                    data_keys = [str_key]
+                data_keys = [str_key]
             else:
-                if token_key is None:
-                    data_keys = [LLMEnv._DEFAULT_TOKEN_KEY]
-                else:
-                    data_keys = [token_key]
+                data_keys = [token_key]
             if has_attention:
                 if attention_key is None:
                     data_keys.append(LLMEnv._DEFAULT_ATTENTION_KEY)
@@ -332,6 +396,7 @@ def from_dataloader(
             example_data=example_data,
             stack_method=stack_method,
             repeats=repeats,
+            device=device,
         )
         env = LLMEnv(
             str2str=str2str,
@@ -349,15 +414,17 @@ def from_dataloader(
             has_attention=has_attention,
             as_llm_data=as_llm_data,
         )
+        if tokenizer is not None:
+            env = env.append_transform(tokenizer_transform)
         return env.append_transform(primer)

     @staticmethod
-    def _check_obs_act_and_cat(obs, action):
+    def _check_obs_act_and_cat(obs, action, *, device):
         if not isinstance(obs, str):
             raise TypeError(f"Observation must be a string, got {type(obs)}.")
         if not isinstance(action, str):
             raise TypeError(f"Action must be a string, got {type(action)}.")
-        return obs + action
+        return NonTensorData(obs + action, device=device)

     def _step(
         self,
@@ -409,10 +476,11 @@ def _make_next_obs(
         self, tensordict: TensorDictBase, nex_td: TensorDictBase
     ) -> TensorDictBase:
         if self.no_stack:
-            if self.str2str:
-                raise NotImplementedError
             action = tensordict.get(self.action_key)
-            nex_td.set(self.token_key, action)
+            if self.str2str:
+                nex_td.set(self.str_key, action)
+            else:
+                nex_td.set(self.token_key, action)
             if self.has_attention:
                 attention_mask = tensordict.get(self.attention_key)
                 n = action.shape[-1] - attention_mask.shape[-1]
@@ -438,11 +506,13 @@ def _make_next_obs(
                         "The tensordict is batchless, yet the action and/or observations are not "
                         f"strings but {type(action)} and {type(obs)}, respectivly."
                     )
-                observation = self._check_obs_act_and_cat(obs, action)
+                observation = self._check_obs_act_and_cat(
+                    obs, action, device=self.device
+                )
             else:
                 observation = NonTensorStack(
                     *[
-                        self._check_obs_act_and_cat(_obs, _action)
+                        self._check_obs_act_and_cat(_obs, _action, device=self.device)
                         for (_obs, _action) in _zip_strict(obs, action)
                     ]
                 )
@@ -463,6 +533,12 @@ def _make_next_obs(
                     )
                 else:
                     observation = torch.cat([obs, action], -1)
+                    if self.has_attention:
+                        attention_mask = tensordict.get(self.attention_key)
+                        attention_mask = torch.cat(
+                            [attention_mask, attention_mask.new_ones(action.shape)], -1
+                        )
+                        nex_td.set(self.attention_key, attention_mask)
             except TypeError:
                 raise TypeError(
                     "Failed to cat action and observation tensors. Check that str2str argument is correctly "
@@ -484,10 +560,16 @@ def check_str():

         if tensordict is None or check_token() or check_str():
             raise KeyError(
-                f"Observation key {self.token_key} is not defined. Make sure a TensorDictPrimer (eg, "
+                f"Observation key {self.token_key}/{self.str_key} is not defined in tensordict with keys "
+                f"{list(tensordict.keys(True, True, is_leaf=is_leaf_nontensor))}. Make sure a TensorDictPrimer (eg, "
                 f"torchrl.envs.DataLoadingPrimer) is appended to the env transforms."
             )
         td_reset = tensordict.copy()
+        if td_reset.device != self.device:
+            if self.device is None:
+                td_reset.clear_device_()
+            else:
+                td_reset = td_reset.to(self.device)
         tensordict = self._maybe_make_done(tensordict, td_reset)
         if self.as_llm_data:
             raise NotImplementedError()
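The new attention-key check above encodes a simple convention: when a tokenizer is supplied, the attention mask must sit next to the tokens. A small worked example of `_replace_last` with an illustrative nested key:

    from torchrl._utils import _replace_last

    token_key = ("tokens_in", "input_ids")
    _replace_last(token_key, "attention_mask")  # -> ("tokens_in", "attention_mask")
    # Supplying a tokenizer together with, say, attention_key=("masks", "attention_mask")
    # would therefore trigger the ValueError added in from_dataloader.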

torchrl/envs/transforms/llm.py  (+23, -10)

@@ -10,13 +10,7 @@
 from typing import Any, Callable, Iterable, Literal

 import torch
-from tensordict import (
-    maybe_dense_stack,
-    NestedKey,
-    TensorDict,
-    TensorDictBase,
-    unravel_key,
-)
+from tensordict import lazy_stack, NestedKey, TensorDict, TensorDictBase, unravel_key
 from tensordict.nn import ProbabilisticTensorDictModule, TensorDictParams
 from tensordict.utils import _zip_strict, is_seq_of_nested_key
 from torch import nn
@@ -364,6 +358,7 @@ def __init__(
         use_buffer: bool | None = None,
         auto_batch_size: bool = True,
         repeats: int | None = None,
+        device: torch.device | None = None,
     ):
         self.dataloader = dataloader
         if repeats is None:
@@ -385,7 +380,7 @@ def __init__(
         self.endless_dataloader = self._endless_iter(self.dataloader)

         if stack_method is None:
-            stack_method = maybe_dense_stack
+            stack_method = lazy_stack
         elif stack_method == "as_nested_tensor":
             stack_method = as_nested_tensor
         elif stack_method == "as_padded_tensor":
@@ -424,6 +419,7 @@ def __init__(
             expand_specs=None,
             single_default_value=True,
             call_before_env_reset=True,
+            device=device,
         )
         self._reset_key = "_reset"

@@ -432,10 +428,14 @@ def _endless_iter(self, obj):
         while True:
             yield from obj

+    # def _reset_env_preprocess(self, tensordict: TensorDictBase) -> TensorDictBase:
+    #     td = super()._reset_env_preprocess(tensordict)
+    #     return lazy_stack(list(td.unbind(0)))
+    #
     def _load_from_dataloader(self, reset: torch.Tensor | None = None):
         """Loads a single element from the dataloader, or alternatively from the buffer.

-        If `reset` is passed, the one element per reset will be loaded.
+        If `reset` is passed, then one element per reset will be loaded.
         """
         if reset is not None:
             if not reset.any():
@@ -444,8 +444,16 @@ def _load_from_dataloader(self, reset: torch.Tensor | None = None):
             loaded = [self._load_from_dataloader() for i in range(reset.sum())]
             return self.stack_method(loaded)

+        primers = getattr(self, "primers", None)
+        if primers is not None:
+            device = self.primers.device
+        else:
+            device = None
+
         if self.use_buffer and len(self._queue) > 0:
             result = self._queue.popleft()
+            if result.device != device:
+                result = result.to(device)
             return result

         data = next(self.endless_dataloader)
@@ -454,7 +462,10 @@
         # TODO: one could rename the keys too
         if isinstance(data, Mapping):
             out = TensorDict.from_dict(
-                data, auto_batch_size=self.auto_batch_size, batch_dims=1
+                data,
+                auto_batch_size=self.auto_batch_size,
+                batch_dims=1,
+                device=device,
             )
         elif self.data_keys is None:
             raise RuntimeError(
@@ -467,12 +478,14 @@ def _load_from_dataloader(self, reset: torch.Tensor | None = None):
                 {k: val for k, val in _zip_strict(self.data_keys, data)},
                 auto_batch_size=self.auto_batch_size,
                 batch_dims=1,
+                device=device,
             )
         elif len(self.data_keys) == 1:
             out = TensorDict.from_dict(
                 {self.data_keys[0]: data},
                 auto_batch_size=self.auto_batch_size,
                 batch_dims=1,
+                device=device,
             )
         else:
             raise ValueError(
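As a usage note, the primer can now be pinned to a device at construction time, and the TensorDicts it loads (including buffered ones) are moved there. A minimal sketch with a placeholder dataloader and an illustrative data key, mirroring the call made by LLMEnv.from_dataloader:

    from torchrl.envs import DataLoadingPrimer

    primer = DataLoadingPrimer(
        dataloader=my_dataloader,                # placeholder: any dataloader
        data_keys=[("tokens_in", "input_ids")],  # illustrative key
        stack_method=None,                       # falls back to lazy_stack after this commit
        device="cpu",                            # new: loaded data is cast to this device
    )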

torchrl/envs/transforms/transforms.py  (+175, -72)

@@ -5313,8 +5313,8 @@ class Tokenizer(UnaryTransform):

     def __init__(
         self,
-        in_keys: Sequence[NestedKey],
-        out_keys: Sequence[NestedKey],
+        in_keys: Sequence[NestedKey] | None = None,
+        out_keys: Sequence[NestedKey] | None = None,
         in_keys_inv: Sequence[NestedKey] | None = None,
         out_keys_inv: Sequence[NestedKey] | None = None,
         *,
@@ -5325,6 +5325,9 @@ def __init__(
         add_special_tokens: bool = False,
         padding: bool = True,
         max_length: int | None = None,
+        return_attention_mask: bool = True,
+        missing_tolerance: bool = True,
+        call_before_reset: bool = False,
     ):
         if tokenizer is None:
             from transformers import AutoTokenizer
@@ -5340,6 +5343,8 @@ def __init__(
         self.skip_special_tokens = skip_special_tokens
         self.padding = padding
         self.max_length = max_length
+        self.return_attention_mask = return_attention_mask
+        self.call_before_reset = call_before_reset
         if additional_tokens:
             self.tokenizer.add_tokens(additional_tokens)
         super().__init__(
@@ -5351,6 +5356,7 @@ def __init__(
             inv_fn=self.call_tokenizer_inv_fn,
             use_raw_nontensor=use_raw_nontensor,
         )
+        self._missing_tolerance = missing_tolerance

     @property
     def device(self):
@@ -5363,6 +5369,68 @@ def device(self):
         self._device = device
         return device

+    def _call(self, next_tensordict: TensorDictBase) -> TensorDictBase:
+        # Specialized for attention mask
+        for in_key, out_key in _zip_strict(self.in_keys, self.out_keys):
+            value = next_tensordict.get(in_key, default=None)
+            if value is not None:
+                observation = self._apply_transform(value)
+                if self.return_attention_mask:
+                    observation, attention_mask = observation
+                    next_tensordict.set(
+                        _replace_last(out_key, "attention_mask"),
+                        attention_mask,
+                    )
+                next_tensordict.set(
+                    out_key,
+                    observation,
+                )
+            elif (
+                self.missing_tolerance
+                and self.return_attention_mask
+                and out_key in next_tensordict.keys(True)
+            ):
+                attention_key = _replace_last(out_key, "attention_mask")
+                if attention_key not in next_tensordict:
+                    next_tensordict[attention_key] = torch.ones_like(
+                        next_tensordict.get(out_key)
+                    )
+            elif not self.missing_tolerance:
+                raise KeyError(
+                    f"{self}: '{in_key}' not found in tensordict {next_tensordict}"
+                )
+        return next_tensordict
+
+    @dispatch(source="in_keys", dest="out_keys")
+    def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
+        for in_key, out_key in _zip_strict(self.in_keys, self.out_keys):
+            data = tensordict.get(in_key, None)
+            if data is not None:
+                data = self._apply_transform(data)
+                if self.return_attention_mask:
+                    data, attention_mask = data
+                    tensordict.set(
+                        _replace_last(out_key, "attention_mask"),
+                        attention_mask,
+                    )
+                tensordict.set(out_key, data)
+            elif not self.missing_tolerance:
+                raise KeyError(f"'{in_key}' not found in tensordict {tensordict}")
+        return tensordict
+
+    def _reset_env_preprocess(self, tensordict: TensorDictBase) -> TensorDictBase:
+        if self.call_before_reset:
+            with _set_missing_tolerance(self, True):
+                tensordict = self._call(tensordict)
+        return tensordict
+
+    def _reset(
+        self, tensordict: TensorDictBase, tensordict_reset: TensorDictBase
+    ) -> TensorDictBase:
+        if self.call_before_reset:
+            return tensordict_reset
+        return super()._reset(tensordict, tensordict_reset)
+
     def call_tokenizer_fn(self, value: str | list[str]):
         device = self.device
         kwargs = {"add_special_tokens": self.add_special_tokens}
@@ -5372,19 +5440,25 @@ def call_tokenizer_fn(self, value: str | list[str]):
         if isinstance(value, str):
             out = self.tokenizer.encode(value, return_tensors="pt", **kwargs)[0]
             # TODO: incorporate attention mask
-            # attention_mask = torch.ones_like(out, dtype=torch.bool)
+            if self.return_attention_mask:
+                attention_mask = torch.ones_like(out, dtype=torch.int64)
         else:
             kwargs["padding"] = (
                 self.padding if self.max_length is None else "max_length"
             )
-            # kwargs["return_attention_mask"] = False
+            kwargs["return_attention_mask"] = self.return_attention_mask
             # kwargs["return_token_type_ids"] = False
             out = self.tokenizer.batch_encode_plus(value, return_tensors="pt", **kwargs)
-            # attention_mask = out["attention_mask"]
+            if self.return_attention_mask:
+                attention_mask = out["attention_mask"]
             out = out["input_ids"]

         if device is not None and out.device != device:
             out = out.to(device)
+            if self.return_attention_mask:
+                attention_mask = attention_mask.to(device)
+        if self.return_attention_mask:
+            return out, attention_mask
         return out

     def call_tokenizer_inv_fn(self, value: Tensor):
@@ -5396,81 +5470,110 @@ def call_tokenizer_inv_fn(self, value: Tensor):
         out = self.tokenizer.batch_decode(
             value, skip_special_tokens=self.skip_special_tokens
         )
+        device = self._str_device
         if isinstance(out, list):
-            return NonTensorStack(*out)
-        return NonTensorData(out)
+            result = NonTensorStack(*out)
+            if device:
+                result = result.to(device)
+            return result
+        return NonTensorData(out, device=device)
+
+    @property
+    def _str_device(self):
+        parent = self.parent
+        if parent is None:
+            return None
+        if self.in_keys:
+            in_key = self.in_keys[0]
+        elif self.in_keys_inv:
+            in_key = self.in_keys_inv[0]
+        else:
+            return None
+        if in_key in parent.observation_keys:
+            return parent.full_observation_spec[in_key].device
+        if in_key in parent.action_keys:
+            return parent.full_action_spec[in_key].device
+        if in_key in parent.state_keys:
+            return parent.full_state_spec[in_key].device
+        return None

     def transform_input_spec(self, input_spec: Composite) -> Composite:
-        input_spec = super().transform_input_spec(input_spec)
         # We need to cap the spec to generate valid random strings
-        for out_key in self.out_keys_inv:
-            if out_key in input_spec["full_state_spec"].keys(True, True):
-                new_shape = input_spec["full_state_spec"][out_key].shape
-                if self.max_length is None:
-                    # Then we can't tell what the shape will be
-                    new_shape = new_shape[:-1] + torch.Size((-1,))
-                input_spec["full_state_spec"][out_key] = Bounded(
-                    0,
-                    self.tokenizer.vocab_size,
-                    shape=new_shape,
-                    device=input_spec["full_state_spec"][out_key].device,
-                    dtype=input_spec["full_state_spec"][out_key].dtype,
-                )
-            elif out_key in input_spec["full_action_spec"].keys(True, True):
-                new_shape = input_spec["full_action_spec"][out_key].shape
-                if self.max_length is None:
-                    # Then we can't tell what the shape will be
-                    new_shape = new_shape[:-1] + torch.Size((-1,))
-                input_spec["full_action_spec"][out_key] = Bounded(
-                    0,
-                    self.tokenizer.vocab_size,
-                    shape=new_shape,
-                    device=input_spec["full_action_spec"][out_key].device,
-                    dtype=input_spec["full_action_spec"][out_key].dtype,
+        for in_key, out_key in _zip_strict(self.in_keys_inv, self.out_keys_inv):
+            if in_key in input_spec["full_state_spec"].keys(True, True):
+                spec = input_spec["full_state_spec"]
+            elif in_key in input_spec["full_action_spec"].keys(False, True):
+                spec = input_spec["full_action_spec"]
+            else:
+                raise KeyError(
+                    f"The input keys {in_key} wasn't found in the env input specs."
                 )
+            local_spec = spec.pop(in_key)
+            local_dtype = local_spec.dtype
+            if local_dtype is None or local_dtype.is_floating_point:
+                local_dtype = torch.int64
+            new_shape = spec.shape
+            if self.max_length is None:
+                # Then we can't tell what the shape will be
+                new_shape = new_shape + torch.Size((-1,))
+            else:
+                new_shape = new_shape + torch.Size((self.max_length,))
+            spec[out_key] = Bounded(
+                0,
+                self.tokenizer.vocab_size,
+                shape=new_shape,
+                device=local_spec.device,
+                dtype=local_dtype,
+            )
         return input_spec

-    def transform_output_spec(self, output_spec: Composite) -> Composite:
-        output_spec = super().transform_output_spec(output_spec)
-        # We need to cap the spec to generate valid random strings
-        for out_key in self.out_keys:
-            if out_key in output_spec["full_observation_spec"].keys(True, True):
-                new_shape = output_spec["full_observation_spec"][out_key].shape
-                if self.max_length is None:
-                    # Then we can't tell what the shape will be
-                    new_shape = new_shape[:-1] + torch.Size((-1,))
-                output_spec["full_observation_spec"][out_key] = Bounded(
-                    0,
-                    self.tokenizer.vocab_size,
-                    shape=new_shape,
-                    device=output_spec["full_observation_spec"][out_key].device,
-                    dtype=output_spec["full_observation_spec"][out_key].dtype,
-                )
-            elif out_key in output_spec["full_reward_spec"].keys(True, True):
-                new_shape = output_spec["full_reward_spec"][out_key].shape
-                if self.max_length is None:
-                    # Then we can't tell what the shape will be
-                    new_shape = new_shape[:-1] + torch.Size((-1,))
-                output_spec["full_reward_spec"][out_key] = Bounded(
-                    0,
-                    self.tokenizer.vocab_size,
-                    shape=new_shape,
-                    device=output_spec["full_reward_spec"][out_key].device,
-                    dtype=output_spec["full_reward_spec"][out_key].dtype,
-                )
-            elif out_key in output_spec["full_done_spec"].keys(True, True):
-                new_shape = output_spec["full_done_spec"][out_key].shape
-                if self.max_length is None:
-                    # Then we can't tell what the shape will be
-                    new_shape = new_shape[:-1] + torch.Size((-1,))
-                output_spec["full_done_spec"][out_key] = Bounded(
+    transform_output_spec = Transform.transform_output_spec
+    transform_reward_spec = Transform.transform_reward_spec
+    transform_done_spec = Transform.transform_done_spec
+
+    def transform_observation_spec(self, observation_spec: TensorSpec) -> TensorSpec:
+        attention_mask_keys = set()
+        for in_key, out_key in _zip_strict(self.in_keys, self.out_keys):
+            new_shape = observation_spec.shape + torch.Size((-1,))
+            try:
+                in_spec = observation_spec[in_key]
+                obs_dtype = in_spec.dtype
+                device = in_spec.device
+            except KeyError:
+                # In some cases (eg, the tokenizer is applied during reset on data that
+                # originates from a dataloader) we don't have an in_spec
+                in_spec = None
+                obs_dtype = None
+                device = observation_spec.device
+            if obs_dtype is None or obs_dtype.is_floating_point:
+                obs_dtype = torch.int64
+            observation_spec[out_key] = Bounded(
+                0,
+                self.tokenizer.vocab_size,
+                shape=new_shape,
+                device=device,
+                dtype=obs_dtype,
+            )
+            if self.return_attention_mask:
+                attention_mask_key = _replace_last(out_key, "attention_mask")
+                if attention_mask_key in attention_mask_keys:
+                    raise KeyError(
+                        "Conflicting attention_mask keys. Make sure the token tensors are "
+                        "nested at different places in the tensordict such that `(*root, 'attention_mask')` "
+                        "entries are unique."
+                    )
+                attention_mask_keys.add(attention_mask_key)
+                attention_dtype = obs_dtype
+                if attention_dtype is None or attention_dtype.is_floating_point:
+                    attention_dtype = torch.int64
+                observation_spec[attention_mask_key] = Bounded(
                     0,
-                    self.tokenizer.vocab_size,
+                    2,
                     shape=new_shape,
-                    device=output_spec["full_done_spec"][out_key].device,
-                    dtype=output_spec["full_done_spec"][out_key].dtype,
+                    device=device,
+                    dtype=attention_dtype,
                 )
-        return output_spec
+        return observation_spec


 class Stack(Transform):
@@ -6087,7 +6190,7 @@ def __init__(
             kwargs = primers
         if not isinstance(kwargs, Composite):
             shape = kwargs.pop("shape", None)
-            device = kwargs.pop("device", None)
+            device = self.device
             if "batch_size" in kwargs.keys():
                 extra_kwargs = {"batch_size": kwargs.pop("batch_size")}
             else:
@@ -6160,7 +6263,7 @@ def reset_key(self, value):
     @property
     def device(self):
         device = self._device
-        if device is None and self.parent is not None:
+        if device is None and hasattr(self, "parent") and self.parent is not None:
             device = self.parent.device
         self._device = device
         return device
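A minimal standalone sketch of the reworked Tokenizer transform, assuming a pre-built Hugging Face tokenizer and illustrative key names. With `return_attention_mask=True`, the mask is written next to the tokens under an `attention_mask` leaf:

    from transformers import AutoTokenizer
    from torchrl.envs import Tokenizer

    hf_tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    tokenizer_t = Tokenizer(
        in_keys=["text"],                    # illustrative keys
        out_keys=[("tokens", "input_ids")],
        tokenizer=hf_tok,
        return_attention_mask=True,          # also writes ("tokens", "attention_mask")
        call_before_reset=False,             # True would tokenize reset data instead of step outputs
        missing_tolerance=True,              # skip silently when "text" is absent
    )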

torchrl/modules/llm/vllm_policy.py  (+57, -31)

@@ -10,6 +10,8 @@
 import torch
 from tensordict import (
     from_dataclass,
+    lazy_stack,
+    LazyStackedTensorDict,
     maybe_dense_stack,
     NestedKey,
     NonTensorData,
@@ -20,6 +22,7 @@
     TensorDictModule as Mod,
     TensorDictModuleBase,
     TensorDictSequential as Seq,
+    WrapModule,
 )
 from tensordict.utils import _zip_strict

@@ -61,6 +64,7 @@ def from_vllm(
     generate: bool = True,
     generate_kwargs: dict | None = None,
     tokenizer_kwargs: dict | None = None,
+    pad_output: bool = True,
 ) -> TensorDictModuleBase:
     """Creates a TensorDictModule from a vLLM model.

@@ -151,7 +155,7 @@ def from_vllm(
             out_keys=["tokens_in"],
             method_kwargs=tokenizer_kwargs,
             strict=True,
-            inplace=False,
+            inplace="empty",
         )
     else:
         module_dict["encode"] = Mod(
@@ -164,7 +168,7 @@ def from_vllm(
             in_keys=[text_key, "text_response"],
             out_keys=["tokens_in", "tokens_response"],
             strict=True,
-            inplace=False,
+            inplace="empty",
         )

     def select(x, y):
@@ -196,7 +200,7 @@ def stack_for_logprobs(tokens, tokens_response, attention_mask=None):
                 ("tokens_in", "attention_mask"),
             ],
             strict=False,
-            inplace=False,
+            inplace="empty",
         )
     else:
         module_dict["move_inputs"] = Mod(
@@ -205,7 +209,7 @@ def stack_for_logprobs(tokens, tokens_response, attention_mask=None):
             out_keys=[("tokens_in", "input_ids"), ("tokens_in", "attention_mask")],
             # It's ok if there's no mask
             strict=False,
-            inplace=False,
+            inplace="empty",
         )

     def to_list(tokens, attention_mask):
@@ -240,11 +244,10 @@ def to_list(tokens, attention_mask):
     )

     if generate_kwargs is None:
-        generate_kwargs = {
-            "detokenize": False,
-            "prompt_logprobs": not generate,
-            "logprobs": return_log_probs,
-        }
+        generate_kwargs = {}
+    generate_kwargs.setdefault("detokenize", False)
+    generate_kwargs.setdefault("prompt_logprobs", not generate)
+    generate_kwargs.setdefault("logprobs", return_log_probs)
     if not generate:
         generate_kwargs["max_tokens"] = 1
     sampling_params = SamplingParams(**generate_kwargs)
@@ -261,13 +264,27 @@ def to_list(tokens, attention_mask):
         strict=True,
     )

-    def get_output_tokens_and_log_probs(td):
+    padding_value = tokenizer(tokenizer.pad_token)["input_ids"][0]
+
+    def get_output_tokens_and_log_probs(td, padding_value=padding_value):
         td["tokens_out"] = _RequestOutput_tc.from_request_output(td["tokens_out"])
+        if pad_output and td.ndim and not isinstance(td, LazyStackedTensorDict):
+            td = lazy_stack(list(td.unbind(0)))
         if generate:
             # When not generate, we don't want to overwrite this
-            td["tokens_response"] = td["tokens_out"].outputs.token_ids
+            tokens_response_td = td["tokens_out"].outputs._tensordict.select(
+                "token_ids", "logprobs", strict=False
+            )
+            if pad_output:
+                tokens_response_td = tokens_response_td.densify(
+                    layout=torch.strided
+                ).to_padded_tensor(padding=padding_value)
+            tokens_response_td.rename_key_("token_ids", "tokens_response")
+            # td["tokens_response"] = outputs.token_ids
             if return_log_probs:
-                td["log_probs"] = td["tokens_out"].outputs.logprobs.unsqueeze(-1)
+                tokens_response_td.rename_key_("logprobs", "log_probs")
+                # td["log_probs"] = outputs.logprobs.unsqueeze(-1)
+            td.update(tokens_response_td)
         elif not generate:
             td["prompt_logprobs"] = td["tokens_out"].prompt_logprobs.unsqueeze(-1)
         return td
@@ -296,32 +313,41 @@ def translate_lps(tokens_response, x):
     module_dict["to_source_device"] = _maybe_set_device

     if generate:
-        module_dict["format"] = Mod(
-            lambda *x: x,
-            in_keys=[
-                "log_probs",
-                "tokens_response",
-                ("tokens_in", "input_ids"),
-                ("tokens_in", "attention_mask"),
-                "text_response",
-            ],
-            out_keys=[
-                "log_probs",
-                "tokens_response",
-                token_key,
-                attention_mask_key,
-                "text_response",
-            ],
-            strict=False,
-            inplace=False,
+        in_keys = [
+            "log_probs",
+            "tokens_response",
+            ("tokens_in", "input_ids"),
+            ("tokens_in", "attention_mask"),
+            "text_response",
+        ]
+        out_keys = [
+            "log_probs",
+            "tokens_response",
+            token_key,
+            attention_mask_key,
+            "text_response",
+        ]
+
+        def format_td(td):
+            td = td.select(*in_keys, strict=False)
+            td.rename_key_(("tokens_in", "input_ids"), token_key)
+            td.rename_key_(("tokens_in", "attention_mask"), attention_mask_key)
+            del td["tokens_in"]
+            return td
+
+        module_dict["format"] = WrapModule(
+            format_td,
+            in_keys=in_keys,
+            out_keys=out_keys,
         )
+
     else:
         module_dict["format"] = Mod(
             lambda *x: x,
             in_keys=["log_probs", "tokens_response"],
             out_keys=["log_probs", "tokens_response"],
             strict=False,
-            inplace=False,
+            inplace="empty",
         )

     return Seq(module_dict, inplace=True)
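One behavioral consequence of the setdefault-based construction above (assuming, since the rendered diff drops indentation, that the setdefault calls sit outside the `generate_kwargs is None` check): a user-supplied `generate_kwargs` is now merged with the defaults instead of replacing them. For example:

    generate_kwargs = {"max_tokens": 32}   # user-provided
    generate_kwargs.setdefault("detokenize", False)
    generate_kwargs.setdefault("prompt_logprobs", not generate)
    generate_kwargs.setdefault("logprobs", return_log_probs)
    # generate_kwargs now holds max_tokens=32 plus the three defaults,
    # whereas before this commit the defaults applied only when no dict was passed.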
