
Commit cfecf62

[Tutorial] LLM integration

ghstack-source-id: fe507484265b5cd7bbea6739de99e19b3f0b4a92
Pull Request resolved: #2832

1 parent 49a8a42 commit cfecf62

File tree: 10 files changed, +652 −167 lines
New file (+152 lines): the tutorial training script
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from argparse import ArgumentParser

import torch
from datasets import load_dataset
from grpo_utils import PrepareQuestion, ShapedCorrectnessReward
from tensordict import TensorDict
from torch.utils._pytree import tree_map
from torch.utils.data import DataLoader
from torchrl.collectors import SyncDataCollector
from torchrl.data import LazyStackStorage, ReplayBuffer, SamplerWithoutReplacement
from torchrl.envs import (
    DataLoadingPrimer,
    KLRewardTransform,
    LLMEnv,
    StepCounter,
    Tokenizer,
)
from torchrl.modules import from_hf_transformers
from torchrl.objectives import ClipPPOLoss, ReinforceLoss
from transformers import AutoTokenizer, GPT2Config, GPT2LMHeadModel

parser = ArgumentParser()
parser.add_argument("--dataset", type=str, default="gsm8k")
parser.add_argument("--batch_size", type=int, default=4)
parser.add_argument("--epochs", type=int, default=10)
parser.add_argument("--repeats", type=int, default=10)
parser.add_argument("--steps_per_batch", type=int, default=16)
parser.add_argument("--optim_batch_size", type=int, default=4)


def compute_mc_advantage(trajectories):
    # Group trajectories by their ground-truth answer
    answer = trajectories["answer"]
    # Identify indices where the answers match
    answer_ids = tree_map(lambda string: hash(string), answer)
    answer_ids = torch.tensor(answer_ids)
    print("answer_ids", answer_ids)
    unique_qs = answer_ids.view(-1).unique()
    trajectories["advantage"] = trajectories["next", "reward"] * 0
    for u in unique_qs:
        idx = answer_ids == u
        rewards = trajectories[idx]["next", "reward"]
        # Standardize the rewards within the group to obtain the advantage
        rewards = (rewards - rewards.mean()) / rewards.std().clamp(min=1e-4)
        print("rewards", rewards)
        trajectories.set_at_("advantage", rewards, idx)
    return trajectories


if __name__ == "__main__":
    args = parser.parse_args()

    # Create env instance:
    # - Load the gsm8k dataset
    dataset = load_dataset(args.dataset, "main")
    train_dataset = dataset["train"]

    def collate_fn(batch):
        batch = torch.stack([TensorDict.from_dict(_batch) for _batch in batch])
        batch.rename_key_("question", "text")
        return batch

    # LLM
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel(GPT2Config())

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    # Env
    dataloader = DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn
    )
    env = LLMEnv.from_dataloader(
        dataloader=dataloader,
        tokenizer=tokenizer,
        str2str=True,
        batch_size=(args.batch_size * args.repeats,),
        repeats=args.repeats,
    )
    # Insert the prompt-formatting transform right before the data-loading primer
    for i, trsf in enumerate(env.transform):
        if isinstance(trsf, DataLoadingPrimer):
            env.insert_transform(i, PrepareQuestion())
            break

    # Finally, we want the env to stop after the first step
    env.append_transform(StepCounter(max_steps=1))

    print("env", env)
    print(env.reset())

    policy = from_hf_transformers(
        model,
        tokenizer=tokenizer,
        from_text=False,
        generate=True,
        return_log_probs=True,
    )

    # Reward transform
    env.append_transform(ShapedCorrectnessReward(tokenizer=tokenizer))

    # Ref model
    ref_model = GPT2LMHeadModel(GPT2Config())
    ref_model = from_hf_transformers(
        ref_model,
        tokenizer=tokenizer,
        from_text=False,
        generate=False,
        return_log_probs=True,
    )
    env.append_transform(
        KLRewardTransform(actor=ref_model, coef=0.1, log_prob_key="log_probs")
    )

    # Replay buffer
    rb = ReplayBuffer(
        storage=LazyStackStorage(args.steps_per_batch),
        sampler=SamplerWithoutReplacement(),
        batch_size=args.optim_batch_size,
    )

    # Collector
    collector = SyncDataCollector(
        env,
        policy,
        frames_per_batch=args.steps_per_batch,
        total_frames=1_000_000,
    )

    # Loss module: a non-generating view of the same weights for training
    policy_training = from_hf_transformers(
        model,
        tokenizer=tokenizer,
        from_text=False,
        generate=False,
        return_log_probs=True,
    )
    loss_fn = ClipPPOLoss(
        actor_network=policy_training,
        critic_network=None,
        critic_coef=0.0,
        functional=False,
    )
    loss_fn.set_keys(sample_log_prob="log_probs")
    loss_fn._set_in_keys()
    optim = torch.optim.Adam(loss_fn.parameters())

    # loss_fn = ReinforceLoss(
    #     actor_network=policy,
    #     critic_network=None,
    #     critic_coef=0.0,
    # )

    # Collect rollouts, compute group advantages, then run several PPO epochs
    for trajs in collector:
        trajs = trajs.reshape(-1)
        print("trajs from collector", trajs)
        trajs = compute_mc_advantage(trajs)
        rb.extend(trajs)
        for i in range(args.epochs):
            for batch in rb:
                print("running loss with batch", batch)
                loss = loss_fn(batch)
                loss_val = loss.mean(reduce=True)
                loss_val.backward()
                optim.step()
                optim.zero_grad()
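A note for readers of the tutorial: compute_mc_advantage above is the GRPO-style, group-relative baseline — rewards of completions that share the same ground-truth answer are standardized against one another and reused as advantages. A minimal sketch of the same idea on plain tensors (the group ids and reward values below are invented for illustration):

import torch

# Toy data: 6 completions drawn from 2 prompts (group ids), with scalar rewards.
group_ids = torch.tensor([0, 0, 0, 1, 1, 1])
rewards = torch.tensor([1.0, 0.0, 0.5, 2.0, 2.0, 0.0])

advantage = torch.zeros_like(rewards)
for g in group_ids.unique():
    idx = group_ids == g
    r = rewards[idx]
    # Standardize within the group; clamp the std so a constant group maps to ~0
    advantage[idx] = (r - r.mean()) / r.std().clamp(min=1e-4)

print(advantage)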
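The KLRewardTransform keeps the fine-tuned policy close to the frozen reference model by folding a KL penalty into the reward. A rough plain-PyTorch sketch of that shaping, using made-up log-probabilities (TorchRL derives the per-token terms from the two models' log_probs entries):

import torch

coef = 0.1
# Hypothetical per-token log-probabilities under the trained and reference models
log_probs = torch.tensor([-1.2, -0.8, -2.0])
ref_log_probs = torch.tensor([-1.0, -1.0, -1.5])
reward = torch.tensor([0.0, 0.0, 20.0])  # task reward, e.g. only on the last token

# Sample-based KL estimate: log pi(a|s) - log pi_ref(a|s)
kl = log_probs - ref_log_probs
shaped_reward = reward - coef * kl
print(shaped_reward)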
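With critic_coef=0.0 and a precomputed advantage, ClipPPOLoss reduces to the clipped policy surrogate. For reference, a standalone sketch of that objective on hypothetical tensors (not the torchrl implementation, which also handles entropy terms and key management):

import torch

def clipped_surrogate_loss(log_prob, old_log_prob, advantage, eps=0.2):
    # Importance ratio between the updated policy and the behavior policy
    ratio = (log_prob - old_log_prob).exp()
    unclipped = ratio * advantage
    clipped = ratio.clamp(1.0 - eps, 1.0 + eps) * advantage
    # PPO maximizes the minimum of the two terms; negate to get a loss
    return -torch.min(unclipped, clipped).mean()

log_prob = torch.randn(8, requires_grad=True)
old_log_prob = log_prob.detach() + 0.1 * torch.randn(8)
advantage = torch.randn(8)
loss = clipped_surrogate_loss(log_prob, old_log_prob, advantage)
loss.backward()
print(loss.item(), log_prob.grad.shape)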
New file (+152 lines): grpo_utils.py
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import annotations

import torch
from tensordict import NestedKey, TensorDict, TensorDictBase
from tensordict.tensorclass import NonTensorData, NonTensorStack
from tensordict.utils import _zip_strict
from torchrl.data import Composite, TensorSpec, Unbounded
from torchrl.envs import Transform

BASE_PROMPT = (
    "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. "
    "The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. "
    "The reasoning process and answer are enclosed within <think></think> and <answer></answer> tags, respectively, "
    "i.e., <think>reasoning process here</think> <answer>answer here</answer>. User: %s. Assistant: <think>"
)


class PrepareQuestion(Transform):
    def __init__(
        self,
        in_keys: list[NestedKey] | None = None,
        out_keys: list[NestedKey] | None = None,
    ):
        if in_keys is None:
            in_keys = ["text"]
        if out_keys is None:
            out_keys = list(in_keys)
        super().__init__(in_keys, out_keys)

    def _reset_env_preprocess(self, tensordict: TensorDictBase) -> TensorDictBase:
        # Wrap the raw question into the base prompt at reset time
        for in_key, out_key in _zip_strict(self.in_keys, self.out_keys):
            string = tensordict.get(in_key)
            tensordict.set(out_key, self._modify_str(string))
        return tensordict

    def _modify_str(
        self, obs: str | list[str] | NonTensorData | NonTensorStack
    ) -> NonTensorData | NonTensorStack:
        if isinstance(obs, NonTensorData):
            return self._modify_str(obs.data)
        if isinstance(obs, NonTensorStack):
            return self._modify_str(obs.tolist())
        if isinstance(obs, list):
            return NonTensorStack(*[BASE_PROMPT % item for item in obs])
        return NonTensorData(BASE_PROMPT % obs)

    def _apply_transform(self, obs: torch.Tensor) -> None:
        return obs

    def transform_observation_spec(self, observation_spec: TensorSpec) -> TensorSpec:
        for in_key, out_key in _zip_strict(self.in_keys, self.out_keys):
            if out_key != in_key:
                observation_spec[out_key] = observation_spec[in_key].clone()
        return observation_spec


class ShapedCorrectnessReward(Transform):
    def __init__(
        self,
        tokenizer,
        in_keys: list[NestedKey] | None = None,
        out_keys: list[NestedKey] | None = None,
    ):
        if in_keys is None:
            in_keys = ["text", "answer"]
        if not isinstance(in_keys, list) or len(in_keys) != 2:
            raise ValueError(
                "ShapedCorrectnessReward requires in_keys to be of type list and have 2 elements."
            )
        if out_keys is None:
            out_keys = [
                "reward_answer",
                "reward_think",
                "reward_right",
                "reward_contained",
                "reward",
                "success",
            ]
        super().__init__(in_keys, out_keys)
        self.tokenizer = tokenizer

    def _step(
        self, tensordict: TensorDictBase, next_tensordict: TensorDictBase
    ) -> TensorDictBase:
        from xml.etree import ElementTree as ET

        # Get the completion
        responses = next_tensordict[self.in_keys[0]]  # batch_size, grpo_size, L
        answers = next_tensordict[self.in_keys[1]]  # batch_size, grpo_size
        if isinstance(responses, torch.Tensor):
            if responses.ndim == 3:
                batch_size, grpo_size, _ = responses.shape
            # decode
            text_completion = self.tokenizer.decode(responses.flatten(0, 1).tolist())
        else:
            text_completion = responses
        # Decomposed reward
        tds = []
        for answer, compl in zip(answers, text_completion):
            try:
                cot, potential_answer = self.extract_tags(
                    "<think>" + compl
                )  # .replace("<<", "").replace(">>", "")
            except ET.ParseError:
                cot, potential_answer = ("", "")
            tds.append(self.single_shaped_correctness_reward(potential_answer, cot))
        tds = torch.stack(tds)
        if isinstance(responses, torch.Tensor) and responses.ndim == 3:
            tds = tds.reshape(batch_size, grpo_size)
        tds = tds.apply(lambda t: t.unsqueeze(-1))
        return next_tensordict.update(tds)

    def transform_reward_spec(self, reward_spec: Composite) -> Composite:
        shape = reward_spec.shape + (1,)
        reward_spec.update(
            Composite(
                reward_answer=Unbounded(shape),
                reward_think=Unbounded(shape),
                reward_right=Unbounded(shape),
                reward_contained=Unbounded(shape),
                reward=Unbounded(shape),
                success=Unbounded(shape, dtype=torch.bool),
            )
        )
        return reward_spec

    @classmethod
    def single_shaped_correctness_reward(cls, answer: str, cot: str) -> TensorDict:
        reward_answer = 5.0 * (len(answer) == 1)

        reward_think = 5.0 * (len(cot) == 1)

        # One of the answer tags has the right answer
        reward_right = 20.0 * (any(attempt == answer for attempt in answer))

        # One of the answer tags contains the right answer (might be e.g. $20 instead of 20)
        reward_contained = 10.0 * (any((answer in attempt) for attempt in answer))

        success = len(answer) > 0 and answer[-1] == answer
        # Compose the rewards
        reward = 100.0 * float(success) + (
            reward_answer + reward_think + reward_contained + reward_right
        ) * (1 - float(success))

        rewards = TensorDict(
            reward_answer=reward_answer,
            reward_think=reward_think,
            reward_right=reward_right,
            reward_contained=reward_contained,
            reward=reward,
            success=success,
        )
        return rewards

    @staticmethod
    def extract_tags(text: str) -> tuple[str, str]:
        """Parse XML-like tags from ``text``.

        Returns a ``(think, answer)`` tuple holding the content of the
        corresponding tags, or empty strings when a tag is missing or the
        text cannot be parsed.
        """
        from xml.etree import ElementTree as ET

        xml_string = f"<root>{text}</root>"
        try:
            root = ET.fromstring(xml_string)
        except ET.ParseError:
            return ("", "")

        return (
            root.find("think").text if root.find("think") is not None else "",
            root.find("answer").text if root.find("answer") is not None else "",
        )
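ShapedCorrectnessReward depends on completions following the <think>…</think> <answer>…</answer> format requested by BASE_PROMPT. A small standalone check of the tag-parsing step, mirroring extract_tags with stdlib XML parsing only (the sample completion string is invented for illustration):

from xml.etree import ElementTree as ET


def extract_tags(text: str) -> tuple[str, str]:
    # Wrap the completion in a root element so it parses as a single XML document
    try:
        root = ET.fromstring(f"<root>{text}</root>")
    except ET.ParseError:
        return ("", "")
    think = root.find("think")
    answer = root.find("answer")
    return (
        think.text if think is not None else "",
        answer.text if answer is not None else "",
    )


completion = "<think>3 apples plus 4 apples is 7 apples.</think> <answer>7</answer>"
print(extract_tags(completion))  # ('3 apples plus 4 apples is 7 apples.', '7')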

torchrl/data/tensor_specs.py (+1 −1)

@@ -4941,7 +4941,7 @@ def set(self, name: str, spec: TensorSpec) -> Composite:
             spec.shape = self.shape
         else:
             raise ValueError(
-                f"The shape of the spec {type(spec).__name__} and the Composite {type(self).__name__} mismatch: the first "
+                f"The shapes of the spec {type(spec).__name__} and the {type(self).__name__} mismatch: the first "
                 f"{self.ndim} dimensions should match but got spec.shape={spec.shape} and "
                 f"Composite.shape={self.shape}."
             )

torchrl/envs/common.py

+1
Original file line numberDiff line numberDiff line change
@@ -3383,6 +3383,7 @@ def _rollout_stop_early(
33833383
else:
33843384
tensordict.clear_device_()
33853385
# In case policy(..) does not modify in-place - no-op for TensorDict and related
3386+
print('policy input', tensordict)
33863387
tensordict.update(policy(tensordict))
33873388
if auto_cast_to_device:
33883389
if env_device is not None:
