##################
# Model settings #
##################

local pretrained_model = "t5-base";
local load_with_low_cpu_mem_usage = false;
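# `low_cpu_mem_usage` is forwarded to transformers' `from_pretrained()` in the model config below;
# it can reduce peak RAM while loading a very large checkpoint (e.g. GPT-J 6B).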

local modules_to_wrap = ["[a-zA-Z_.]+\\.[0-9]+"]; # TODO: works for t5 and gpt2. confirm with other models too.
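# For example, the pattern above matches per-layer module paths such as "encoder.block.0" /
# "decoder.block.0" (t5) and "transformer.h.0" (gpt2).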

####################
# Trainer settings #
####################

# Trainer settings, adjust to your use-case.
local training_steps = 20; # total number of optimization steps to train for
local validate_every = 5; # how often to validate and save checkpoints

local devices = 1; # number of devices to train on (will use GPUs if enough are available, otherwise CPU)
local grad_accum = 1; # number of gradient accumulation steps (changes the effective batch size)
# This is the batch size per GPU, ignoring gradient accumulation:
local batch_size = 2;
# So the effective batch size is `batch_size * grad_accum * devices`
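# With the defaults above that is 2 * 1 * 1 = 2 examples per optimizer step.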

local activation_checkpointing = false; # use activation/gradient checkpointing (probably needed for GPT-J 6B, but not for gpt2)
local amp = false; # use PyTorch's native automatic mixed precision
local fsdp = false; # use FairScale's FullyShardedDataParallel (probably needed for GPT-J 6B, but not for gpt2)
local cpu_offloading = false; # can only be used with 'fsdp'; saves a lot of GPU memory by offloading params+gradients to CPU, but is very slow.

######################
# Optimizer settings #
######################

local warmup_steps = 20;
local learning_rate = 0.00005; # you can probably use a higher LR for a small model like "gpt2"


assert fsdp == true || cpu_offloading == false : "cpu_offloading only available with fsdp";

# FullyShardedDataParallel config:
local fsdp_config = if fsdp then {
    reshard_after_forward: true,
    move_params_to_cpu: cpu_offloading,
    move_grads_to_cpu: cpu_offloading,
    mixed_precision: amp,
} else null;

local training_engine = {
    type: if fsdp then "fairscale" else "torch",
    optimizer: {
        type: "torch::AdamW",
        lr: learning_rate,
        betas: [0.9, 0.95],
        eps: 1e-6,
    },
    lr_scheduler: {
        type: "transformers::linear",
        num_warmup_steps: warmup_steps,
        num_training_steps: training_steps,
    },
    amp: amp,
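    # Jsonnet note: a computed field name that evaluates to null is simply omitted from the
    # object, so `fsdp_config` is only passed to the engine when fsdp is enabled.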
    [if fsdp then "fsdp_config" else null]: fsdp_config,
};

local distributed_dataloader = {
    batch_size: batch_size,
    sampler: {
        type: "torch::DistributedSampler",
        shuffle: true,
        drop_last: true,
    },
};

local single_device_dataloader = {
    shuffle: true,
    batch_size: batch_size,
};

local dataloader = if devices > 1 then distributed_dataloader else single_device_dataloader;
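# With more than one device, each worker needs a DistributedSampler so it only sees its own
# shard of the training data; on a single device a plain shuffled dataloader is enough.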

{
    steps: {
        raw_data: {
            type: "datasets::load",
            path: "snli",
        },
        /*"subset_data": {
            type: "subset-data",
            data: { type: "ref", ref: "raw_data" },
            max_samples: 10,
        },*/
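        # To smoke-test the pipeline quickly, uncomment the "subset_data" step above and point
        # the "data" reference of "processed_data" below at "subset_data" instead of "raw_data".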
        processed_data: {
            type: "snli-text2text",
            data: { type: "ref", ref: "raw_data" },
        },
        trained_model: {
            type: "transformers::finetune",
            model: {
                type: "fairscale::with_wrapped_modules",
                model: {
                    type: "transformers::finetune::from_pretrained",
                    pretrained_model_name_or_path: pretrained_model,
                    low_cpu_mem_usage: load_with_low_cpu_mem_usage,
                },
                modules_to_wrap: modules_to_wrap, # tell FairScale to wrap the transformer's blocks individually
                fsdp_config: fsdp_config,
                activation_checkpointing: activation_checkpointing,
            },
            tokenizer: {
                pretrained_model_name_or_path: pretrained_model,
            },
            dataset_dict: { type: "ref", ref: "processed_data" },
            train_dataloader: dataloader,
            validation_split: "validation",
            grad_accum: grad_accum,
            train_steps: training_steps,
            validate_every: validate_every,
            checkpoint_every: validate_every,
            log_every: 1,
            device_count: devices,
            training_engine: training_engine,
        },
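        # After training, run generation over the validation split with the fine-tuned model.
        # A max_length of 5 tokens is presumably enough here, since the snli-text2text targets
        # should be short label strings (e.g. "entailment").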
        generations: {
            type: "transformers::run_generation_dataset",
            max_length: 5,
            input: { type: "ref", ref: "processed_data" },
            batch_size: batch_size,
            model: { type: "ref", ref: "trained_model" },
            prompt_field: "source",
            output_field: "generation",
            splits: ["validation"],
        },
    },
}