Commit 92f14c7

Add KTO support for preference tuning

1 parent 4df8a4d commit 92f14c7

9 files changed: +279 -0 lines changed
configs/recipes/phi3/kto/train.yaml

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+# Phi3 KTO train config.
+#
+# Usage:
+#   oumi train -c configs/recipes/phi3/kto/train.yaml
+#
+# See Also:
+#   - Documentation: https://oumi.ai/docs/en/latest/user_guides/train/train.html
+#   - Config class: oumi.core.configs.TrainingConfig
+#   - Config source: https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/training_config.py
+#   - Other training configs: configs/**/pretraining/, configs/**/sft/, configs/**/dpo/
+
+model:
+  model_name: "microsoft/Phi-3-mini-4k-instruct"
+  trust_remote_code: True
+
+data:
+  train:
+    datasets:
+      - dataset_name: "mlabonne/kto-mix-40k"
+
+training:
+  optimizer: "adamw_torch"
+  use_peft: true
+  output_dir: "output/phi3.kto"
+  trainer_type: "TRL_KTO"
+
+peft:
+  q_lora: False
+  lora_target_modules:
+    - "q_proj"
+    - "k_proj"
+    - "v_proj"
+    - "o_proj"
+    - "gate_proj"
+    - "up_proj"
+    - "down_proj"

src/oumi/builders/training.py

Lines changed: 2 additions & 0 deletions
@@ -98,6 +98,8 @@ def _init_oumi_trainer(*args, **kwargs) -> BaseTrainer:
         return _create_hf_builder_fn(trl.SFTTrainer)
     elif trainer_type == TrainerType.TRL_DPO:
         return _create_hf_builder_fn(trl.DPOTrainer)
+    elif trainer_type == TrainerType.TRL_KTO:
+        return _create_hf_builder_fn(trl.KTOTrainer)
     elif trainer_type == TrainerType.TRL_GRPO:
         return _create_hf_builder_fn(trl.GRPOTrainer)
     elif trainer_type == TrainerType.HF:
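
For reference, a minimal sketch (not part of this commit) of the object this branch ultimately builds: a trl.KTOTrainer driven by a trl.KTOConfig, trained on binary-feedback rows with prompt, completion, and label fields. The model name, output path, and hyperparameters below are illustrative, and the processing_class keyword may be spelled tokenizer in older trl releases.

import trl
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
tokenizer.pad_token = tokenizer.eos_token

# KTO consumes binary feedback rows rather than preference pairs.
train_dataset = Dataset.from_dict(
    {
        "prompt": ["Hello, how are you?", "Hello, how are you?"],
        "completion": ["I'm fine, thank you!", "Go away."],
        "label": [True, False],  # True = desirable, False = undesirable
    }
)

args = trl.KTOConfig(
    output_dir="output/kto-sketch",  # hypothetical path
    per_device_train_batch_size=2,
    max_steps=3,
    desirable_weight=0.8,  # mirrors the trainer_kwargs in the integration test at the end of this diff
)

trainer = trl.KTOTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    processing_class=tokenizer,  # `tokenizer=` in older trl releases
)
trainer.train()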

src/oumi/core/configs/params/training_params.py

Lines changed: 11 additions & 0 deletions
@@ -45,6 +45,13 @@ class TrainerType(Enum):
     for fine-tuning language models based on human preferences.
     """
 
+    TRL_KTO = "trl_kto"
+    """Kahneman-Tversky Optimization trainer from `trl` library.
+
+    This trainer implements the KTO algorithm for fine-tuning language models
+    based on binary feedback (desirable/undesirable) rather than preference pairs.
+    """
+
     TRL_GRPO = "trl_grpo"
     """Group Relative Policy Optimization trainer from `trl` library.
 
@@ -153,6 +160,8 @@ class TrainingParams(BaseParams):
         - HF: HuggingFace's Trainer
         - TRL_SFT: TRL's SFT Trainer
         - TRL_DPO: TRL's DPO Trainer
+        - TRL_KTO: TRL's KTO Trainer
+        - TRL_GRPO: TRL's GRPO Trainer
         - OUMI: Custom generic trainer implementation
     """
 
@@ -661,6 +670,8 @@ def to_hf(self):
             config_class = trl.SFTConfig
         elif self.trainer_type == TrainerType.TRL_DPO:
             config_class = trl.DPOConfig
+        elif self.trainer_type == TrainerType.TRL_KTO:
+            config_class = trl.KTOConfig
        elif self.trainer_type == TrainerType.TRL_GRPO:
             config_class = trl.GRPOConfig
         else:
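
A hedged sketch of exercising the new enum value, assuming only the TrainingParams fields visible elsewhere in this diff (trainer_type, output_dir) and the to_hf() method modified above:

from oumi.core.configs.params.training_params import TrainerType, TrainingParams

params = TrainingParams(
    trainer_type=TrainerType.TRL_KTO,
    output_dir="output/phi3.kto",
)

# Per the branch added to to_hf() above, TRL_KTO now maps to trl.KTOConfig.
hf_config = params.to_hf()
print(type(hf_config).__name__)  # expected: "KTOConfig"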

src/oumi/core/datasets/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -28,6 +28,7 @@
 from oumi.core.datasets.base_dpo_dataset import BaseExperimentalDpoDataset
 from oumi.core.datasets.base_grpo_dataset import BaseExperimentalGrpoDataset
 from oumi.core.datasets.base_iterable_dataset import BaseIterableDataset
+from oumi.core.datasets.base_kto_dataset import BaseKtoDataset
 from oumi.core.datasets.base_map_dataset import BaseMapDataset
 from oumi.core.datasets.base_pretraining_dataset import BasePretrainingDataset
 from oumi.core.datasets.base_sft_dataset import BaseSftDataset
@@ -41,6 +42,7 @@
     "BaseExperimentalDpoDataset",
     "BaseExperimentalGrpoDataset",
     "BaseIterableDataset",
+    "BaseKtoDataset",
     "BaseMapDataset",
     "BasePretrainingDataset",
     "BaseSftDataset",
src/oumi/core/datasets/base_kto_dataset.py

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+# Copyright 2025 - Oumi
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Base dataset class for KTO (Kahneman-Tversky Optimization).
+
+This module provides a base class for datasets used in KTO training.
+Unlike DPO, which requires preference pairs, KTO works with simple binary feedback
+indicating whether an output is desirable or undesirable.
+"""
+
+from typing import Optional
+
+from oumi.core.datasets.base_map_dataset import BaseMapDataset
+from oumi.core.tokenizers.base_tokenizer import BaseTokenizer
+
+_PROMPT_KEY = "prompt"
+_RESPONSE_KEY = "response"
+_LABEL_KEY = "label"  # True for desirable, False for undesirable
+
+class BaseKtoDataset(BaseMapDataset):
+    """Base class for KTO datasets.
+
+    This class provides a foundation for creating KTO datasets that work with
+    binary feedback (desirable/undesirable) rather than preference pairs.
+
+    Warning:
+        This class is experimental and subject to change.
+    """
+
+    def __init__(
+        self,
+        *,
+        dataset_name: Optional[str] = None,
+        dataset_path: Optional[str] = None,
+        split: Optional[str] = None,
+        tokenizer: Optional[BaseTokenizer] = None,
+        return_tensors: bool = False,
+        **kwargs,
+    ) -> None:
+        """Initializes a new instance of the BaseKtoDataset class."""
+        super().__init__(
+            dataset_name=dataset_name,
+            dataset_path=dataset_path,
+            split=split,
+            **kwargs,
+        )
+
+        if return_tensors:
+            raise NotImplementedError(
+                "return_tensors=True is not implemented for this class"
+            )
+
+        self._tokenizer = tokenizer
+        self._return_tensors = return_tensors
+
+        self._data = self._load_data()
+
+    def transform_kto(self, sample: dict) -> dict:
+        """Transform the sample to the KTO format.
+
+        Args:
+            sample: A dictionary containing the raw sample data.
+
+        Returns:
+            A dictionary with the following keys:
+                - prompt: The input prompt
+                - response: The model's response
+                - label: Boolean indicating if the response is desirable (True) or undesirable (False)
+        """
+        prompt = sample[_PROMPT_KEY]
+        response = sample[_RESPONSE_KEY]
+        label = sample[_LABEL_KEY]
+
+        return {
+            _PROMPT_KEY: prompt,
+            _RESPONSE_KEY: response,
+            _LABEL_KEY: label,
+        }
+
+    def transform(self, sample: dict) -> dict:
+        """Transform the sample to the KTO format."""
+        return self.transform_kto(sample)
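
To illustrate the intended extension point, a hypothetical subclass sketch: a concrete dataset overrides transform_kto() to map its own raw column names onto the prompt/response/label keys documented above, while row loading is left to the base class machinery keyed on dataset_name (as KtoMix40kDataset does later in this diff). The registry name and column names here are invented for the example.

from oumi.core.datasets.base_kto_dataset import BaseKtoDataset
from oumi.core.registry import register_dataset


@register_dataset("my_org/my-kto-feedback")  # hypothetical registry name
class MyKtoFeedbackDataset(BaseKtoDataset):
    default_dataset = "my_org/my-kto-feedback"  # hypothetical dataset id

    def transform_kto(self, sample: dict) -> dict:
        # Raw rows use "question"/"answer"/"is_good" (made-up columns); rename
        # them to the keys the base class documents.
        return {
            "prompt": sample["question"],
            "response": sample["answer"],
            "label": bool(sample["is_good"]),
        }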

src/oumi/datasets/debug.py

Lines changed: 43 additions & 0 deletions
@@ -21,6 +21,7 @@
 from typing_extensions import override
 
 from oumi.core.datasets.base_dpo_dataset import BaseExperimentalDpoDataset
+from oumi.core.datasets.base_kto_dataset import BaseKtoDataset
 from oumi.core.datasets.base_pretraining_dataset import BasePretrainingDataset
 from oumi.core.datasets.base_sft_dataset import BaseSftDataset
 from oumi.core.registry import register_dataset
@@ -191,3 +192,45 @@ def _load_data(self) -> pd.DataFrame:
                 ],
             }
         )
+
+
+@register_dataset("debug_kto")
+class DebugKtoDataset(BaseKtoDataset):
+    default_dataset = "debug_kto"
+
+    def __init__(
+        self,
+        dataset_size: int = 5,
+        **kwargs,
+    ):
+        """Initializes a DebugKtoDataset."""
+        self.size = dataset_size
+
+        super().__init__(**kwargs)
+
+    def transform_kto(self, sample: dict) -> dict:
+        """Transforms the sample into a KTO dict."""
+        return {
+            "prompt": sample["prompt"],
+            "completion": sample["completion"],
+            "label": sample["label"],
+        }
+
+    @override
+    def _load_data(self) -> pd.DataFrame:
+        return pd.DataFrame(
+            {
+                "prompt": [
+                    f"Hello, how are you? (Document number {idx})"
+                    for idx in range(self.size)
+                ],
+                "completion": [
+                    f"I'm fine, thank you! (Document number {idx})"
+                    for idx in range(self.size)
+                ],
+                "label": [
+                    idx % 2 == 0  # True for even indices, False for odd indices
+                    for idx in range(self.size)
+                ],
+            }
+        )
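
A small usage sketch (not from the commit): the debug dataset emits prompt/completion/label rows, the binary-feedback format TRL's KTO trainer consumes, and transform() simply passes those fields through.

from oumi.datasets.debug import DebugKtoDataset

# Loads 4 synthetic rows; labels alternate True/False so both desirable and
# undesirable examples are present.
ds = DebugKtoDataset(dataset_size=4)

# transform() delegates to transform_kto(), which forwards the fields as-is.
row = ds.transform({"prompt": "Hi there", "completion": "Hello!", "label": True})
print(row)  # {'prompt': 'Hi there', 'completion': 'Hello!', 'label': True}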

src/oumi/datasets/preference_tuning/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -14,8 +14,10 @@
 
 """Preference tuning datasets module."""
 
+from oumi.datasets.preference_tuning.kto_mix import KtoMix40kDataset
 from oumi.datasets.preference_tuning.orpo_dpo_mix import OrpoDpoMix40kDataset
 
 __all__ = [
+    "KtoMix40kDataset",
     "OrpoDpoMix40kDataset",
 ]
src/oumi/datasets/preference_tuning/kto_mix.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+# Copyright 2025 - Oumi
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from oumi.core.datasets import BaseKtoDataset
+from oumi.core.registry import register_dataset
+
+
+@register_dataset("mlabonne/kto-mix-40k")
+class KtoMix40kDataset(BaseKtoDataset):
+    """Preprocess the KTO dataset.
+
+    A dataset designed for KTO (Kahneman-Tversky Optimization) training.
+    This dataset is a combination of high-quality datasets with binary feedback,
+    including:
+    - Capybara-Preferences (converted to binary)
+    - distilabel-intel-orca-dpo-pairs (converted to binary)
+    - ultrafeedback-binarized-preferences-cleaned
+    - distilabel-math-preference-dpo (converted to binary)
+    - toxic-dpo-v0.2 (converted to binary)
+    - prm_dpo_pairs_cleaned (converted to binary)
+    - truthy-dpo-v0.1 (converted to binary)
+
+    Rule-based filtering was applied to remove 'gptisms' in the desirable answers.
+
+    Data Fields:
+        - source: string
+        - prompt: string
+        - response: string
+        - label: boolean (True for desirable, False for undesirable)
+
+    See Also:
+        For more information on how to use this dataset, refer to:
+        - Paper: https://arxiv.org/pdf/2402.01306
+        - Huggingface hub: https://huggingface.co/docs/trl/main/en/kto_trainer
+    """
+
+    default_dataset = "mlabonne/kto-mix-40k"

tests/integration/train/test_train.py

Lines changed: 42 additions & 0 deletions
@@ -194,3 +194,45 @@ def test_train_dpo():
         )
 
         train(config)
+
+
+def test_train_kto():
+    with tempfile.TemporaryDirectory() as output_temp_dir:
+        output_training_dir = str(pathlib.Path(output_temp_dir) / "train")
+        config: TrainingConfig = TrainingConfig(
+            data=DataParams(
+                train=DatasetSplitParams(
+                    datasets=[
+                        DatasetParams(
+                            dataset_name="debug_kto",
+                        )
+                    ],
+                ),
+            ),
+            model=ModelParams(
+                model_name="openai-community/gpt2",
+                model_max_length=1024,
+                trust_remote_code=True,
+                tokenizer_pad_token="<|endoftext|>",
+            ),
+            training=TrainingParams(
+                per_device_train_batch_size=2,
+                trainer_type=TrainerType.TRL_KTO,
+                max_steps=3,
+                logging_steps=3,
+                log_model_summary=True,
+                enable_wandb=False,
+                enable_tensorboard=False,
+                output_dir=output_training_dir,
+                try_resume_from_last_checkpoint=False,
+                save_final_model=True,
+                trainer_kwargs={
+                    "max_length": 512,
+                    "max_prompt_length": 128,
+                    "remove_unused_columns": False,
+                    "desirable_weight": 0.8,
+                },
+            ),
+        )
+
+        train(config)
