
Commit 4799ba4

Capybara replaced with ultrafeedback_binarized (huggingface#2183)
1 parent d45c86e commit 4799ba4

File tree

3 files changed: +8 -8 lines

  README.md
  docs/source/cpo_trainer.mdx
  docs/source/dpo_trainer.mdx

README.md

Lines changed: 1 addition & 1 deletion
@@ -187,7 +187,7 @@ from trl import DPOConfig, DPOTrainer

 model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
-dataset = load_dataset("trl-lib/Capybara-Preferences", split="train")
+dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
 training_args = DPOConfig(output_dir="Qwen2.5-0.5B-DPO")
 trainer = DPOTrainer(model=model, args=training_args, train_dataset=dataset, tokenizer=tokenizer)
 trainer.train()
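
For context, this is how the README quick-start reads once the change lands. A minimal runnable sketch: the three import lines are assumptions not shown in the hunk (beyond the `from trl import DPOConfig, DPOTrainer` context line); every other statement is taken directly from the diff above, including the `tokenizer=` argument this TRL version still accepts.

```python
# Sketch of the updated README quick-start: DPO fine-tuning of Qwen2.5-0.5B-Instruct
# on the trl-lib/ultrafeedback_binarized preference dataset.
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
training_args = DPOConfig(output_dir="Qwen2.5-0.5B-DPO")
trainer = DPOTrainer(model=model, args=training_args, train_dataset=dataset, tokenizer=tokenizer)
trainer.train()
```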

docs/source/cpo_trainer.mdx

Lines changed: 3 additions & 3 deletions
@@ -10,10 +10,10 @@ CPO aims to mitigate two fundamental shortcomings of SFT. First, SFT’s methodo

 ## Quick start

-This example demonstrates how to train a model using the CPO method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model. We use the preference data from the [Capybara dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the data in the dataset here:
+This example demonstrates how to train a model using the CPO method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model. We use the preference data from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the data in the dataset here:

 <iframe
-  src="https://huggingface.co/datasets/trl-lib/Capybara-Preferences/embed/viewer/default/train?row=0"
+  src="https://huggingface.co/datasets/trl-lib/ultrafeedback_binarized/embed/viewer/default/train?row=0"
   frameborder="0"
   width="100%"
   height="560px"
@@ -29,7 +29,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer

 model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
-train_dataset = load_dataset("trl-lib/Capybara-Preferences", split="train")
+train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

 training_args = CPOConfig(output_dir="Qwen2-0.5B-CPO", logging_steps=10)
 trainer = CPOTrainer(model=model, args=training_args, tokenizer=tokenizer, train_dataset=train_dataset)
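
For convenience, a minimal sketch of the CPO quick-start as it reads after this change. The imports and the closing `trainer.train()` call are assumptions not visible in the hunk above; the remaining lines are copied from the diff.

```python
# Sketch of the updated CPO quick-start: CPO fine-tuning of Qwen2-0.5B-Instruct
# on the trl-lib/ultrafeedback_binarized preference dataset.
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import CPOConfig, CPOTrainer

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

training_args = CPOConfig(output_dir="Qwen2-0.5B-CPO", logging_steps=10)
trainer = CPOTrainer(model=model, args=training_args, tokenizer=tokenizer, train_dataset=train_dataset)
trainer.train()  # assumed to follow, as in the README example; not part of the hunk shown here
```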

docs/source/dpo_trainer.mdx

Lines changed: 4 additions & 4 deletions
@@ -25,10 +25,10 @@ Read more about DPO algorithm in the [original paper](https://huggingface.co/pap

 ## Quick start

-This example demonstrates how to train a model using the DPO method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model. We use the preference data from the [Capybara dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the data in the dataset here:
+This example demonstrates how to train a model using the DPO method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model. We use the preference data from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the data in the dataset here:

 <iframe
-  src="https://huggingface.co/datasets/trl-lib/Capybara-Preferences/embed/viewer/default/train?row=0"
+  src="https://huggingface.co/datasets/trl-lib/ultrafeedback_binarized/embed/viewer/default/train?row=0"
   frameborder="0"
   width="100%"
   height="560px"
@@ -44,7 +44,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer

 model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
-train_dataset = load_dataset("trl-lib/Capybara-Preferences", split="train")
+train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

 training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO", logging_steps=10)
 trainer = DPOTrainer(model=model, args=training_args, tokenizer=tokenizer, train_dataset=train_dataset)
@@ -190,7 +190,7 @@ First install `unsloth` according to the [official documentation](https://github
 - tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
 + model, tokenizer = FastLanguageModel.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
 + model = FastLanguageModel.get_peft_model(model)
-train_dataset = load_dataset("trl-lib/Capybara-Preferences", split="train")
+train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

 - training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO", logging_steps=10)
 + training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO", logging_steps=10, bf16=True)
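
Putting the quick-start hunk and the unsloth hunk together, the unsloth variant of the DPO example would read roughly as below. This is a composite sketch built only from lines visible in this diff (the `FastLanguageModel` calls and the `bf16=True` flag come from the doc's own snippet), plus assumed imports; it is not an authoritative unsloth recipe.

```python
# Sketch combining the DPO quick-start with the unsloth substitutions shown above.
# Assumes unsloth, trl, transformers, and datasets are installed and that this
# TRL version accepts the tokenizer= argument used throughout this diff.
from datasets import load_dataset
from trl import DPOConfig, DPOTrainer
from unsloth import FastLanguageModel

# unsloth replaces the AutoModel/AutoTokenizer pair and wraps the model with PEFT adapters
model, tokenizer = FastLanguageModel.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
model = FastLanguageModel.get_peft_model(model)

train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO", logging_steps=10, bf16=True)
trainer = DPOTrainer(model=model, args=training_args, tokenizer=tokenizer, train_dataset=train_dataset)
trainer.train()
```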
