Skip to content

Commit 695c651

Browse files
authored
Merge pull request #443 from bbrowning/separation-of-concerns
Split up `generate_data` and add a `mix_datasets` top level API
2 parents a532a8d + 1f9394d commit 695c651

25 files changed

+1037
-215
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -167,3 +167,6 @@ cython_debug/
167167

168168
# IDEs
169169
.vscode/
170+
171+
# SDG examples output
172+
docs/examples/**/output
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# An example of how to concatenate two datasets
2+
# Each dataset has a sampling_size of 1.0 to take all samples from both
3+
datasets:
4+
- path: dataset_1.jsonl
5+
sampling_size: 1.0
6+
- path: dataset_2.jsonl
7+
sampling_size: 1.0
8+
sys_prompt: I am a reliable AI assistant.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{"id": "dataset_1_1", "messages": [], "metadata": {}}
2+
{"id": "dataset_1_2", "messages": [], "metadata": {}}
3+
{"id": "dataset_1_3", "messages": [], "metadata": {}}
4+
{"id": "dataset_1_4", "messages": [], "metadata": {}}
5+
{"id": "dataset_1_5", "messages": [], "metadata": {}}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{"id": "dataset_2_1", "messages": [], "metadata": {}}
2+
{"id": "dataset_2_2", "messages": [], "metadata": {}}
3+
{"id": "dataset_2_3", "messages": [], "metadata": {}}
4+
{"id": "dataset_2_4", "messages": [], "metadata": {}}
5+
{"id": "dataset_2_5", "messages": [], "metadata": {}}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
3+
# Standard
4+
from pathlib import Path
5+
6+
# First Party
7+
from instructlab.sdg import mix_datasets
8+
9+
output_dir = Path(__file__).parent.joinpath("output")
10+
output_dir.mkdir(exist_ok=True)
11+
12+
concatenate_recipe_yaml = Path(__file__).parent.joinpath("concatenate_recipe.yaml")
13+
concatenated_output_jsonl = output_dir.joinpath("concatenated.jsonl")
14+
mix_datasets(concatenate_recipe_yaml, concatenated_output_jsonl)
15+
16+
weighted_recipe_yaml = Path(__file__).parent.joinpath("weighted_recipe.yaml")
17+
weighted_output_jsonl = output_dir.joinpath("weighted.jsonl")
18+
mix_datasets(weighted_recipe_yaml, weighted_output_jsonl)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# An example of how to weight one dataset over another
2+
# Dataset 1 has a sampling_size of 2.0 to double its samples
3+
# Dataset 2 has a sampling_size of 0.2 to take 20% of its samples
4+
datasets:
5+
- path: dataset_1.jsonl
6+
sampling_size: 2.0
7+
- path: dataset_2.jsonl
8+
sampling_size: 0.2
9+
sys_prompt: I am a reliable AI assistant.

src/instructlab/sdg/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
"FULL_PIPELINES_PACKAGE",
3030
"SIMPLE_PIPELINES_PACKAGE",
3131
"generate_data",
32+
"mix_datasets",
3233
)
3334

3435
# Local
@@ -50,7 +51,7 @@
5051
SelectorBlock,
5152
SetToMajorityValueBlock,
5253
)
53-
from .generate_data import generate_data
54+
from .generate_data import generate_data, mix_datasets
5455
from .pipeline import (
5556
FULL_PIPELINES_PACKAGE,
5657
SIMPLE_PIPELINES_PACKAGE,

src/instructlab/sdg/datamixing.py

+20-3
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ def _create_mixed_dataset(self, num_proc):
160160
Create the final mixed dataset by loading, sampling, and
161161
concatenating all datasets in this recipe
162162
"""
163-
if not self.dataset_added:
163+
if not self.datasets:
164164
logger.error("No dataset added to the recipe")
165165

166166
mixed_ds = self._load_and_sample_datasets(num_proc)
@@ -726,19 +726,36 @@ def collect(
726726
sampling_size=self.NUM_SYNTH_SKILLS,
727727
)
728728

729+
def _write_mixed_recipe(self, recipe, output_file_recipe):
730+
"""
731+
Write the recipes created during data mixing without writing the actual
732+
mixed datasets to disk.
733+
"""
734+
full_recipe_path = os.path.join(self.output_dir, output_file_recipe)
735+
recipe.save_recipe(full_recipe_path)
736+
729737
def _gen_mixed_data(self, recipe, output_file_recipe, output_file_data):
730738
"""
731739
Mix the generated leaf node data into a single dataset and write it to
732740
disk. The heavy lifting is delegated to the Recipe class.
733741
"""
742+
self._write_mixed_recipe(recipe, output_file_recipe)
734743
if recipe.dataset_added:
735-
full_recipe_path = os.path.join(self.output_dir, output_file_recipe)
736-
recipe.save_recipe(full_recipe_path)
737744
recipe.save_mixed_dataset(
738745
os.path.join(self.output_dir, output_file_data),
739746
self.num_procs,
740747
)
741748

749+
def write_recipes(self):
750+
self._write_mixed_recipe(
751+
self.knowledge_recipe,
752+
self.output_file_knowledge_recipe,
753+
)
754+
self._write_mixed_recipe(
755+
self.skills_recipe,
756+
self.output_file_skills_recipe,
757+
)
758+
742759
def generate(self):
743760
self._gen_mixed_data(
744761
self.knowledge_recipe,

0 commit comments

Comments
 (0)