Skip to content

Commit 1f9394d

Browse files
committed
Add examples for how to use data mixing
This adds a new docs/examples/mix_datasets folder with a couple of example recipes, two sample datasets, and an example_mixing.py Python script to show how to mix datasets. This also adds a test_examples.py file that actually runs our examples, ensuring they work without error and generate the expected mixed datasets. Signed-off-by: Ben Browning <[email protected]>
1 parent 6c8544e commit 1f9394d

11 files changed

+203
-77
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -167,3 +167,6 @@ cython_debug/
167167

168168
# IDEs
169169
.vscode/
170+
171+
# SDG examples output
172+
docs/examples/**/output
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# An example of how to concatenate two datasets
2+
# Each dataset has a sampling_size of 1.0 to take all samples from both
3+
datasets:
4+
- path: dataset_1.jsonl
5+
sampling_size: 1.0
6+
- path: dataset_2.jsonl
7+
sampling_size: 1.0
8+
sys_prompt: I am a reliable AI assistant.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{"id": "dataset_1_1", "messages": [], "metadata": {}}
2+
{"id": "dataset_1_2", "messages": [], "metadata": {}}
3+
{"id": "dataset_1_3", "messages": [], "metadata": {}}
4+
{"id": "dataset_1_4", "messages": [], "metadata": {}}
5+
{"id": "dataset_1_5", "messages": [], "metadata": {}}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{"id": "dataset_2_1", "messages": [], "metadata": {}}
2+
{"id": "dataset_2_2", "messages": [], "metadata": {}}
3+
{"id": "dataset_2_3", "messages": [], "metadata": {}}
4+
{"id": "dataset_2_4", "messages": [], "metadata": {}}
5+
{"id": "dataset_2_5", "messages": [], "metadata": {}}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
3+
# Standard
4+
from pathlib import Path
5+
6+
# First Party
7+
from instructlab.sdg import mix_datasets
8+
9+
output_dir = Path(__file__).parent.joinpath("output")
10+
output_dir.mkdir(exist_ok=True)
11+
12+
concatenate_recipe_yaml = Path(__file__).parent.joinpath("concatenate_recipe.yaml")
13+
concatenated_output_jsonl = output_dir.joinpath("concatenated.jsonl")
14+
mix_datasets(concatenate_recipe_yaml, concatenated_output_jsonl)
15+
16+
weighted_recipe_yaml = Path(__file__).parent.joinpath("weighted_recipe.yaml")
17+
weighted_output_jsonl = output_dir.joinpath("weighted.jsonl")
18+
mix_datasets(weighted_recipe_yaml, weighted_output_jsonl)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# An example of how to weight one dataset over another
2+
# Dataset 1 has a sampling size of 2.0 to double its samples
3+
# Dataset 2 has a sampling size of 0.2 to take 20% of its samples
4+
datasets:
5+
- path: dataset_1.jsonl
6+
sampling_size: 2.0
7+
- path: dataset_2.jsonl
8+
sampling_size: 0.2
9+
sys_prompt: I am a reliable AI assistant.

tests/functional/conftest.py

+7
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,16 @@
66
import pytest
77

88
TESTS_PATH = pathlib.Path(__file__).parent.parent.absolute()
9+
EXAMPLES_PATH = TESTS_PATH.parent.joinpath("docs", "examples")
910

1011

1112
@pytest.fixture
1213
def testdata_path() -> typing.Generator[pathlib.Path, None, None]:
1314
"""Path to local test data directory"""
1415
yield TESTS_PATH / "testdata"
16+
17+
18+
@pytest.fixture
19+
def examples_path() -> typing.Generator[pathlib.Path, None, None]:
20+
"""Path to examples directory"""
21+
yield EXAMPLES_PATH

tests/functional/test_examples.py

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
3+
# Standard
4+
import pathlib
5+
import shutil
6+
import subprocess
7+
import sys
8+
9+
# First Party
10+
from instructlab.sdg.utils.json import jlload
11+
12+
13+
def test_example_mixing(tmp_path: pathlib.Path, examples_path: pathlib.Path):
14+
example_copy_path = tmp_path.joinpath("mix_datasets")
15+
shutil.copytree(examples_path.joinpath("mix_datasets"), example_copy_path)
16+
script = example_copy_path.joinpath("example_mixing.py")
17+
subprocess.check_call([sys.executable, str(script)], text=True)
18+
19+
concatenated = jlload(example_copy_path.joinpath("output", "concatenated.jsonl"))
20+
assert len(concatenated) == 10
21+
from_ds_1 = []
22+
from_ds_2 = []
23+
for sample in concatenated:
24+
if sample["id"].startswith("dataset_1"):
25+
from_ds_1.append(sample)
26+
else:
27+
from_ds_2.append(sample)
28+
assert len(from_ds_1) == len(from_ds_2) == 5
29+
30+
weighted = jlload(example_copy_path.joinpath("output", "weighted.jsonl"))
31+
assert len(weighted) == 11
32+
from_ds_1 = []
33+
from_ds_2 = []
34+
for sample in weighted:
35+
if sample["id"].startswith("dataset_1"):
36+
from_ds_1.append(sample)
37+
else:
38+
from_ds_2.append(sample)
39+
assert len(from_ds_1) == 10
40+
assert len(from_ds_2) == 1

tests/functional/test_granular_api.py

+99-71
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,13 @@
44
from datetime import datetime
55
from unittest.mock import MagicMock
66
import glob
7+
import os
78
import pathlib
9+
import unittest
810

911
# Third Party
1012
import git
13+
import pytest
1114

1215
# First Party
1316
from instructlab.sdg import BlockRegistry
@@ -20,6 +23,7 @@
2023

2124
# Local
2225
from ..mockllmblock import MockLLMBlock
26+
from ..taxonomy import load_test_skills
2327

2428

2529
def _clone_instructlab_taxonomy(taxonomy_dir):
@@ -29,75 +33,99 @@ def _clone_instructlab_taxonomy(taxonomy_dir):
2933
repo.git.checkout(taxonomy_commit)
3034

3135

32-
def test_granular_api_end_to_end(testdata_path: pathlib.Path, tmp_path: pathlib.Path):
33-
# Register our mock block so we can reference it in pipelines
34-
BlockRegistry.register("MockLLMBlock")(MockLLMBlock)
35-
36-
# Clone a taxonomy and edit 1 file in it
37-
taxonomy_dir = tmp_path.joinpath("taxonomy")
38-
_clone_instructlab_taxonomy(taxonomy_dir)
39-
changed_qna_yaml = taxonomy_dir.joinpath(
40-
"knowledge", "science", "animals", "birds", "black_capped_chickadee", "qna.yaml"
41-
)
42-
with open(changed_qna_yaml, "a", encoding="utf-8") as file:
43-
file.write("")
44-
45-
pipeline_dir = testdata_path.joinpath("mock_pipelines")
46-
date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
47-
48-
preprocessed_dir = tmp_path.joinpath("preprocessed")
49-
teacher_model_path = testdata_path.joinpath("models/instructlab/granite-7b-lab")
50-
preprocess_taxonomy(
51-
taxonomy_dir=taxonomy_dir,
52-
output_dir=preprocessed_dir,
53-
teacher_model_path=teacher_model_path,
54-
)
55-
chickadee_docs = glob.glob(
56-
str(
57-
preprocessed_dir.joinpath(
58-
"documents", "knowledge_science_*", "chickadee.md"
59-
)
36+
class TestGranularAPI(unittest.TestCase):
37+
@pytest.fixture(autouse=True)
38+
def _init_taxonomy(self, taxonomy_dir, testdata_path, tmp_path):
39+
self.test_taxonomy = taxonomy_dir
40+
self.testdata_path = testdata_path
41+
self.tmp_path = tmp_path
42+
43+
def setUp(self):
44+
test_valid_knowledge_skill_file = self.testdata_path.joinpath(
45+
"test_valid_knowledge_skill.yaml"
46+
)
47+
untracked_knowledge_file = os.path.join("knowledge", "new", "qna.yaml")
48+
test_valid_knowledge_skill = load_test_skills(test_valid_knowledge_skill_file)
49+
self.test_taxonomy.create_untracked(
50+
untracked_knowledge_file, test_valid_knowledge_skill
51+
)
52+
53+
def file_list(self):
54+
return glob.glob(str(self.tmp_path.joinpath("**/*")), recursive=True)
55+
56+
def test_granular_api_end_to_end(self):
57+
# Register our mock block so we can reference it in pipelines
58+
BlockRegistry.register("MockLLMBlock")(MockLLMBlock)
59+
60+
# Clone a taxonomy and edit 1 file in it
61+
taxonomy_dir = self.tmp_path
62+
63+
pipeline_dir = self.testdata_path.joinpath("mock_pipelines")
64+
date_suffix = (
65+
datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
66+
)
67+
68+
preprocessed_dir = self.tmp_path.joinpath("preprocessed")
69+
teacher_model_path = self.testdata_path.joinpath(
70+
"models/instructlab/granite-7b-lab"
71+
)
72+
preprocess_taxonomy(
73+
taxonomy_dir=taxonomy_dir,
74+
output_dir=preprocessed_dir,
75+
teacher_model_path=teacher_model_path,
76+
)
77+
docs = glob.glob(
78+
str(preprocessed_dir.joinpath("documents", "knowledge_new_*", "phoenix.md"))
79+
)
80+
assert docs, f"Expected docs not found in {self.file_list()}"
81+
samples_path = preprocessed_dir.joinpath("knowledge_new.jsonl")
82+
assert (
83+
samples_path.is_file()
84+
), f"Expected samples file not found in {self.file_list()}"
85+
86+
client = MagicMock()
87+
client.server_supports_batched = False
88+
generated_dir = self.tmp_path.joinpath("generated")
89+
generate_taxonomy(
90+
client=client,
91+
input_dir=preprocessed_dir,
92+
output_dir=generated_dir,
93+
pipeline=pipeline_dir,
94+
num_cpus=1, # Test is faster running on a single CPU vs forking
95+
batch_size=0, # Disable batch for tiny dataset and fastest test
96+
)
97+
generated_samples_path = generated_dir.joinpath("knowledge_new.jsonl")
98+
assert (
99+
generated_samples_path.is_file()
100+
), f"Generated samples not found in {self.file_list()}"
101+
102+
postprocessed_dir = self.tmp_path.joinpath("postprocessed")
103+
postprocess_taxonomy(
104+
input_dir=generated_dir,
105+
output_dir=postprocessed_dir,
106+
date_suffix=date_suffix,
107+
pipeline=pipeline_dir,
108+
)
109+
knowledge_recipe_file = postprocessed_dir.joinpath(
110+
f"knowledge_recipe_{date_suffix}.yaml"
111+
)
112+
assert (
113+
knowledge_recipe_file.is_file()
114+
), f"Generated knowledge recipe file not found in {self.file_list()}"
115+
skills_recipe_file = postprocessed_dir.joinpath(
116+
f"skills_recipe_{date_suffix}.yaml"
117+
)
118+
assert (
119+
skills_recipe_file.is_file()
120+
), f"Generated skills recipe file not found in {self.file_list()}"
121+
122+
mixed_skills_output_file = (
123+
f"{postprocessed_dir}/skills_train_msgs_{date_suffix}.jsonl"
124+
)
125+
mix_datasets(
126+
recipe_file=f"{postprocessed_dir}/skills_recipe_{date_suffix}.yaml",
127+
output_file=mixed_skills_output_file,
60128
)
61-
)
62-
assert chickadee_docs
63-
chickadee_samples_path = preprocessed_dir.joinpath(
64-
"knowledge_science_animals_birds_black_capped_chickadee.jsonl"
65-
)
66-
assert chickadee_samples_path.is_file()
67-
68-
client = MagicMock()
69-
client.server_supports_batched = False
70-
generated_dir = tmp_path.joinpath("generated")
71-
generate_taxonomy(
72-
client=client,
73-
input_dir=preprocessed_dir,
74-
output_dir=generated_dir,
75-
pipeline=pipeline_dir,
76-
)
77-
generated_chickadee_samples_path = generated_dir.joinpath(
78-
"knowledge_science_animals_birds_black_capped_chickadee.jsonl"
79-
)
80-
assert generated_chickadee_samples_path.is_file()
81-
82-
postprocessed_dir = tmp_path.joinpath("postprocessed")
83-
postprocess_taxonomy(
84-
input_dir=generated_dir,
85-
output_dir=postprocessed_dir,
86-
date_suffix=date_suffix,
87-
pipeline=pipeline_dir,
88-
)
89-
knowledge_recipe_file = postprocessed_dir.joinpath(
90-
f"knowledge_recipe_{date_suffix}.yaml"
91-
)
92-
assert knowledge_recipe_file.is_file()
93-
skills_recipe_file = postprocessed_dir.joinpath(f"skills_recipe_{date_suffix}.yaml")
94-
assert skills_recipe_file.is_file()
95-
96-
mixed_skills_output_file = (
97-
f"{postprocessed_dir}/skills_train_msgs_{date_suffix}.jsonl"
98-
)
99-
mix_datasets(
100-
recipe_file=f"{postprocessed_dir}/skills_recipe_{date_suffix}.yaml",
101-
output_file=mixed_skills_output_file,
102-
)
103-
assert pathlib.Path(mixed_skills_output_file).is_file()
129+
assert pathlib.Path(
130+
mixed_skills_output_file
131+
).is_file(), f"Generated mixed output not found in {self.file_list()}"

tests/taxonomy.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
# Standard
44
from pathlib import Path
5-
from typing import Any, Dict, List
5+
from typing import Any, Dict, List, Union
66
import shutil
77

88
# Third Party
@@ -68,3 +68,8 @@ def __enter__(self):
6868

6969
def __exit__(self, *args):
7070
self.teardown()
71+
72+
73+
def load_test_skills(skills_file_path) -> Union[Dict[str, Any], None]:
74+
with open(skills_file_path, "r", encoding="utf-8") as skills_file:
75+
return yaml.safe_load(skills_file)

tests/test_generate_data.py

+3-5
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@
2828
generate_data,
2929
)
3030

31+
# Local
32+
from .taxonomy import load_test_skills
33+
3134
TEST_SYS_PROMPT = "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant."
3235

3336
TEST_TAXONOMY_BASE = "main"
@@ -232,11 +235,6 @@ def add_question_mark(q):
232235
return train_samples
233236

234237

235-
def load_test_skills(skills_file_path) -> Union[Dict[str, Any], None]:
236-
with open(skills_file_path, "r", encoding="utf-8") as skills_file:
237-
return yaml.safe_load(skills_file)
238-
239-
240238
def _noop_llmblock_generate(self, samples):
241239
"""Generate mock output based on input samples.
242240

0 commit comments

Comments
 (0)