4
4
from datetime import datetime
5
5
from unittest .mock import MagicMock
6
6
import glob
7
+ import os
7
8
import pathlib
9
+ import unittest
8
10
9
11
# Third Party
10
12
import git
13
+ import pytest
11
14
12
15
# First Party
13
16
from instructlab .sdg import BlockRegistry
20
23
21
24
# Local
22
25
from ..mockllmblock import MockLLMBlock
26
+ from ..taxonomy import load_test_skills
23
27
24
28
25
29
def _clone_instructlab_taxonomy (taxonomy_dir ):
@@ -29,75 +33,99 @@ def _clone_instructlab_taxonomy(taxonomy_dir):
29
33
repo .git .checkout (taxonomy_commit )
30
34
31
35
32
- def test_granular_api_end_to_end (testdata_path : pathlib .Path , tmp_path : pathlib .Path ):
33
- # Register our mock block so we can reference it in pipelines
34
- BlockRegistry .register ("MockLLMBlock" )(MockLLMBlock )
35
-
36
- # Clone a taxonomy and edit 1 file in it
37
- taxonomy_dir = tmp_path .joinpath ("taxonomy" )
38
- _clone_instructlab_taxonomy (taxonomy_dir )
39
- changed_qna_yaml = taxonomy_dir .joinpath (
40
- "knowledge" , "science" , "animals" , "birds" , "black_capped_chickadee" , "qna.yaml"
41
- )
42
- with open (changed_qna_yaml , "a" , encoding = "utf-8" ) as file :
43
- file .write ("" )
44
-
45
- pipeline_dir = testdata_path .joinpath ("mock_pipelines" )
46
- date_suffix = datetime .now ().replace (microsecond = 0 ).isoformat ().replace (":" , "_" )
47
-
48
- preprocessed_dir = tmp_path .joinpath ("preprocessed" )
49
- teacher_model_path = testdata_path .joinpath ("models/instructlab/granite-7b-lab" )
50
- preprocess_taxonomy (
51
- taxonomy_dir = taxonomy_dir ,
52
- output_dir = preprocessed_dir ,
53
- teacher_model_path = teacher_model_path ,
54
- )
55
- chickadee_docs = glob .glob (
56
- str (
57
- preprocessed_dir .joinpath (
58
- "documents" , "knowledge_science_*" , "chickadee.md"
59
- )
36
+ class TestGranularAPI (unittest .TestCase ):
37
+ @pytest .fixture (autouse = True )
38
+ def _init_taxonomy (self , taxonomy_dir , testdata_path , tmp_path ):
39
+ self .test_taxonomy = taxonomy_dir
40
+ self .testdata_path = testdata_path
41
+ self .tmp_path = tmp_path
42
+
43
+ def setUp (self ):
44
+ test_valid_knowledge_skill_file = self .testdata_path .joinpath (
45
+ "test_valid_knowledge_skill.yaml"
46
+ )
47
+ untracked_knowledge_file = os .path .join ("knowledge" , "new" , "qna.yaml" )
48
+ test_valid_knowledge_skill = load_test_skills (test_valid_knowledge_skill_file )
49
+ self .test_taxonomy .create_untracked (
50
+ untracked_knowledge_file , test_valid_knowledge_skill
51
+ )
52
+
53
+ def file_list (self ):
54
+ return glob .glob (str (self .tmp_path .joinpath ("**/*" )), recursive = True )
55
+
56
+ def test_granular_api_end_to_end (self ):
57
+ # Register our mock block so we can reference it in pipelines
58
+ BlockRegistry .register ("MockLLMBlock" )(MockLLMBlock )
59
+
60
+ # Use tmp_path as the taxonomy directory (assumed to be where the taxonomy_dir fixture is rooted; TODO confirm)
61
+ taxonomy_dir = self .tmp_path
62
+
63
+ pipeline_dir = self .testdata_path .joinpath ("mock_pipelines" )
64
+ date_suffix = (
65
+ datetime .now ().replace (microsecond = 0 ).isoformat ().replace (":" , "_" )
66
+ )
67
+
68
+ preprocessed_dir = self .tmp_path .joinpath ("preprocessed" )
69
+ teacher_model_path = self .testdata_path .joinpath (
70
+ "models/instructlab/granite-7b-lab"
71
+ )
72
+ preprocess_taxonomy (
73
+ taxonomy_dir = taxonomy_dir ,
74
+ output_dir = preprocessed_dir ,
75
+ teacher_model_path = teacher_model_path ,
76
+ )
77
+ docs = glob .glob (
78
+ str (preprocessed_dir .joinpath ("documents" , "knowledge_new_*" , "phoenix.md" ))
79
+ )
80
+ assert docs , f"Expected docs not found in { self .file_list ()} "
81
+ samples_path = preprocessed_dir .joinpath ("knowledge_new.jsonl" )
82
+ assert (
83
+ samples_path .is_file ()
84
+ ), f"Expected samples file not found in { self .file_list ()} "
85
+
86
+ client = MagicMock ()
87
+ client .server_supports_batched = False
88
+ generated_dir = self .tmp_path .joinpath ("generated" )
89
+ generate_taxonomy (
90
+ client = client ,
91
+ input_dir = preprocessed_dir ,
92
+ output_dir = generated_dir ,
93
+ pipeline = pipeline_dir ,
94
+ num_cpus = 1 , # Test is faster running on a single CPU vs forking
95
+ batch_size = 0 , # Disable batch for tiny dataset and fastest test
96
+ )
97
+ generated_samples_path = generated_dir .joinpath ("knowledge_new.jsonl" )
98
+ assert (
99
+ generated_samples_path .is_file ()
100
+ ), f"Generated samples not found in { self .file_list ()} "
101
+
102
+ postprocessed_dir = self .tmp_path .joinpath ("postprocessed" )
103
+ postprocess_taxonomy (
104
+ input_dir = generated_dir ,
105
+ output_dir = postprocessed_dir ,
106
+ date_suffix = date_suffix ,
107
+ pipeline = pipeline_dir ,
108
+ )
109
+ knowledge_recipe_file = postprocessed_dir .joinpath (
110
+ f"knowledge_recipe_{ date_suffix } .yaml"
111
+ )
112
+ assert (
113
+ knowledge_recipe_file .is_file ()
114
+ ), f"Generated knowledge recipe file not found in { self .file_list ()} "
115
+ skills_recipe_file = postprocessed_dir .joinpath (
116
+ f"skills_recipe_{ date_suffix } .yaml"
117
+ )
118
+ assert (
119
+ skills_recipe_file .is_file ()
120
+ ), f"Generated skills recipe file not found in { self .file_list ()} "
121
+
122
+ mixed_skills_output_file = (
123
+ f"{ postprocessed_dir } /skills_train_msgs_{ date_suffix } .jsonl"
124
+ )
125
+ mix_datasets (
126
+ recipe_file = f"{ postprocessed_dir } /skills_recipe_{ date_suffix } .yaml" ,
127
+ output_file = mixed_skills_output_file ,
60
128
)
61
- )
62
- assert chickadee_docs
63
- chickadee_samples_path = preprocessed_dir .joinpath (
64
- "knowledge_science_animals_birds_black_capped_chickadee.jsonl"
65
- )
66
- assert chickadee_samples_path .is_file ()
67
-
68
- client = MagicMock ()
69
- client .server_supports_batched = False
70
- generated_dir = tmp_path .joinpath ("generated" )
71
- generate_taxonomy (
72
- client = client ,
73
- input_dir = preprocessed_dir ,
74
- output_dir = generated_dir ,
75
- pipeline = pipeline_dir ,
76
- )
77
- generated_chickadee_samples_path = generated_dir .joinpath (
78
- "knowledge_science_animals_birds_black_capped_chickadee.jsonl"
79
- )
80
- assert generated_chickadee_samples_path .is_file ()
81
-
82
- postprocessed_dir = tmp_path .joinpath ("postprocessed" )
83
- postprocess_taxonomy (
84
- input_dir = generated_dir ,
85
- output_dir = postprocessed_dir ,
86
- date_suffix = date_suffix ,
87
- pipeline = pipeline_dir ,
88
- )
89
- knowledge_recipe_file = postprocessed_dir .joinpath (
90
- f"knowledge_recipe_{ date_suffix } .yaml"
91
- )
92
- assert knowledge_recipe_file .is_file ()
93
- skills_recipe_file = postprocessed_dir .joinpath (f"skills_recipe_{ date_suffix } .yaml" )
94
- assert skills_recipe_file .is_file ()
95
-
96
- mixed_skills_output_file = (
97
- f"{ postprocessed_dir } /skills_train_msgs_{ date_suffix } .jsonl"
98
- )
99
- mix_datasets (
100
- recipe_file = f"{ postprocessed_dir } /skills_recipe_{ date_suffix } .yaml" ,
101
- output_file = mixed_skills_output_file ,
102
- )
103
- assert pathlib .Path (mixed_skills_output_file ).is_file ()
129
+ assert pathlib .Path (
130
+ mixed_skills_output_file
131
+ ).is_file (), f"Generated mixed output not found in { self .file_list ()} "
0 commit comments