|
13 | 13 | vLLMEngineProcessorConfig, |
14 | 14 | ) |
15 | 15 |
|
| 16 | +from skythought.evals.scoring.apps import APPSScorer |
16 | 17 | from skythought.evals.scoring.math import MathEqualScorer |
| 18 | +from skythought.evals.scoring.taco import TACOScorer |
17 | 19 |
|
18 | | -from .postprocess import APPSScorer, TACOScorer, convert_to_sharegpt_format |
| 20 | +from .postprocess import convert_to_sharegpt_format |
19 | 21 | from .preprocess import APPSPreprocessor, NUMINAPreprocessor, TACOPreprocessor |
20 | | -from .prompts import CONVERT_PROMPT |
| 22 | +from .prompts import CONVERT_PROMPT, CONVERT_PROMPT_EXAMPLE |
21 | 23 |
|
22 | 24 | parser = argparse.ArgumentParser() |
23 | 25 | parser.add_argument("--as-test", action="store_true") |
|
26 | 28 | SYSTEM_PROMPT = "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step." # noqa: E501 |
27 | 29 |
|
28 | 30 | # 1. Load datasets |
29 | | -apps_ds = datasets.load_dataset("codeparrot/apps", split="test", streaming=True) |
30 | | -taco_ds_medium = datasets.load_dataset( |
31 | | - "BAAI/TACO", split="test", name="MEDIUM", streaming=True |
| 31 | +apps_ds = datasets.load_dataset( |
| 32 | + "codeparrot/apps", |
| 33 | + split="test", |
32 | 34 | ) |
33 | | -numina_ds = datasets.load_dataset("AI-MO/NuminaMath-CoT", split="train", streaming=True) |
| 35 | +taco_ds_medium = datasets.load_dataset("BAAI/TACO", split="test", name="MEDIUM") |
| 36 | +numina_ds = datasets.load_dataset("AI-MO/NuminaMath-CoT", split="train") |
34 | 37 |
|
35 | 38 | # convert all to ray dataset |
36 | 39 | apps_ds = ray.data.from_huggingface(apps_ds) |
|
45 | 48 |
|
46 | 49 |
|
47 | 50 | if args.as_test: |
48 | | - apps_ds = apps_ds.limit(100) |
49 | | - taco_ds_medium = taco_ds_medium.limit(100) |
50 | | - numina_ds_amc_aime = numina_ds_amc_aime.limit(100) |
51 | | - numina_ds_olympiads = numina_ds_olympiads.limit(100) |
52 | | - numina_ds_math = numina_ds_math.limit(100) |
| 51 | + apps_ds = apps_ds.limit(5) |
| 52 | + taco_ds_medium = taco_ds_medium.limit(5) |
| 53 | + numina_ds_amc_aime = numina_ds_amc_aime.limit(5) |
| 54 | + numina_ds_olympiads = numina_ds_olympiads.limit(5) |
| 55 | + numina_ds_math = numina_ds_math.limit(5) |
53 | 56 |
|
54 | 57 | # 2. Get model responses for each of the datasets |
55 | 58 | datasets = [ |
|
69 | 72 | NUMINAPreprocessor(), |
70 | 73 | ] |
71 | 74 |
|
| 75 | +numina_scorer = MathEqualScorer( |
| 76 | + response_column="formatted_response", answer_column="solution" |
| 77 | +) |
| 78 | +scorers = [ |
| 79 | + APPSScorer(response_column="formatted_response"), |
| 80 | + TACOScorer(response_column="formatted_response"), |
| 81 | + numina_scorer, |
| 82 | + numina_scorer, |
| 83 | + numina_scorer, |
| 84 | +] |
| 85 | + |
72 | 86 | for i, ds in enumerate(datasets): |
73 | 87 | datasets[i] = ds.map(preprocess_fns[i]) |
74 | 88 |
|
75 | | - # our API |
76 | 89 | config = vLLMEngineProcessorConfig( |
77 | 90 | # model="Qwen/QwQ-32B-Preview", |
78 | 91 | model="Qwen/Qwen2-0.5B-Instruct", |
|
85 | 98 | batch_size=64, |
86 | 99 | ) |
87 | 100 |
|
88 | | - # our API |
89 | 101 | processor = build_llm_processor( |
90 | 102 | config, |
91 | 103 | preprocess=lambda row: dict( |
92 | 104 | messages=[ |
93 | | - SYSTEM_PROMPT, |
| 105 | + {"role": "system", "content": SYSTEM_PROMPT}, |
94 | 106 | {"role": "user", "content": row["user_input"]}, |
95 | 107 | ], |
96 | 108 | sampling_params=dict( |
|
104 | 116 | **row, # This will return all the original columns in the dataset. |
105 | 117 | ), |
106 | 118 | ) |
107 | | - # our API |
108 | | - datasets[i] = processor(ds) |
109 | | - |
110 | | -# 3. Reformat the examples into a structured format |
111 | | -# define a configuration for the reformatter |
112 | | -config = HttpRequestProcessorConfig( |
113 | | - url="https://api.openai.com/v1/chat/completions", |
114 | | - headers={"Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"}, |
115 | | - # number of processors to run in parallel |
116 | | - # Each handles a batch of requests |
117 | | - concurrency=1, |
118 | | -) |
119 | | -# define the reformatter |
120 | | -reformatter = build_llm_processor( |
121 | | - config=config, |
122 | | - preprocess=lambda row: dict( |
123 | | - # define the payload / the exact arguments to the OpenAI chat completions API |
124 | | - payload=dict( |
125 | | - model="gpt-4o-mini", |
126 | | - messages=[ |
127 | | - {"role": "system", "content": "You are a solution format convertor."}, |
128 | | - { |
129 | | - "role": "user", |
130 | | - "content": CONVERT_PROMPT.format( |
131 | | - content=f"{row['question']}\n{row['assistant_response']}" |
132 | | - ), |
133 | | - }, |
134 | | - ], |
135 | | - temperature=0.7, |
136 | | - max_tokens=16384, |
| 119 | + datasets[i] = processor(datasets[i]) |
| 120 | + |
| 121 | + # 3. Reformat the examples into a structured format |
| 122 | + # define a configuration for the reformatter |
| 123 | + config = HttpRequestProcessorConfig( |
| 124 | + url="https://api.openai.com/v1/chat/completions", |
| 125 | + headers={"Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"}, |
| 126 | + # number of processors to run in parallel |
| 127 | + # Each handles a batch of requests |
| 128 | + concurrency=1, |
| 129 | + ) |
| 130 | + # define the reformatter |
| 131 | + reformatter = build_llm_processor( |
| 132 | + config, |
| 133 | + preprocess=lambda row: dict( |
| 134 | + # define the payload / the exact arguments to the OpenAI chat completions API |
| 135 | + payload=dict( |
| 136 | + model="gpt-4o-mini", |
| 137 | + messages=[ |
| 138 | + { |
| 139 | + "role": "system", |
| 140 | + "content": "You are a solution format convertor.", |
| 141 | + }, |
| 142 | + { |
| 143 | + "role": "user", |
| 144 | + "content": CONVERT_PROMPT.format( |
| 145 | + example=CONVERT_PROMPT_EXAMPLE, |
| 146 | + content=f"{row['question']}\n{row['assistant_response']}", |
| 147 | + ), |
| 148 | + }, |
| 149 | + ], |
| 150 | + temperature=0.7, |
| 151 | + max_tokens=2048, |
| 152 | + ), |
137 | 153 | ), |
138 | | - ), |
139 | | - postprocess=lambda row: dict( |
140 | | - formatted_response=row["http_response"]["choices"][0]["message"]["content"], |
141 | | - ), |
142 | | - batch_size=64, |
143 | | -) |
144 | | - |
145 | | -for i, dataset in enumerate(datasets): |
146 | | - datasets[i] = reformatter(dataset) |
147 | | - |
148 | | - |
149 | | -# 4. Rejection Sampling based on scoring |
150 | | -# apps, taco, numina-amc-aime, numina-olympiads, numina-math |
151 | | -numina_scorer = MathEqualScorer( |
152 | | - response_key="formatted_response", answer_key="solution" |
153 | | -) |
154 | | -scorers = [APPSScorer(), TACOScorer(), numina_scorer, numina_scorer, numina_scorer] |
| 154 | + postprocess=lambda row: dict( |
| 155 | + formatted_response=row["http_response"]["choices"][0]["message"]["content"], |
| 156 | + **row, |
| 157 | + ), |
| 158 | + ) |
| 159 | + datasets[i] = reformatter(datasets[i]) |
155 | 160 |
|
156 | | -for i, dataset in enumerate(datasets): |
157 | | - fn = scorers[i] |
158 | | - datasets[i] = dataset.map(fn) |
| 161 | + # 4. Rejection Sampling based on scoring |
| 162 | + # apps, taco, numina-amc-aime, numina-olympiads, numina-math |
| 163 | + datasets[i] = datasets[i].map(scorers[i]) |
| 164 | + # datasets[i] = datasets[i].filter(lambda x: x[scorers[i].SCORE_COLUMN]) |
159 | 165 |
|
160 | | -# 5. Convert to ShareGPT format |
161 | | -for i, dataset in enumerate(datasets): |
162 | | - datasets[i] = dataset.map(convert_to_sharegpt_format) |
| 166 | + # 5. Convert to ShareGPT format |
| 167 | + datasets[i] = datasets[i].map(convert_to_sharegpt_format) |
163 | 168 |
|
164 | | -# 6. Union + Save datasets |
165 | | -datasets = datasets[0].union(*datasets[1:]) |
166 | | -datasets.write_parquet("sky-t1-preview.parquet") |
| 169 | + # 6. Save datasets |
| 170 | + dir_name = f"sky-t1-preview-{i}_parquet" |
| 171 | + datasets[i].write_parquet(os.path.abspath(dir_name)) |