Skip to content

Commit 6fdca5c

Browse files
committed
more
Signed-off-by: SumanthRH <[email protected]>
1 parent e113874 commit 6fdca5c

File tree

16 files changed

+1457
-137
lines changed

16 files changed

+1457
-137
lines changed

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@ repos:
55
- id: ruff
66
args: [ --fix, --exit-non-zero-on-fix ]
77
# NOTE (sumanthrh): Many of the files excluded here are used for validating code generation, and linters do not recognize some of the logic in these files. skythought/train is excluded for now because it's a fork of Llamafactory
8-
exclude: (^skythought/train/.*|^skythought/skythought-rl/.*|tasks/taco/pyext2\.py|tasks/taco/taco_util\.py|tasks/apps/apps_util\.py|scripts/prompts\.py|skythought/test-time-scaling/.*)$
8+
exclude: (^skythought/train/.*|^skythought/skythought-rl/.*|pyext2\.py|taco_util\.py|apps_util\.py|scripts/prompts\.py|skythought/test-time-scaling/.*)$
99

1010

1111
# Black needs to be run after ruff with --fix
1212
- repo: https://github.com/psf/black
1313
rev: 24.10.0
1414
hooks:
1515
- id: black
16-
exclude: (^skythought/train/.*|^skythought/skythought-rl/.*|tasks/taco/pyext2\.py|skythought/test-time-scaling/.*)$
16+
exclude: (^skythought/train/.*|^skythought/skythought-rl/.*|pyext2\.py|skythought/test-time-scaling/.*)$

recipes/sky-t1-preview/postprocess.py

Lines changed: 0 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,5 @@
1-
import copy
2-
import json
31
from typing import Any, Dict
42

5-
import numpy as np
6-
import ray
7-
8-
from skythought.evals.scoring.base import Scorer
9-
from skythought.evals.tasks.apps.apps_util import run_test as apps_run_test
10-
from skythought.evals.util.common import has_code
11-
123
STILL2_SYSTEM_PROMPT = "Your role as an assistant involves thoroughly exploring questions through a systematic long \
134
thinking process before providing the final precise and accurate solutions. This requires \
145
engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, \
@@ -30,41 +21,6 @@
3021
Now, try to solve the following question through the above guidelines:"
3122

3223

33-
class APPSScorer(Scorer):
34-
def score(self, row: Dict[str, Any]):
35-
TIMEOUT = 10
36-
code_filter_result = has_code(row["response"])
37-
if len(code_filter_result) == 0:
38-
return False
39-
else:
40-
last_code = code_filter_result[-1]
41-
problem_to_check = copy.deepcopy(row)
42-
problem_to_check["input_output"] = json.loads(row["input_output"])
43-
try:
44-
problem_to_check["solutions"] = json.loads(row["solutions"])
45-
except Exception:
46-
problem_to_check["solutions"] = ""
47-
48-
@ray.remote
49-
def _temp_run(problem, generation, debug):
50-
try:
51-
result = apps_run_test(problem=problem, test=generation, debug=debug)
52-
return result
53-
except Exception:
54-
pass
55-
56-
result = ray.get(
57-
_temp_run.remote(problem_to_check, last_code, False), timeout=TIMEOUT + 1
58-
)
59-
60-
return bool(result and np.all(result[0]))
61-
62-
63-
class TACOScorer(Scorer):
64-
def score(self, row: Dict[str, Any]):
65-
return True
66-
67-
6824
def convert_to_sharegpt_format(row: Dict[str, Any]):
6925
prompt = row["user_input"]
7026
# accept

recipes/sky-t1-preview/prompts.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
convert_prompt_example = ( # noqa: E501
1+
CONVERT_PROMPT_EXAMPLE = ( # noqa: E501
22
"<|begin_of_thought|>\n\n"
33
"Okay, so I've got this problem here. Mr. Wang leaves home at 6 AM, riding his bike at 12 km/h, "
44
"and he stops to rest for 6 minutes after every 30 minutes of riding. Then, when he arrives at a park "
@@ -65,5 +65,5 @@
6565
"{example}\n"
6666
"Important: You should almost copy all the contents word-by-word of the original solution. Just convert them into two sections. "
6767
"Make sure you include: <|begin_of_slow_thought|>, <|end_of_slow_thought|>, <|begin_of_solution|>,<|end_of_solution|> These four headers explicitly. "
68-
"Content to be converted: {{content}}".format(example=convert_prompt_example)
68+
"Content to be converted: {content}"
6969
)

recipes/sky-t1-preview/recipe.py

Lines changed: 75 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,13 @@
1313
vLLMEngineProcessorConfig,
1414
)
1515

16+
from skythought.evals.scoring.apps import APPSScorer
1617
from skythought.evals.scoring.math import MathEqualScorer
18+
from skythought.evals.scoring.taco import TACOScorer
1719

18-
from .postprocess import APPSScorer, TACOScorer, convert_to_sharegpt_format
20+
from .postprocess import convert_to_sharegpt_format
1921
from .preprocess import APPSPreprocessor, NUMINAPreprocessor, TACOPreprocessor
20-
from .prompts import CONVERT_PROMPT
22+
from .prompts import CONVERT_PROMPT, CONVERT_PROMPT_EXAMPLE
2123

2224
parser = argparse.ArgumentParser()
2325
parser.add_argument("--as-test", action="store_true")
@@ -26,11 +28,12 @@
2628
SYSTEM_PROMPT = "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step." # noqa: E501
2729

2830
# 1. Load datasets
29-
apps_ds = datasets.load_dataset("codeparrot/apps", split="test", streaming=True)
30-
taco_ds_medium = datasets.load_dataset(
31-
"BAAI/TACO", split="test", name="MEDIUM", streaming=True
31+
apps_ds = datasets.load_dataset(
32+
"codeparrot/apps",
33+
split="test",
3234
)
33-
numina_ds = datasets.load_dataset("AI-MO/NuminaMath-CoT", split="train", streaming=True)
35+
taco_ds_medium = datasets.load_dataset("BAAI/TACO", split="test", name="MEDIUM")
36+
numina_ds = datasets.load_dataset("AI-MO/NuminaMath-CoT", split="train")
3437

3538
# convert all to ray dataset
3639
apps_ds = ray.data.from_huggingface(apps_ds)
@@ -45,11 +48,11 @@
4548

4649

4750
if args.as_test:
48-
apps_ds = apps_ds.limit(100)
49-
taco_ds_medium = taco_ds_medium.limit(100)
50-
numina_ds_amc_aime = numina_ds_amc_aime.limit(100)
51-
numina_ds_olympiads = numina_ds_olympiads.limit(100)
52-
numina_ds_math = numina_ds_math.limit(100)
51+
apps_ds = apps_ds.limit(5)
52+
taco_ds_medium = taco_ds_medium.limit(5)
53+
numina_ds_amc_aime = numina_ds_amc_aime.limit(5)
54+
numina_ds_olympiads = numina_ds_olympiads.limit(5)
55+
numina_ds_math = numina_ds_math.limit(5)
5356

5457
# 2. Get model responses for each of the datasets
5558
datasets = [
@@ -69,10 +72,20 @@
6972
NUMINAPreprocessor(),
7073
]
7174

75+
numina_scorer = MathEqualScorer(
76+
response_column="formatted_response", answer_column="solution"
77+
)
78+
scorers = [
79+
APPSScorer(response_column="formatted_response"),
80+
TACOScorer(response_column="formatted_response"),
81+
numina_scorer,
82+
numina_scorer,
83+
numina_scorer,
84+
]
85+
7286
for i, ds in enumerate(datasets):
7387
datasets[i] = ds.map(preprocess_fns[i])
7488

75-
# our API
7689
config = vLLMEngineProcessorConfig(
7790
# model="Qwen/QwQ-32B-Preview",
7891
model="Qwen/Qwen2-0.5B-Instruct",
@@ -85,12 +98,11 @@
8598
batch_size=64,
8699
)
87100

88-
# our API
89101
processor = build_llm_processor(
90102
config,
91103
preprocess=lambda row: dict(
92104
messages=[
93-
SYSTEM_PROMPT,
105+
{"role": "system", "content": SYSTEM_PROMPT},
94106
{"role": "user", "content": row["user_input"]},
95107
],
96108
sampling_params=dict(
@@ -104,63 +116,56 @@
104116
**row, # This will return all the original columns in the dataset.
105117
),
106118
)
107-
# our API
108-
datasets[i] = processor(ds)
109-
110-
# 3. Reformat the examples into a structured format
111-
# define a configuration for the reformatter
112-
config = HttpRequestProcessorConfig(
113-
url="https://api.openai.com/v1/chat/completions",
114-
headers={"Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"},
115-
# number of processors to run in parallel
116-
# Each handles a batch of requests
117-
concurrency=1,
118-
)
119-
# define the reformatter
120-
reformatter = build_llm_processor(
121-
config=config,
122-
preprocess=lambda row: dict(
123-
# define the payload / the exact arguments to the OpenAI chat completions API
124-
payload=dict(
125-
model="gpt-4o-mini",
126-
messages=[
127-
{"role": "system", "content": "You are a solution format convertor."},
128-
{
129-
"role": "user",
130-
"content": CONVERT_PROMPT.format(
131-
content=f"{row['question']}\n{row['assistant_response']}"
132-
),
133-
},
134-
],
135-
temperature=0.7,
136-
max_tokens=16384,
119+
datasets[i] = processor(datasets[i])
120+
121+
# 3. Reformat the examples into a structured format
122+
# define a configuration for the reformatter
123+
config = HttpRequestProcessorConfig(
124+
url="https://api.openai.com/v1/chat/completions",
125+
headers={"Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"},
126+
# number of processors to run in parallel
127+
# Each handles a batch of requests
128+
concurrency=1,
129+
)
130+
# define the reformatter
131+
reformatter = build_llm_processor(
132+
config,
133+
preprocess=lambda row: dict(
134+
# define the payload / the exact arguments to the OpenAI chat completions API
135+
payload=dict(
136+
model="gpt-4o-mini",
137+
messages=[
138+
{
139+
"role": "system",
140+
"content": "You are a solution format convertor.",
141+
},
142+
{
143+
"role": "user",
144+
"content": CONVERT_PROMPT.format(
145+
example=CONVERT_PROMPT_EXAMPLE,
146+
content=f"{row['question']}\n{row['assistant_response']}",
147+
),
148+
},
149+
],
150+
temperature=0.7,
151+
max_tokens=2048,
152+
),
137153
),
138-
),
139-
postprocess=lambda row: dict(
140-
formatted_response=row["http_response"]["choices"][0]["message"]["content"],
141-
),
142-
batch_size=64,
143-
)
144-
145-
for i, dataset in enumerate(datasets):
146-
datasets[i] = reformatter(dataset)
147-
148-
149-
# 4. Rejection Sampling based on scoring
150-
# apps, taco, numina-amc-aime, numina-olympiads, numina-math
151-
numina_scorer = MathEqualScorer(
152-
response_key="formatted_response", answer_key="solution"
153-
)
154-
scorers = [APPSScorer(), TACOScorer(), numina_scorer, numina_scorer, numina_scorer]
154+
postprocess=lambda row: dict(
155+
formatted_response=row["http_response"]["choices"][0]["message"]["content"],
156+
**row,
157+
),
158+
)
159+
datasets[i] = reformatter(datasets[i])
155160

156-
for i, dataset in enumerate(datasets):
157-
fn = scorers[i]
158-
datasets[i] = dataset.map(fn)
161+
# 4. Rejection Sampling based on scoring
162+
# apps, taco, numina-amc-aime, numina-olympiads, numina-math
163+
datasets[i] = datasets[i].map(scorers[i])
164+
# datasets[i] = datasets[i].filter(lambda x: x[scorers[i].SCORE_COLUMN])
159165

160-
# 5. Convert to ShareGPT format
161-
for i, dataset in enumerate(datasets):
162-
datasets[i] = dataset.map(convert_to_sharegpt_format)
166+
# 5. Convert to ShareGPT format
167+
datasets[i] = datasets[i].map(convert_to_sharegpt_format)
163168

164-
# 6. Union + Save datasets
165-
datasets = datasets[0].union(*datasets[1:])
166-
datasets.write_parquet("sky-t1-preview.parquet")
169+
# 6. Save datasets
170+
dir_name = f"sky-t1-preview-{i}_parquet"
171+
datasets[i].write_parquet(os.path.abspath(dir_name))
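
skythought/evals/scoring/__init__.py (path inferred from the imports; file header missing from capture)

Lines changed: 2 additions & 4 deletions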
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
from .base import Scorer
22
from .gsm8k import GSM8KScorer
3-
from .ifeval import IfEvalScorer
4-
from .livecodebench import LiveCodeBenchScorer
5-
from .math import MathScorer
3+
from .math import MathEqualScorer, MathVerifyScorer
64

7-
__all__ = ["Scorer", "MathScorer", "GSM8KScorer", "LiveCodeBenchScorer", "IfEvalScorer"]
5+
__all__ = ["Scorer", "MathEqualScorer", "MathVerifyScorer", "GSM8KScorer"]
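
skythought/evals/scoring/apps/__init__.py (path inferred from the imports; file header missing from capture)

Lines changed: 3 additions & 0 deletions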
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .apps import APPSScorer
2+
3+
__all__ = ["APPSScorer"]
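
skythought/evals/scoring/apps/apps.py (path inferred from the imports; file header missing from capture)

Lines changed: 61 additions & 0 deletions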
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import copy
2+
import json
3+
from typing import Any, Dict
4+
5+
import numpy as np
6+
import ray
7+
from ray.exceptions import GetTimeoutError
8+
9+
from skythought.evals.scoring.base import Scorer
10+
from skythought.evals.tasks.apps.apps_util import run_test as apps_run_test
11+
from skythought.evals.util.common import has_code
12+
13+
14+
class APPSScorer(Scorer):
15+
SCORE_COLUMN = "apps_score"
16+
17+
def __init__(
18+
self,
19+
response_column="response",
20+
answer_column="solutions",
21+
input_column="input_output",
22+
) -> None:
23+
super().__init__()
24+
self.response_column = response_column
25+
self.answer_column = answer_column
26+
self.input_column = input_column
27+
28+
def score(self, row: Dict[str, Any]):
29+
TIMEOUT = 10
30+
code_filter_result = has_code(row[self.response_column])
31+
if len(code_filter_result) == 0:
32+
return {self.SCORE_COLUMN: False}
33+
else:
34+
last_code = code_filter_result[-1]
35+
problem_to_check = copy.deepcopy(row)
36+
problem_to_check[self.input_column] = json.loads(row[self.input_column])
37+
try:
38+
problem_to_check[self.answer_column] = json.loads(
39+
row[self.answer_column]
40+
)
41+
except Exception:
42+
problem_to_check[self.answer_column] = ""
43+
44+
@ray.remote
45+
def _temp_run(problem, generation, debug):
46+
try:
47+
result = apps_run_test(problem=problem, test=generation, debug=debug)
48+
return result
49+
except Exception:
50+
pass
51+
52+
try:
53+
result = ray.get(
54+
_temp_run.remote(problem_to_check, last_code, False),
55+
timeout=TIMEOUT + 1,
56+
)
57+
except GetTimeoutError:
58+
result = []
59+
60+
score = bool(result and np.all(result[0]))
61+
return {self.SCORE_COLUMN: score}

skythought/evals/scoring/ifeval/instructions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,13 @@
1616
"""Library of instructions."""
1717
import collections
1818
import json
19+
import logging
1920
import random
2021
import re
2122
import string
2223
from typing import Dict, Optional, Sequence, Union
2324

2425
import langdetect
25-
from absl import logging
2626

2727
from . import instructions_util
2828

0 commit comments

Comments
 (0)