NovaSky-AI
diff --git a/‎recipes/sky-t1-preview/__init__.py‎ b/‎recipes/sky-t1-preview/__init__.py‎
diff --git a/‎recipes/sky-t1-preview/postprocess.py‎
Lines changed: 86 additions & 0 deletions b/‎recipes/sky-t1-preview/postprocess.py‎
Lines changed: 86 additions & 0 deletions
diff --git a/‎recipes/sky-t1-preview/preprocess.py‎
Lines changed: 70 additions & 0 deletions b/‎recipes/sky-t1-preview/preprocess.py‎
Lines changed: 70 additions & 0 deletions
diff --git a/‎recipes/sky-t1-preview/prompts.py‎
Lines changed: 69 additions & 0 deletions b/‎recipes/sky-t1-preview/prompts.py‎
Lines changed: 69 additions & 0 deletions
diff --git a/‎recipes/sky-t1-preview/recipe.py‎
Lines changed: 151 additions & 0 deletions b/‎recipes/sky-t1-preview/recipe.py‎
Lines changed: 151 additions & 0 deletions
diff --git a/‎skythought/evals/scoring/__init__.py‎
Lines changed: 7 additions & 0 deletions b/‎skythought/evals/scoring/__init__.py‎
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,86 @@
+import copy
+import json
+from typing import Any, Dict
+
+import numpy as np
+import ray
+
+from skythought.evals.scoring.base import Scorer
+from skythought.evals.tasks.apps.apps_util import run_test as apps_run_test
+from skythought.evals.util.common import has_code
+
+# System prompt stored under the "system" key of every record built by
+# convert_to_sharegpt_format below.
+# The literal is stitched together with backslash line continuations, so each
+# source line is joined directly to the next; the '\n\n' inside the braces is
+# a real escape sequence (two newline characters in the final prompt).
+# NOTE(review): the wording (including spellings such as 'analisying' and
+# 'backtracing') is runtime text; presumably it must stay verbatim to match
+# the prompt the data was curated for -- confirm before editing.
+STILL2_SYSTEM_PROMPT = "Your role as an assistant involves thoroughly exploring questions through a systematic long \
+thinking process before providing the final precise and accurate solutions. This requires \
+engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, \
+backtracing, and iteration to develop well-considered thinking process. \
+Please structure your response into two main sections: Thought and Solution. \
+In the Thought section, detail your reasoning process using the specified format: \
+<|begin_of_thought|> {thought with steps separated with '\n\n'} \
+<|end_of_thought|> \
+Each step should include detailed considerations such as analisying questions, summarizing \
+relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining \
+any errors, and revisiting previous steps. \
+In the Solution section, based on various attempts, explorations, and reflections from the Thought \
+section, systematically present the final solution that you deem correct. The solution should \
+remain a logical, accurate, concise expression style and detail necessary step needed to reach the \
+conclusion, formatted as follows: \
+<|begin_of_solution|> \
+{final formatted, precise, and clear solution} \
+<|end_of_solution|> \
+Now, try to solve the following question through the above guidelines:"
+
+
+class APPSScorer(Scorer):
+    """Score an APPS row by running the last code block of its response.
+
+    The code is executed against the row's test cases in a Ray task via
+    ``apps_run_test``; the row passes only if every reported result is truthy.
+    """
+
+    # Seconds budgeted for one test run; ``ray.get`` waits one second longer.
+    TIMEOUT = 10
+
+    def score(self, row: Dict[str, Any]):
+        """Return True when the response's last code block passes all tests.
+
+        Args:
+            row: Dataset row. Reads ``row["response"]`` and the JSON-encoded
+                ``row["input_output"]``; ``row["solutions"]`` is optional.
+                NOTE(review): the recipe in this change produces
+                ``assistant_response`` / ``formatted_response`` columns, not
+                ``response`` -- confirm which column should be scored.
+
+        Returns:
+            False when the response has no code, when the run times out, or
+            when the run fails; otherwise whether all test results are truthy.
+
+        Raises:
+            KeyError: if ``response`` or ``input_output`` is missing.
+            ValueError: if ``input_output`` is not valid JSON.
+        """
+        code_blocks = has_code(row["response"])
+        if len(code_blocks) == 0:
+            return False
+
+        last_code = code_blocks[-1]
+        # Work on a copy so the JSON decoding below never mutates the caller's row.
+        problem_to_check = copy.deepcopy(row)
+        problem_to_check["input_output"] = json.loads(row["input_output"])
+        try:
+            problem_to_check["solutions"] = json.loads(row["solutions"])
+        except (KeyError, TypeError, ValueError):
+            # Reference solutions are optional: missing or malformed -> empty.
+            problem_to_check["solutions"] = ""
+
+        @ray.remote
+        def _temp_run(problem, generation, debug):
+            try:
+                return apps_run_test(problem=problem, test=generation, debug=debug)
+            except Exception:
+                # Deliberate best-effort: generated code that crashes the
+                # harness simply counts as a failed sample.
+                return None
+
+        pending = _temp_run.remote(problem_to_check, last_code, False)
+        try:
+            result = ray.get(pending, timeout=self.TIMEOUT + 1)
+        except ray.exceptions.GetTimeoutError:
+            # A hanging solution must not abort the whole scoring pass, and
+            # the stuck task should not keep occupying a worker.
+            ray.cancel(pending, force=True)
+            return False
+
+        return bool(result and np.all(result[0]))
+
+
+class TACOScorer(Scorer):
+    """Placeholder scorer for TACO rows: every row is accepted."""
+
+    def score(self, row: Dict[str, Any]):
+        """Return True regardless of the contents of ``row``."""
+        return True
+
+
+def convert_to_sharegpt_format(row: Dict[str, Any]):
+    """Build one ShareGPT-style record from a processed row.
+
+    Args:
+        row: Must contain ``user_input`` (the prompt) and
+            ``formatted_response`` (the assistant answer).
+
+    Returns:
+        A dict with the fixed ``system`` prompt and a two-turn
+        ``conversations`` list (user turn first, assistant turn second).
+    """
+    user_turn = {"from": "user", "value": row["user_input"]}
+    assistant_turn = {"from": "assistant", "value": row["formatted_response"]}
+    return {
+        "system": STILL2_SYSTEM_PROMPT,
+        "conversations": [user_turn, assistant_turn],
+    }
@@ -0,0 +1,70 @@
+import json
+
+
+class APPSPreprocessor:
+    """Turn an APPS row into a prompt stored under ``user_input``.
+
+    Note on naming: rows whose test spec has no ``fn_name`` are rendered with
+    ``WITH_FN_NAME_TEMPLATE`` (the stdin/print wording) and rows that do have
+    one with ``WITHOUT_FN_NAME_TEMPLATE`` (the function-body wording).
+    """
+
+    WITH_FN_NAME_TEMPLATE = "Generate an executable Python function generated from the given prompt. The function should take stdin as input and print the output. Simply call the function after the definition. {prompt}"  # noqa: E501
+
+    WITHOUT_FN_NAME_TEMPLATE = "Generate an executable Python function generated from the given prompt. Return the function body without invoking it at the final solution. {prompt}"  # noqa: E501
+
+    WITH_STARTER_CODE_TEMPLATE = "{input}\n{starter_code}"
+
+    def __call__(self, row):
+        """Return a copy of ``row`` with the rendered prompt added.
+
+        Reads the JSON-encoded ``input_output``, ``starter_code`` and
+        ``question`` columns. Starter code is appended on a new line whenever
+        it is not ``None`` (an empty string still adds the newline).
+        """
+        tests = json.loads(row["input_output"])
+        starter = row["starter_code"]
+        question = row["question"]
+
+        template = (
+            self.WITHOUT_FN_NAME_TEMPLATE
+            if tests.get("fn_name")
+            else self.WITH_FN_NAME_TEMPLATE
+        )
+        user_input = template.format(prompt=question)
+
+        if starter is None:
+            return {**row, "user_input": user_input}
+
+        with_starter = self.WITH_STARTER_CODE_TEMPLATE.format(
+            input=user_input, starter_code=starter
+        )
+        return {**row, "user_input": with_starter}
+
+
+class TACOPreprocessor:
+    """Turn a TACO problem into a prompt stored under ``user_input``."""
+
+    INITIAL_TEMPLATE = "\nQUESTION:\n{prompt}"
+    STARTER_CODE_TEMPLATE = "{input}\n{starter_code}"
+    STDIN_TEMPLATE = "{input}\nUse Standard Input format\nANSWER:\n"
+    CALL_TEMPLATE = "{input}\nUse Call-Based format\nANSWER:\n"
+
+    def __call__(self, problem):
+        """Return a copy of ``problem`` with the rendered prompt added.
+
+        The prompt is the question, followed by the starter code when it is
+        non-empty, followed by a format hint: standard-input when the problem
+        has neither a function name nor starter code, call-based otherwise.
+        An ``input_output`` value that is not valid JSON is treated as having
+        no function name.
+        """
+        question = problem["question"]
+
+        starter_code = problem["starter_code"]
+        if len(starter_code) == 0:
+            starter_code = None
+
+        try:
+            fn_name = json.loads(problem["input_output"]).get("fn_name") or None
+        except ValueError:
+            fn_name = None
+
+        text = self.INITIAL_TEMPLATE.format(prompt=question)
+        if starter_code:
+            text = self.STARTER_CODE_TEMPLATE.format(
+                input=text, starter_code=starter_code
+            )
+
+        if fn_name or starter_code:
+            closing = self.CALL_TEMPLATE
+        else:
+            closing = self.STDIN_TEMPLATE
+
+        return {**problem, "user_input": closing.format(input=text)}
+
+
+class NUMINAPreprocessor:
+    """Turn a NuminaMath row into a prompt stored under ``user_input``."""
+
+    # The doubled braces render as a literal ``\boxed{}`` after formatting.
+    TEMPLATE = "Return your final response within \\boxed{{}}. {prompt}"
+
+    def __call__(self, row):
+        """Return a copy of ``row`` with the boxed-answer prompt added."""
+        return {**row, "user_input": self.TEMPLATE.format(prompt=row["problem"])}
@@ -0,0 +1,69 @@
+# Worked example shown to the format-converter model. It is plain text and
+# contains literal LaTeX braces (e.g. "\text{Time}").
+convert_prompt_example = (  # noqa: E501
+    "<|begin_of_thought|>\n\n"
+    "Okay, so I've got this problem here. Mr. Wang leaves home at 6 AM, riding his bike at 12 km/h, "
+    "and he stops to rest for 6 minutes after every 30 minutes of riding. Then, when he arrives at a park "
+    "that's 16.8 km away, I need to find out the angle between the hour and minute hands on his watch.\n\n"
+    "Alright, first things first, I need to figure out how long it takes Mr. Wang to ride 16.8 km, including "
+    "his rest periods.\n\n"
+    "So, his speed is 12 km/h. To find out how long it takes to go 16.8 km without any stops, I can use the formula "
+    "time = distance/speed. That would be 16.8 divided by 12, which is 1.4 hours. To make it easier, that's 1 hour and 24 minutes.\n\n"
+    "But wait, he doesn't ride straight through. He stops for 6 minutes after every 30 minutes of riding. So, I need to see how many "
+    "of those 30-minute riding periods are there in his total riding time.\n\n"
+    "In 1 hour and 24 minutes of riding, how many 30-minute segments are there? Well, 1 hour is 60 minutes, plus 24 minutes makes 84 minutes "
+    "total riding time. So, 84 divided by 30 is 2.8. That means he has two full 30-minute riding periods and a partial one.\n\n"
+    "After each full 30-minute riding period, he rests for 6 minutes. So, for two full periods, he rests twice, which is 12 minutes of rest.\n\n"
+    "Now, for the partial riding period. Since 2 times 30 minutes is 60 minutes, and he has 84 minutes of riding, the remaining riding time is 84 minus 60, "
+    "which is 24 minutes. So, he rides for 24 minutes without another rest because he doesn't complete another 30-minute segment.\n\n"
+    "So, total time taken is riding time plus rest time. That's 84 minutes riding plus 12 minutes resting, totaling 96 minutes.\n\n"
+    "Wait a minute, but he stops after every 30 minutes of riding, but in the last partial period of 24 minutes, does he rest again? I think he only rests after "  # noqa: E501
+    "completing 30 minutes of riding, so in this case, since the last riding period is only 24 minutes, he doesn't take an additional rest after that.\n\n"
+    "So, total time should be 84 minutes riding plus 12 minutes resting, which is indeed 96 minutes, or 1 hour and 36 minutes.\n\n"
+    "So, he leaves at 6 AM and takes 1 hour and 36 minutes to reach the park, arriving at 7:36 AM.\n\n"
+    "Now, I need to find the angle between the hour and minute hands at 7:36.\n\n"
+    "To find the angle between the hour and minute hands, I can use the formula:\n\n"
+    "|30H - 5.5M|\n\n"
+    "where H is the hour and M is the minutes.\n\n"
+    "At 7:36, H is 7 and M is 36.\n\n"
+    "So, plugging in:\n\n"
+    "30*7 = 210\n\n"
+    "5.5*36 = 198\n\n"
+    "210 - 198 = 12\n\n"
+    "So, the angle is 12 degrees.\n\n"
+    "Wait, but I should make sure that's the smaller angle. Sometimes, the larger angle is considered, but usually, the smaller one is what is asked for.\n\n"
+    "So, the angle between the hour and minute hands at 7:36 AM is 12 degrees.\n\n"
+    "I think that's the answer.<|end_of_thought|>\n\n"
+    "<|begin_of_solution|>\n\n"
+    "Mr. Wang leaves home at 6 AM and rides at a speed of 12 km/h, stopping to rest for 6 minutes after every 30 minutes of riding. "
+    "He arrives at a park 16.8 km away. To determine the angle between the hour and minute hands on his watch when he arrives, we first calculate the total time taken.\n\n"  # noqa: E501
+    "1. **Riding time without stops**:\n\n"
+    "$$\\text{Time} = \\frac{\\text{Distance}}{\\text{Speed}} = \\frac{16.8 \\text{ km}}{12 \\text{ km/h}} = 1.4 \\text{ hours} = 84 \\text{ minutes}$$\n\n"
+    "2. **Rest periods**:\n\n"
+    "  - He rests for 6 minutes after every 30 minutes of riding.\n\n"
+    "  - In 84 minutes of riding, he completes 2 full 30-minute segments and a partial 24-minute segment.\n\n"
+    "  - He rests twice, totaling 12 minutes of rest.\n\n"
+    "3. **Total time**:\n\n"
+    "$$\\text{Total time} = 84 \\text{ minutes (riding)} + 12 \\text{ minutes (rest)} = 96 \\text{ minutes} = 1 \\text{ hour and } 36 \\text{ minutes}$$\n\n"
+    "  - He arrives at 7:36 AM.\n\n"
+    "4. **Angle between hour and minute hands at 7:36**:\n\n"
+    "  - Use the formula:\n\n"
+    "$$\\text{Angle} = |30H - 5.5M|$$\n\n"
+    "  - At 7:36, $H = 7$ and $M = 36$:\n\n"
+    "$$\\text{Angle} = |30 \\times 7 - 5.5 \\times 36| = |210 - 198| = 12 \\text{ degrees}$$\n\n"
+    "Thus, the angle between the hour and minute hands on his watch is $\\boxed{12}$.<|end_of_solution|>\n"  # noqa: E501
+)
+
+# From https://arxiv.org/pdf/2412.09413
+# Template with a single remaining placeholder, ``{content}``, meant to be
+# filled by the caller with ``CONVERT_PROMPT.format(content=...)``.
+# The example is inserted here, at import time. Its literal braces are doubled
+# first; otherwise the caller's later ``.format(content=...)`` would parse
+# "{Time}", "{Distance}", ... as replacement fields and raise KeyError.
+# NOTE(review): the "Important" paragraph asks for <|begin_of_slow_thought|> /
+# <|end_of_slow_thought|>, while the section layout and the example use
+# <|begin_of_thought|> / <|end_of_thought|>. Left verbatim because it is
+# runtime prompt text -- confirm which tag pair is intended.
+CONVERT_PROMPT = (
+    "Another solution is written in an unstructured way. Your job is to convert them into two sections:"
+    "<|begin_of_thought|>"
+    "(Thought process, you should copy exactly the thinking process of the original solution.)"
+    "<|end_of_thought|>"
+    "<|begin_of_solution|>"
+    "(Final formatted, precise, and clear solution; make sure there is only one solution in this section; If it is a coding problem, make sure there is only one code block)"  # noqa: E501
+    "<|end_of_solution|>"
+    "Here is an example demonstration of a different question, you can refer to its format: "
+    "{example}\n"
+    "Important: You should almost copy all the contents word-by-word of the original solution. Just convert them into two sections. "
+    "Make sure you include: <|begin_of_slow_thought|>, <|end_of_slow_thought|>,  <|begin_of_solution|>,<|end_of_solution|>  These four headers explicitly. "
+    "Content to be converted: {{content}}"
+).format(example=convert_prompt_example.replace("{", "{{").replace("}", "}}"))
@@ -0,0 +1,151 @@
+"""
+This is the data-curation recipe for the Sky-T1 Preview model.
+"""
+
+import os
+
+import datasets
+import ray
+from ray.data.llm import (
+    HttpRequestProcessorConfig,
+    build_llm_processor,
+    vLLMEngineProcessorConfig,
+)
+
+from skythought.evals.scoring.math import MathEqualScorer
+
+from .postprocess import APPSScorer, TACOScorer, convert_to_sharegpt_format
+from .preprocess import APPSPreprocessor, NUMINAPreprocessor, TACOPreprocessor
+from .prompts import CONVERT_PROMPT
+
+# System prompt used for the QwQ generation requests in step 2.
+SYSTEM_PROMPT = "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."  # noqa: E501
+
+# 1. Load datasets
+# Each source is opened in streaming mode and wrapped as a Ray dataset.
+apps_ds = ray.data.from_huggingface(
+    datasets.load_dataset("codeparrot/apps", split="test", streaming=True)
+)
+taco_ds_medium = ray.data.from_huggingface(
+    datasets.load_dataset("BAAI/TACO", split="test", name="MEDIUM", streaming=True)
+)
+numina_ds = ray.data.from_huggingface(
+    datasets.load_dataset("AI-MO/NuminaMath-CoT", split="train", streaming=True)
+)
+
+
+def _numina_subset(source_name):
+    """Return the NuminaMath rows whose ``source`` column equals ``source_name``."""
+    return numina_ds.filter(lambda x: x["source"] == source_name)
+
+
+# get subsets from numina based on the source column
+numina_ds_amc_aime = _numina_subset("amc_aime")
+numina_ds_olympiads = _numina_subset("olympiads")
+numina_ds_math = _numina_subset("math")
+
+# 2. Get model responses for each of the datasets
+# NOTE: this rebinds the name ``datasets``, hiding the Hugging Face module
+# imported at the top of the file; every ``load_dataset`` call happens above.
+datasets = [
+    apps_ds,
+    taco_ds_medium,
+    numina_ds_amc_aime,
+    numina_ds_olympiads,
+    numina_ds_math,
+]
+
+# these are user-defined simple preprocessing functions to go from entry -> prompt
+# (one per dataset, in the same order as ``datasets``)
+preprocess_fns = [
+    APPSPreprocessor(),
+    TACOPreprocessor(),
+    NUMINAPreprocessor(),
+    NUMINAPreprocessor(),
+    NUMINAPreprocessor(),
+]
+
+# our API
+# Neither the engine configuration nor the processor depends on the dataset,
+# so both are built once and reused for every dataset.
+config = vLLMEngineProcessorConfig(
+    model="Qwen/QwQ-32B-Preview",
+    engine_kwargs=dict(
+        enable_prefix_caching=True,
+        enable_chunked_prefill=True,
+        max_num_batched_tokens=16384,
+    ),
+    concurrency=2,
+    batch_size=64,
+)
+
+# our API
+processor = build_llm_processor(
+    config,
+    preprocess=lambda row: dict(
+        messages=[
+            # Chat messages are role/content dicts; the system prompt needs
+            # the same shape as the user turn.
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": row["user_input"]},
+        ],
+        sampling_params=dict(
+            temperature=0.3,
+            # NOTE(review): 20 tokens looks far too small for step-by-step
+            # reasoning traces -- confirm the intended generation budget.
+            max_tokens=20,
+            detokenize=False,
+        ),
+    ),
+    postprocess=lambda row: dict(
+        assistant_response=row["generated_text"],
+        **row,  # This will return all the original columns in the dataset.
+    ),
+)
+
+for i, (ds, preprocess_fn) in enumerate(zip(datasets, preprocess_fns)):
+    # our API
+    # Run generation on the *preprocessed* rows: the processor's preprocess
+    # step reads the ``user_input`` column that ``preprocess_fn`` adds.
+    datasets[i] = processor(ds.map(preprocess_fn))
+
+# 3. Reformat the examples into a structured format
+# define a configuration for the reformatter
+config = HttpRequestProcessorConfig(
+    url="https://api.openai.com/v1/chat/completions",
+    # Read the key from the environment rather than committing it to the
+    # recipe; a missing variable fails here with a KeyError instead of
+    # sending requests with a placeholder token.
+    headers={"Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"},
+    # number of processors to run in parallel
+    # Each handles a batch of requests
+    concurrency=1,
+    # Batch size is a processor-config setting, as in the vLLM config of step 2.
+    batch_size=64,
+)
+
+
+def _problem_statement(row):
+    """Return the raw problem text of ``row``.
+
+    The preprocessors in this recipe read APPS/TACO problems from
+    ``question`` and NuminaMath problems from ``problem``.
+    """
+    return row["question"] if "question" in row else row["problem"]
+
+
+# define the reformatter
+reformatter = build_llm_processor(
+    config=config,
+    preprocess=lambda row: dict(
+        # define the payload / the exact arguments to the OpenAI chat completions API
+        payload=dict(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "system", "content": "You are a solution format convertor."},
+                {
+                    "role": "user",
+                    "content": CONVERT_PROMPT.format(
+                        content=f"{_problem_statement(row)}\n{row['assistant_response']}"
+                    ),
+                },
+            ],
+            temperature=0.7,
+            max_tokens=16384,
+        ),
+    ),
+    postprocess=lambda row: {
+        # Carry the existing columns forward: the scorers and the ShareGPT
+        # conversion further down read them (e.g. ``user_input``,
+        # ``solution``). The raw HTTP payload itself is not kept.
+        **{key: value for key, value in row.items() if key != "http_response"},
+        "formatted_response": row["http_response"]["choices"][0]["message"][
+            "content"
+        ],
+    },
+)
+
+for i, dataset in enumerate(datasets):
+    datasets[i] = reformatter(dataset)
+
+# 4. Rejection Sampling based on scoring
+# scorer order: apps, taco, numina-amc-aime, numina-olympiads, numina-math
+numina_scorer = MathEqualScorer(
+    response_key="formatted_response", answer_key="solution"
+)
+# The three NuminaMath subsets share one scorer instance.
+scorers = [APPSScorer(), TACOScorer()] + [numina_scorer] * 3
+
+# NOTE(review): the scorers are applied with ``map``; whether rows that fail
+# are actually dropped depends on what calling a Scorer returns -- confirm.
+for i, (dataset, fn) in enumerate(zip(datasets, scorers)):
+    datasets[i] = dataset.map(fn)
+
+# 5. Convert to ShareGPT format
+datasets[:] = [dataset.map(convert_to_sharegpt_format) for dataset in datasets]
+
+# 6. Union + Save datasets
+first, *rest = datasets
+datasets = first.union(*rest)
+datasets.write_parquet("sky-t1-preview.parquet")
@@ -0,0 +1,7 @@
+from .base import Scorer
+from .gsm8k import GSM8KScorer
+from .ifeval import IfEvalScorer
+from .livecodebench import LiveCodeBenchScorer
+from .math import MathScorer
+
+# Public names re-exported by this package.
+__all__ = [
+    "Scorer",
+    "MathScorer",
+    "GSM8KScorer",
+    "LiveCodeBenchScorer",
+    "IfEvalScorer",
+]