Skip to content

Commit 323ffd0

Browse files
committed
rework dependencies; fix openai stuff
Signed-off-by: SumanthRH <[email protected]>
1 parent 8e3daad commit 323ffd0

File tree

4 files changed

+15
-36
lines changed

4 files changed

+15
-36
lines changed

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[project]
2-
name = "skythought"
2+
name = "skythought_evals"
33
version = "0.1.0"
4-
description = "Skythought Evals"
4+
description = "Skythought Evals: Evaluation and Data Generation Tools for Reasoning Models"
55
authors = [
66
{ name = "NovaSky Team"}
77
]

setup.py

Lines changed: 0 additions & 23 deletions
This file was deleted.

skythought/skythought_evals/common/entities.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@
33
from enum import Enum
44
from importlib import resources
55
from pathlib import Path
6-
from typing import Any, Dict, Literal, Optional, Union
6+
from typing import Literal, Optional, Union
77

88
import yaml
9+
from openai import NOT_GIVEN, NotGiven
10+
from openai.types.chat import ChatCompletionReasoningEffort
911
from pydantic import BaseModel, ConfigDict, Field
1012
from vllm import SamplingParams as VLLMSamplingParams
1113

@@ -21,18 +23,20 @@ class Backend(str, Enum):
2123

2224

2325
class OpenAISamplingParams(BaseModel):
26+
model_config = ConfigDict(arbitrary_types_allowed=True)
27+
2428
temperature: float = TEMPERATURE_DEFAULT
2529
top_p: float = TOP_P_DEFAULT
2630
n: int = 1
2731
max_tokens: int = MAX_TOKENS_DEFAULT
28-
reasoning_effort: Optional[float] = None
32+
reasoning_effort: Union[ChatCompletionReasoningEffort, NotGiven] = NOT_GIVEN
2933
frequency_penalty: Optional[float] = None
3034

3135

3236
class SamplingParameters(BaseModel):
3337
model_config = ConfigDict(arbitrary_types_allowed=True)
3438

35-
params: Union[Dict[str, Any], OpenAISamplingParams, VLLMSamplingParams]
39+
params: Union[OpenAISamplingParams, VLLMSamplingParams]
3640

3741
@classmethod
3842
def from_dict(cls, backend: Backend, params: dict):

skythought/skythought_evals/inference_and_check.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -77,29 +77,26 @@ def fetch_response_openai(
7777
model_name = model_config.name
7878
# Ensure model_name has been resolved to a string
7979
assert model_name
80-
if "o1" in model_name:
80+
if model_name.startswith("o1") or model_name.startswith("o3"):
8181
# O1 doesn't support system prompt
8282
# NOTE: might want to implement this inside handler instead
8383
for p in prompt:
8484
p["role"] = "user"
85-
8685
response = client.chat.completions.create(
8786
model=model_config.model_id,
8887
messages=prompt,
8988
n=sampling_params.n,
90-
temperature=sampling_params.temperature,
91-
max_tokens=sampling_params.max_tokens,
9289
reasoning_effort=sampling_params.reasoning_effort,
93-
frequency_penalty=sampling_params.frequency_penalty,
9490
max_completion_tokens=sampling_params.max_tokens,
9591
)
9692
else:
93+
if sampling_params.reasoning_effort is not None:
94+
raise ValueError("Reasoning effort is only supported for reasoning models")
9795
response = client.chat.completions.create(
9896
model=model_config.model_id,
9997
messages=prompt,
10098
n=sampling_params.n,
10199
temperature=sampling_params.temperature,
102-
max_tokens=sampling_params.max_tokens,
103100
frequency_penalty=sampling_params.frequency_penalty,
104101
max_completion_tokens=sampling_params.max_tokens,
105102
)
@@ -170,12 +167,13 @@ def inference(
170167
responses = copy.deepcopy(responses)
171168
responses = sorted(responses, key=lambda x: x.index)
172169
elif backend == Backend.OPENAI:
173-
llm = OpenAI(**backend_params)
170+
llm = OpenAI(**backend_params.to_dict())
171+
assert isinstance(sampling_params.params, OpenAISamplingParams)
174172
fetch_partial = partial(
175173
fetch_response_openai,
176174
llm,
177175
model_config,
178-
sampling_params,
176+
sampling_params.params,
179177
)
180178

181179
with concurrent.futures.ThreadPoolExecutor(max_workers=16) as e:

0 commit comments

Comments (0)