Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions assess.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,6 @@ def slugify(value):
"OpenAI O4 Mini (High Reasoning)": "https://openai.com/favicon.ico",
"OpenAI O3 (High Reasoning)": "https://openai.com/favicon.ico",
"OpenAI O3 (Medium Reasoning)": "https://openai.com/favicon.ico",
"ChatGPT-4o (Medium Reasoning)": "https://openai.com/favicon.ico",
"ChatGPT-4o (High Reasoning)": "https://openai.com/favicon.ico",
"OpenAI O1 Pro": "https://openai.com/favicon.ico",
"GPT-4.1 Mini": "https://openai.com/favicon.ico",
"GPT-4.1 Nano": "https://openai.com/favicon.ico",
Expand Down Expand Up @@ -261,6 +259,8 @@ def run_model_with_prompt(model_name, model, assessment):
with open("./model_results.json", "r") as file:
final_results = orjson.loads(file.read())

# Legacy normalization removed per request; load raw results as-is.

model_providers = {
"OpenAI O4 Mini": "",
"GPT-4.1": "",
Expand Down Expand Up @@ -320,8 +320,8 @@ def run_model_with_prompt(model_name, model, assessment):
"OpenAI O4 Mini (Medium Reasoning)": OpenAIModel(model_id="o4-mini"),
"OpenAI O3 (Medium Reasoning)": OpenAIModel(model_id="o3"),
"GPT-4.1": OpenAIModel(model_id="gpt-4.1"),
"ChatGPT-4o (Medium Reasoning)": OpenAIModel(model_id="chatgpt-4o-latest"),
"ChatGPT-4o (High Reasoning)": OpenAIModel(model_id="chatgpt-4o-latest", reasoning_effort="high"),
# 4o is not a reasoning model; expose a single canonical name without reasoning level.
"ChatGPT-4o": OpenAIModel(model_id="chatgpt-4o-latest"),
"GPT-4.1 Mini": OpenAIModel(model_id="gpt-4.1-mini"),
"GPT-4.1 Nano": OpenAIModel(model_id="gpt-4.1-nano"),
"OpenAI O1": OpenAIModel(model_id="o1"),
Expand Down Expand Up @@ -1041,4 +1041,4 @@ def on_created(self, event):
except KeyboardInterrupt:
observer.stop()
print("Stopping observer...")
observer.join()
observer.join()
100 changes: 65 additions & 35 deletions models/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,33 @@
from .model import Model

# Sampling temperature sent for models that accept a custom value.
OPENAI_TEMPERATURE = 0.1

# Model IDs for which OPENAI_TEMPERATURE is NOT applied: requests for these
# models are sent with temperature=1 instead. Membership is checked against
# the exact model_id string, so dated or suffixed variants must be listed
# explicitly.
SKIP_TEMPERATURE = [
    "gpt-5-2025-08-07",
    "o4-mini",
    "chatgpt-4o-latest",
    "o3",
    "o1",
    "o3-pro",
    "o4-mini-high",
    "gpt-5-mini",
    "gpt-5-nano",
    "gpt-5-chat",
    "gpt-5",
]

# Model IDs for which the `reasoning_effort` argument is forwarded to the
# OpenAI API. For any model ID not in this set the argument is left out of
# the request entirely.
# NOTE(review): "o4-mini-high" appears in SKIP_TEMPERATURE but not here —
# confirm that omission is intentional.
ALLOWED_REASONING_MODELS = {
    # O-series
    "o1",
    "o3",
    "o3-pro",
    "o4-mini",
    # GPT-5 family
    "gpt-5",
    "gpt-5-2025-08-07",
    "gpt-5-chat",
    "gpt-5-mini",
    "gpt-5-nano",
}


class OpenAIModel(Model):
Expand All @@ -25,59 +51,63 @@ def run(
image_dtype = "image/" + image_name.split(".")[-1].replace("jpg", "jpeg")
if structured_output_format:
try:
return (
self.client.beta.chat.completions.parse(
model=self.model_id,
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {
"url": f"data:{image_dtype};base64,{base64.b64encode(image).decode()}"
},
kwargs = {
"model": self.model_id,
"messages": [
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {
"url": f"data:{image_dtype};base64,{base64.b64encode(image).decode()}"
},
],
},
],
temperature=(
OPENAI_TEMPERATURE
if self.model_id not in SKIP_TEMPERATURE
else 1
),
reasoning_effort=reasoning_effort,
response_format=structured_output_format,
)
.choices[0]
.message.parsed
},
],
}
],
"temperature": (
OPENAI_TEMPERATURE if self.model_id not in SKIP_TEMPERATURE else 1
),
"response_format": structured_output_format,
}
if self.model_id in ALLOWED_REASONING_MODELS:
kwargs["reasoning_effort"] = reasoning_effort

return (
self.client.beta.chat.completions.parse(**kwargs).choices[0].message.parsed
or {}
)
except BadRequestError as e:
print(f"Error parsing structured output: {e}")
pass

# if reasoning_effort != "medium":
completion = self.client.chat.completions.create(
model=self.model_id,
messages=[
kwargs = {
"model": self.model_id,
"messages": [
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {"url": f"data:{image_dtype};base64,{base64.b64encode(image).decode()}"}
"image_url": {
"url": f"data:{image_dtype};base64,{base64.b64encode(image).decode()}"
},
},
],
},
}
],
reasoning_effort=reasoning_effort,
temperature=(
"temperature": (
OPENAI_TEMPERATURE if self.model_id not in SKIP_TEMPERATURE else 1
),
)
}
if self.model_id in ALLOWED_REASONING_MODELS:
kwargs["reasoning_effort"] = reasoning_effort

completion = self.client.chat.completions.create(**kwargs)

return completion.choices[0].message.content
# else:
Expand Down