Commit a9876d8

Merge pull request #249 from eshwarprasadS/add-longbench
Add LongBench V2 benchmark
2 parents 3b6bb0e + 5ba7e6e commit a9876d8

5 files changed: 317 additions, 1 deletion

pyproject.toml

Lines changed: 6 additions & 1 deletion
@@ -41,6 +41,7 @@ issues = "https://github.com/instructlab/eval/issues"
 "mt_bench" = "instructlab.eval.mt_bench:MTBenchEvaluator"
 "mt_bench_branch" = "instructlab.eval.mt_bench:MTBenchBranchEvaluator"
 "leaderboard_v2" = "instructlab.eval.leaderboard:LeaderboardV2Evaluator"
+"longbench" = "instructlab.eval.longbench:LongBenchEvaluator"
 "ruler" = "instructlab.eval.ruler:RulerEvaluator"

 [tool.setuptools_scm]
@@ -53,7 +54,11 @@ package-dir = {"" = "src"}

 [tool.setuptools.dynamic]
 dependencies = {file = ["requirements.txt"]}
-optional-dependencies = {leaderboard = {file = ["requirements-leaderboard.txt"]}}
+
+[tool.setuptools.dynamic.optional-dependencies]
+leaderboard = {file = ["requirements-leaderboard.txt"]}
+longbench = {file = ["requirements-longbench.txt"]}
+cuda = {file = ["requirements-cuda.txt"]}

 [tool.setuptools.packages.find]
 where = ["src"]

requirements-cuda.txt

Lines changed: 2 additions & 0 deletions
vllm
flash-attn

requirements-longbench.txt

Lines changed: 1 addition & 0 deletions
lm-eval[longbench]>=0.4.8

src/instructlab/eval/longbench.py

Lines changed: 306 additions & 0 deletions
# Standard
from collections import defaultdict
import json
import os
import typing as t

# Third Party
from lm_eval.evaluator import simple_evaluate
from torch import cuda

# Local
from .evaluator import Evaluator


class LongBenchResult(t.TypedDict, total=False):
    """Dict containing averages for each task type and language"""

    overall_score: float
    en_multidoc: float
    zh_multidoc: float
    en_singledoc: float
    zh_singledoc: float
    en_summ: float
    zh_summ: float
    en_fewshot: float
    zh_fewshot: float
    en_synthetic: float
    zh_synthetic: float
    code_avg: float


# Define task categories
TASK_CATEGORIES = {
    "en_multidoc": ["longbench_hotpotqa", "longbench_2wikimqa", "longbench_musique"],
    "zh_multidoc": ["longbench_dureader"],
    "en_singledoc": [
        "longbench_multifieldqa_en",
        "longbench_narrativeqa",
        "longbench_qasper",
    ],
    "zh_singledoc": ["longbench_multifieldqa_zh"],
    "en_summ": ["longbench_gov_report", "longbench_qmsum", "longbench_multi_news"],
    "zh_summ": ["longbench_vcsum"],
    "en_fewshot": ["longbench_triviaqa", "longbench_samsum", "longbench_trec"],
    "zh_fewshot": ["longbench_lsht"],
    "en_synthetic": ["longbench_passage_retrieval_en", "longbench_passage_count"],
    "zh_synthetic": ["longbench_passage_retrieval_zh"],
    "code_avg": ["longbench_lcc", "longbench_repobench-p"],
}

# Flatten the categories to get all tasks
ALL_LONGBENCH_TASKS = []
for task in TASK_CATEGORIES.values():
    ALL_LONGBENCH_TASKS.extend(task)

# Task to metric mapping
TASK_METRICS = {
    "longbench_hotpotqa": "qa_f1_score",
    "longbench_2wikimqa": "qa_f1_score",
    "longbench_musique": "qa_f1_score",
    "longbench_dureader": "rouge_zh_score",
    "longbench_multifieldqa_en": "qa_f1_score",
    "longbench_narrativeqa": "qa_f1_score",
    "longbench_qasper": "qa_f1_score",
    "longbench_multifieldqa_zh": "qa_f1_zh_score",
    "longbench_gov_report": "rouge_score",
    "longbench_qmsum": "rouge_score",
    "longbench_multi_news": "rouge_score",
    "longbench_vcsum": "rouge_zh_score",
    "longbench_triviaqa": "qa_f1_score",
    "longbench_samsum": "rouge_score",
    "longbench_trec": "classification_score",
    "longbench_lsht": "classification_score",
    "longbench_passage_retrieval_en": "retrieval_score",
    "longbench_passage_count": "count_score",
    "longbench_passage_retrieval_zh": "retrieval_zh_score",
    "longbench_lcc": "code_sim_score",
    "longbench_repobench-p": "code_sim_score",
}

# Default configuration parameters
# pylint: disable=use-dict-literal
DEFAULT_EVAL_CONFIG = dict(
    batch_size="auto",
    apply_chat_template=True,
    fewshot_as_multiturn=True,
    confirm_run_unsafe_code=True,
    system_instruction=None,
    cache_requests=False,
)

# vLLM-specific configuration - using longer context window than leaderboard
# pylint: disable=use-dict-literal
DEFAULT_VLLM_CONFIG = dict(
    dtype="float16",
    gpu_memory_utilization=0.8,
    disable_custom_all_reduce=True,
    enforce_eager=False,
    max_model_len=131072,  # 128K context for LongBench
)

# OpenAI API configuration parameters
# pylint: disable=use-dict-literal
DEFAULT_OPENAI_CONFIG = dict(
    max_tokens=768,
    temperature=0.0,
    seed=1337,
)


class LongBenchEvaluator(Evaluator):
    """
    Evaluator for LongBench benchmark.

    Attributes:
        model_path: Path to the model or model name for API
        tasks: List of subtasks to evaluate (default is all tasks)
        num_gpus: Number of GPUs to use for local evaluation
        output_file: Path to save results to
        eval_config: Configuration for evaluation parameters
        vllm_config: Configuration for vLLM-specific parameters
        openai_config: Configuration for OpenAI API parameters
        api_endpoint: Optional OpenAI-compatible API endpoint
    """

    name = "longbench"

    def __init__(
        self,
        model_path: str,
        model_name: str,
        tasks: t.Optional[t.List[str]] = None,
        num_gpus: t.Optional[int] = None,
        output_file: t.Optional[str] = None,
        eval_config: t.Optional[t.Dict[str, t.Any]] = None,
        vllm_config: t.Optional[t.Dict[str, t.Any]] = None,
        openai_config: t.Optional[t.Dict[str, t.Any]] = None,
        api_endpoint: t.Optional[str] = None,
    ):
        self.model_path = model_path
        self.model_name = model_name
        self.tasks = tasks or ALL_LONGBENCH_TASKS

        # If using API, no need to check CUDA
        self.api_endpoint = api_endpoint
        if not api_endpoint and not cuda.is_available():
            raise ValueError(
                "Running without CUDA is currently unsupported unless using an API endpoint"
            )

        self.num_gpus = num_gpus or cuda.device_count()
        self.output_file = output_file
        self.eval_config = eval_config if eval_config else {}
        self.vllm_config = vllm_config if vllm_config else {}
        self.openai_config = openai_config if openai_config else {}
        self._results: t.Optional[LongBenchResult] = None
        self._lm_eval_results: t.Optional[t.Dict[str, t.Any]] = None

    def _get_task_averages(self, results: dict) -> LongBenchResult:
        """Calculate averages for each task type and language from raw results"""
        eval_results = defaultdict(float)
        results_data = results["results"]

        # Track which categories have data
        active_categories = {}

        # Process each category
        for category, category_tasks in TASK_CATEGORIES.items():
            # Filter tasks that were actually run
            active_tasks = [task for task in category_tasks if task in results_data]

            if active_tasks:
                # Get scores for active tasks
                scores = []
                # pylint: disable=redefined-outer-name
                for task in active_tasks:
                    metric_key = f"{TASK_METRICS[task]},none"
                    if task in results_data and metric_key in results_data[task]:
                        scores.append(results_data[task][metric_key])

                if scores:
                    # Calculate average for this category
                    eval_results[category] = sum(scores) / len(scores)
                    active_categories[category] = len(scores)

        # Calculate overall score from active categories
        category_scores = [v for k, v in eval_results.items() if k != "overall_score"]
        if category_scores:
            eval_results["overall_score"] = sum(category_scores) / len(category_scores)
        else:
            eval_results["overall_score"] = 0.0

        return t.cast(LongBenchResult, dict(eval_results))

    def run(
        self,
        model_path: t.Optional[str] = None,
        model_name: t.Optional[str] = None,
        tasks: t.Optional[t.List[str]] = None,
        num_gpus: t.Optional[int] = None,
        output_file: t.Optional[str] = None,
        eval_config: t.Optional[t.Dict[str, t.Any]] = None,
        vllm_config: t.Optional[t.Dict[str, t.Any]] = None,
        openai_config: t.Optional[t.Dict[str, t.Any]] = None,
        api_endpoint: t.Optional[str] = None,
    ) -> LongBenchResult:
        """Run the LongBench evaluation"""
        model_path = model_path or self.model_path
        model_name = model_name or self.model_name
        tasks = tasks or self.tasks
        num_gpus = num_gpus or self.num_gpus
        output_file = output_file or self.output_file
        api_endpoint = api_endpoint or self.api_endpoint

        # Merge configurations
        final_eval_config = {}
        final_eval_config.update(DEFAULT_EVAL_CONFIG)
        final_eval_config.update(self.eval_config)
        if eval_config:
            final_eval_config.update(eval_config)

        final_vllm_config = {}
        final_vllm_config.update(DEFAULT_VLLM_CONFIG)
        final_vllm_config.update(self.vllm_config)
        if vllm_config:
            final_vllm_config.update(vllm_config)

        final_openai_config = {}
        final_openai_config.update(DEFAULT_OPENAI_CONFIG)
        final_openai_config.update(self.openai_config)
        if openai_config:
            final_openai_config.update(openai_config)

        # Extract system_instruction if provided
        system_instruction = final_eval_config.pop("system_instruction", None)

        # Run evaluation with the appropriate backend
        if api_endpoint:
            base_url = api_endpoint
            api_key = final_openai_config.pop("api_key", None)

            # Build model args
            model_args = {
                "model": model_name,
                "tokenizer": model_path,
                "base_url": base_url,
            }
            # Optionally add max_length
            if "max_length" in final_openai_config:
                model_args["max_length"] = str(final_openai_config["max_length"])

            if api_key:
                model_args["api_key"] = str(api_key)

            # Add any other openai_config keys if needed
            # model_args.update(final_openai_config)

            # Run evaluation
            results = simple_evaluate(
                tasks=tasks,
                model="local-completions",
                model_args=model_args,
                system_instruction=system_instruction,
                **final_eval_config,
            )
        else:
            # Prepare vLLM model args
            model_args = {
                "pretrained": model_path,
                "data_parallel_size": str(num_gpus),
            }
            # Add vllm config properly - convert all values to strings
            string_vllm_config = {k: str(v) for k, v in final_vllm_config.items()}
            model_args.update(string_vllm_config)

            # Run evaluation
            results = simple_evaluate(
                tasks=tasks,
                model="vllm",
                model_args=model_args,
                system_instruction=system_instruction,
                **final_eval_config,
            )

        self._lm_eval_results = results
        self._results = self._get_task_averages(results)

        if output_file:
            self.save_to_file(output_file)

        return self._results

    @property
    def results(self) -> t.Optional[LongBenchResult]:
        """Returns the results of the most recent evaluation"""
        return self._results

    def save_to_file(self, output_file: t.Optional[str] = None) -> None:
        """Save results to a JSON file"""
        output_file = output_file or self.output_file
        if not output_file:
            raise ValueError("Output file path cannot be empty")

        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(self._results, f, indent=2)
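
For reference, a minimal usage sketch of the new evaluator against an OpenAI-compatible endpoint. Only the class, its constructor signature, and the task names come from the file above; the endpoint URL, model names, and output path are placeholders:

# Usage sketch with placeholder values (not part of the commit).
from instructlab.eval.longbench import LongBenchEvaluator

evaluator = LongBenchEvaluator(
    model_path="/path/to/tokenizer-or-model",            # tokenizer path for API runs
    model_name="served-model-name",                      # model name exposed by the endpoint
    tasks=["longbench_hotpotqa", "longbench_qasper"],    # any subset of ALL_LONGBENCH_TASKS
    output_file="results/longbench.json",
    api_endpoint="http://localhost:8000/v1/completions",  # OpenAI-compatible completions URL
)
scores = evaluator.run()    # LongBenchResult dict with per-category averages
print(scores["overall_score"])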

tests/test_project.py

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,7 @@
 # First Party
 from instructlab.eval.evaluator import Evaluator
 from instructlab.eval.leaderboard import LeaderboardV2Evaluator
+from instructlab.eval.longbench import LongBenchEvaluator
 from instructlab.eval.mmlu import MMLUBranchEvaluator, MMLUEvaluator
 from instructlab.eval.mt_bench import MTBenchBranchEvaluator, MTBenchEvaluator
 from instructlab.eval.ruler import RulerEvaluator
@@ -17,6 +18,7 @@ def test_evaluator_eps():
         "mt_bench": MTBenchEvaluator,
         "mt_bench_branch": MTBenchBranchEvaluator,
         "leaderboard_v2": LeaderboardV2Evaluator,
+        "longbench": LongBenchEvaluator,
         "ruler": RulerEvaluator,
     }
     eps = entry_points(group="instructlab.eval.evaluator")
