下面是一版单文件、可直接跑的 Python 测评代码。它兼容 OpenAI-compatible API,支持:
- 拉取 cais/hle 数据集
- 批量请求模型
- 保存预测结果
- 可选用 judge model 做判分
- 输出 accuracy 和 calibration error
python
# hle_eval_single.py
# -*- coding: utf-8 -*-
import os
import re
import json
import math
import time
import copy
import argparse
import asyncio
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from datasets import load_dataset
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm_asyncio
# Prompt for the evaluated model: forces an Explanation / Answer / Confidence
# layout that extract_answer_and_confidence() parses later.
SYSTEM_PROMPT = """Your response should be in the following format:
Explanation: {your explanation for your answer choice}
Answer: {your chosen answer}
Confidence: {your confidence score between 0% and 100% for your answer}
"""
# Prompt for the optional LLM judge.  It requests a strict JSON verdict with
# extracted_final_answer / reasoning / correct / confidence keys.
JUDGE_PROMPT = """Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.
[question]:
{question}
[response]:
{response}
Your judgement must be a JSON object with the following keys:
- extracted_final_answer: The final exact answer extracted from [response]. Put "None" if there is no exact final answer.
- reasoning: Explain only whether the extracted_final_answer matches [correct_answer]. Do not solve the question again.
- correct: "yes" if extracted_final_answer matches [correct_answer], or is within a small margin of error for numerical problems. Otherwise "no".
- confidence: The extracted confidence score between 0 and 100 from [response]. Put 100 if there is no confidence score available.
[correct_answer]:
{correct_answer}
"""
def normalize_text(s: Optional[str]) -> str:
    """Normalize arbitrary text for comparison.

    Lowercases, trims, collapses internal whitespace, and folds the
    full-width percent sign to ASCII.  Returns "" for None.
    """
    if s is None:
        return ""
    s = str(s).strip().lower()
    # BUG FIX: the original code did s.replace("%", "%"), a no-op.  The clear
    # intent was to fold the full-width percent sign (U+FF05, common in CJK
    # model output) to ASCII "%" so "50%" and "50%" compare equal.
    s = s.replace("\uff05", "%")
    s = re.sub(r"\s+", " ", s)
    return s
def normalize_answer_for_match(s: Optional[str]) -> str:
    """Normalize an answer string for loose equality checks.

    Applies the general text normalization, then strips punctuation and
    bracket characters that commonly wrap a final answer.
    """
    cleaned = normalize_text(s)
    return cleaned.strip(" .,:;!?'\"()[]{}")
def extract_answer_and_confidence(response_text: str) -> Tuple[str, int]:
    """Pull the "Answer:" and "Confidence:" fields out of a model response.

    Falls back to the last non-empty line for the answer, and to the first
    "NN%" occurrence anywhere in the text for the confidence.  Confidence
    defaults to 100 and is clamped to [0, 100].
    """
    def _clamp(value: int) -> int:
        return max(0, min(100, value))

    # Answer: prefer an explicit "Answer: ..." line.
    answer_match = re.search(r"(?im)^\s*answer\s*:\s*(.+?)\s*$", response_text)
    if answer_match:
        answer = answer_match.group(1).strip()
    else:
        # Fallback: use the last non-empty line.
        non_empty = [ln.strip() for ln in response_text.splitlines() if ln.strip()]
        answer = non_empty[-1] if non_empty else ""

    # Confidence: prefer an explicit "Confidence: NN" line.
    confidence = 100
    conf_match = re.search(
        r"(?im)^\s*confidence\s*:\s*([0-9]{1,3})(?:\s*%|\b)", response_text
    )
    if conf_match:
        confidence = _clamp(int(conf_match.group(1)))
    else:
        # Fallback: scan the whole text for the first "NN%" pattern.
        any_pct = re.search(r"([0-9]{1,3})\s*%", response_text)
        if any_pct:
            confidence = _clamp(int(any_pct.group(1)))

    return answer, confidence
def try_mcq_match(pred_answer: str, gold_answer: str, sample: Dict[str, Any]) -> bool:
    """Loosely match a multiple-choice prediction against the gold answer.

    Strategy:
      1) direct normalized comparison;
      2) if the prediction is an option letter (e.g. "A", "(b)", "c.") and
         the gold answer is the option text, map the letter through the
         sample's options field (list or dict) and compare the option text.
    """
    pred_norm = normalize_answer_for_match(pred_answer)
    gold_norm = normalize_answer_for_match(gold_answer)
    if pred_norm == gold_norm:
        return True

    # Detect a leading option letter in common formats.
    letter: Optional[str] = None
    lead = re.match(r"^\(?([a-z])\)?(?:[.:)\- ]|$)", pred_norm)
    if lead:
        letter = lead.group(1).upper()
    elif len(pred_norm) == 1 and pred_norm.isalpha():
        letter = pred_norm.upper()

    # Find the sample's options under any of the common key names.
    options = next(
        (
            sample[key]
            for key in ("options", "choices", "answer_choices", "candidate_answers")
            if key in sample and sample[key]
        ),
        None,
    )

    if options and letter:
        # Supports both list[str] and {"A": "...", "B": "..."} shapes.
        if isinstance(options, list):
            pos = ord(letter) - ord("A")
            if 0 <= pos < len(options):
                return normalize_answer_for_match(str(options[pos])) == gold_norm
        elif isinstance(options, dict):
            if letter in options:
                return normalize_answer_for_match(str(options[letter])) == gold_norm
    return False
def calib_err(confidence: np.ndarray, correct: np.ndarray, p: str = "2", beta: int = 100) -> float:
    """Binned calibration error, matching the official HLE judge script.

    Sorts predictions by confidence, partitions them into bins of roughly
    `beta` examples (the last bin absorbs the remainder), and aggregates the
    per-bin |mean confidence - mean accuracy| gap under an L1, L2, or
    L-infinity norm selected by `p`.
    """
    n = len(confidence)
    if n == 0:
        return 0.0

    # Sort both arrays by ascending confidence.
    order = np.argsort(confidence)
    confidence = confidence[order]
    correct = correct[order]

    # Shrink the bin size if there are fewer examples than one full bin.
    if n < beta:
        beta = max(1, n)

    edges = [[i * beta, (i + 1) * beta] for i in range(max(1, n // beta))]
    edges[-1] = [edges[-1][0], n]  # final bin takes the remainder

    cerr = 0.0
    for start, end in edges:
        bin_size = end - start
        if bin_size <= 0:
            continue
        gap = abs(np.nanmean(confidence[start:end]) - np.nanmean(correct[start:end]))
        if p == "2":
            cerr += (bin_size / n) * gap ** 2
        elif p == "1":
            cerr += (bin_size / n) * gap
        elif p in ("infty", "infinity", "max"):
            cerr = max(cerr, gap)
        else:
            raise ValueError("p must be '1', '2', or 'infty'")

    return float(math.sqrt(cerr)) if p == "2" else float(cerr)
class HLEEvaluator:
    """End-to-end HLE evaluation pipeline.

    Responsibilities:
      * load questions from the Hugging Face dataset,
      * query the candidate model asynchronously (resumable via saved JSON),
      * grade answers with an LLM judge or a local exact/MCQ matcher,
      * print accuracy and calibration-error metrics.
    """

    def __init__(self, args: argparse.Namespace):
        self.args = args
        # Client used to query the evaluated model.
        self.client = AsyncOpenAI(
            api_key=args.api_key or os.getenv("OPENAI_API_KEY"),
            base_url=args.base_url or os.getenv("OPENAI_BASE_URL"),
            timeout=args.timeout,
            max_retries=1,
        )
        # Judge client; falls back to the main model's credentials when no
        # judge-specific key/endpoint is configured.
        self.judge_client = AsyncOpenAI(
            api_key=args.judge_api_key or args.api_key or os.getenv("OPENAI_API_KEY"),
            base_url=args.judge_base_url or args.base_url or os.getenv("OPENAI_BASE_URL"),
            timeout=args.timeout,
            max_retries=1,
        )

    def load_questions(self) -> List[Dict[str, Any]]:
        """Load the dataset split and return it as a list of row dicts."""
        ds = load_dataset(
            self.args.dataset,
            split=self.args.split,
            token=self.args.hf_token or os.getenv("HF_TOKEN"),
        ).to_dict()
        # to_dict() yields column-major data; transpose it into row dicts.
        questions = [dict(zip(ds.keys(), values)) for values in zip(*ds.values())]
        if self.args.max_samples:
            questions = questions[: self.args.max_samples]
        return questions

    def build_messages(self, q: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Build the chat messages for one question (text plus optional image)."""
        question_text = q["question"]
        text_content = {"type": "text", "text": question_text}
        content = [text_content]
        # The official dataset may store an empty string in the image field.
        image_url = q.get("image")
        if image_url:
            content.append({"type": "image_url", "image_url": {"url": image_url}})
        # o1-style models reject a system role, so send the prompt as a user turn.
        system_role = "user" if "o1" in self.args.model else "system"
        return [
            {"role": system_role, "content": SYSTEM_PROMPT},
            {"role": "user", "content": content},
        ]

    async def call_model_once(self, q: Dict[str, Any]) -> Optional[Tuple[str, Dict[str, Any]]]:
        """Query the model for a single question.

        Returns (question_id, result_dict) on success, or None on any failure
        (the error is printed; the caller skips None entries so the question
        can be retried on a later run).
        """
        try:
            response = await self.client.chat.completions.create(
                model=self.args.model,
                messages=self.build_messages(q),
                max_completion_tokens=self.args.max_completion_tokens,
                # o1-style models do not accept a temperature parameter.
                temperature=self.args.temperature if "o1" not in self.args.model else None,
                stream=False,
            )
            content = response.choices[0].message.content or ""
            usage = {}
            if getattr(response, "usage", None) is not None:
                try:
                    usage = json.loads(response.usage.json())
                except Exception:
                    # Some SDK versions lack .json(); read fields directly.
                    usage = {
                        "prompt_tokens": getattr(response.usage, "prompt_tokens", None),
                        "completion_tokens": getattr(response.usage, "completion_tokens", None),
                        "total_tokens": getattr(response.usage, "total_tokens", None),
                    }
            pred_answer, pred_conf = extract_answer_and_confidence(content)
            result = {
                "id": q["id"],
                "model": self.args.model,
                "response": content,
                "pred_answer": pred_answer,
                "pred_confidence": pred_conf,
                "usage": usage,
            }
            return q["id"], result
        except Exception as e:
            print(f"[ERROR] model call failed for {q.get('id')}: {e}")
            return None

    async def predict_all(self, questions: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Run predictions for all questions, resuming from a previous run.

        Results are keyed by question id and persisted to args.predictions_out.
        """
        save_path = self.args.predictions_out
        if os.path.exists(save_path):
            with open(save_path, "r", encoding="utf-8") as f:
                predictions = json.load(f)
        else:
            predictions = {}
        # Only query questions that have no saved prediction yet (resume support).
        pending = [q for q in questions if q["id"] not in predictions]
        sem = asyncio.Semaphore(self.args.num_workers)

        async def bound_call(q: Dict[str, Any]):
            # Cap concurrency at args.num_workers simultaneous requests.
            async with sem:
                return await self.call_model_once(q)

        tasks = [bound_call(q) for q in pending]
        results = await tqdm_asyncio.gather(*tasks)
        for item in results:
            if item is None:
                continue  # failed calls are skipped; retried on the next run
            qid, result = item
            predictions[qid] = result
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(predictions, f, ensure_ascii=False, indent=2)
        return predictions

    async def judge_one_with_llm(self, q: Dict[str, Any], pred: Dict[str, Any]) -> Dict[str, Any]:
        """Grade one prediction with the judge model.

        Falls back to the local exact/MCQ matcher if the judge call or its
        JSON parsing fails.
        """
        prompt = JUDGE_PROMPT.format(
            question=q["question"],
            response=pred["response"],
            correct_answer=q["answer"],
        )
        try:
            response = await self.judge_client.chat.completions.create(
                model=self.args.judge_model,
                messages=[{"role": "user", "content": prompt}],
                max_completion_tokens=2048,
                temperature=0,
                stream=False,
            )
            text = response.choices[0].message.content or ""
            # Try to grab a JSON object from the reply (judges often wrap it in prose).
            json_text = text.strip()
            m = re.search(r"\{.*\}", text, flags=re.S)
            if m:
                json_text = m.group(0)
            obj = json.loads(json_text)
            correct = str(obj.get("correct", "no")).strip().lower()
            confidence = obj.get("confidence", pred.get("pred_confidence", 100))
            try:
                confidence = int(confidence)
            except Exception:
                confidence = pred.get("pred_confidence", 100)
            confidence = max(0, min(100, confidence))
            return {
                "correct_answer": q["answer"],
                "model_answer": obj.get("extracted_final_answer", pred.get("pred_answer", "")),
                "reasoning": obj.get("reasoning", ""),
                "correct": "yes" if correct == "yes" else "no",
                "confidence": confidence,
            }
        except Exception as e:
            print(f"[ERROR] judge failed for {q.get('id')}: {e}")
            # Fall back to the local exact match when the judge fails.
            local_correct = self.local_judge(q, pred)
            return {
                "correct_answer": q["answer"],
                "model_answer": pred.get("pred_answer", ""),
                "reasoning": "fallback local judge",
                "correct": "yes" if local_correct else "no",
                "confidence": pred.get("pred_confidence", 100),
            }

    def local_judge(self, q: Dict[str, Any], pred: Dict[str, Any]) -> bool:
        """Grade one prediction locally without any LLM call."""
        gold = q["answer"]
        pa = pred.get("pred_answer", "")
        # First try the multiple-choice matching rules.
        if try_mcq_match(pa, gold, q):
            return True
        # Then plain normalized exact match.
        return normalize_answer_for_match(pa) == normalize_answer_for_match(gold)

    async def judge_all(self, questions: List[Dict[str, Any]], predictions: Dict[str, Any]) -> Dict[str, Any]:
        """Grade every prediction, resuming from a previous judged file.

        Uses the LLM judge when args.judge_model is set; otherwise grades
        synchronously with the local matcher.  Persists to args.judged_out.
        """
        save_path = self.args.judged_out
        if os.path.exists(save_path):
            with open(save_path, "r", encoding="utf-8") as f:
                judged = json.load(f)
        else:
            judged = {}
        qmap = {q["id"]: q for q in questions}
        # Only grade predictions not already judged (resume support).
        pending_ids = [qid for qid in predictions if qid in qmap and qid not in judged]
        if not self.args.judge_model:
            # No judge model configured: local grading, no concurrency needed.
            for qid in pending_ids:
                q = qmap[qid]
                pred = predictions[qid]
                correct = self.local_judge(q, pred)
                judged[qid] = copy.deepcopy(pred)
                judged[qid]["judge_response"] = {
                    "correct_answer": q["answer"],
                    "model_answer": pred.get("pred_answer", ""),
                    "reasoning": "local exact/MCQ match",
                    "correct": "yes" if correct else "no",
                    "confidence": pred.get("pred_confidence", 100),
                }
            with open(save_path, "w", encoding="utf-8") as f:
                json.dump(judged, f, ensure_ascii=False, indent=2)
            return judged
        sem = asyncio.Semaphore(self.args.num_workers)

        async def bound_judge(qid: str):
            # Cap concurrency at args.num_workers simultaneous judge calls.
            async with sem:
                q = qmap[qid]
                pred = predictions[qid]
                jr = await self.judge_one_with_llm(q, pred)
                out = copy.deepcopy(pred)
                out["judge_response"] = jr
                return qid, out

        tasks = [bound_judge(qid) for qid in pending_ids]
        results = await tqdm_asyncio.gather(*tasks)
        for qid, item in results:
            judged[qid] = item
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(judged, f, ensure_ascii=False, indent=2)
        return judged

    @staticmethod
    def dump_metrics(judged: Dict[str, Any], n_total: int) -> None:
        """Print accuracy (with a 95% Wald half-width) and calibration error.

        Note: accuracy uses n_total as the denominator, so questions without
        a judged prediction count as wrong — same convention as the official
        HLE judging script.
        """
        correct = []
        confidence = []
        for _, item in judged.items():
            jr = item.get("judge_response", {})
            correct.append(1 if jr.get("correct") == "yes" else 0)
            confidence.append(jr.get("confidence", 100) / 100.0)
        correct = np.array(correct, dtype=np.float32)
        confidence = np.array(confidence, dtype=np.float32)
        n_pred = len(correct)
        if n_pred == 0:
            print("No judged predictions found.")
            return
        accuracy = round(100.0 * float(correct.sum()) / float(n_total), 2)
        # 95% binomial (Wald) interval half-width, in percentage points.
        half_width = round(1.96 * math.sqrt(accuracy * (100 - accuracy) / max(1, n_total)), 2)
        cal_error = round(100.0 * calib_err(confidence, correct, p="2", beta=min(100, max(1, n_pred))), 2)
        print("\n*** Metrics ***")
        print(f"Available judged predictions: {n_pred} / total questions: {n_total}")
        print(f"Accuracy: {accuracy}% +/- {half_width}% | n = {n_total}")
        print(f"Calibration Error: {cal_error}")

    async def run(self) -> None:
        """Full pipeline: load -> predict -> judge -> report metrics."""
        t0 = time.time()
        questions = self.load_questions()
        print(f"Loaded {len(questions)} questions from {self.args.dataset}:{self.args.split}")
        predictions = await self.predict_all(questions)
        print(f"Saved predictions -> {self.args.predictions_out}")
        judged = await self.judge_all(questions, predictions)
        print(f"Saved judged results -> {self.args.judged_out}")
        self.dump_metrics(judged, n_total=len(questions))
        print(f"\nDone in {time.time() - t0:.1f}s")
def parse_args() -> argparse.Namespace:
    """Define and parse the command-line interface for the evaluator."""
    cli = argparse.ArgumentParser(description="Single-file HLE evaluator")

    # Dataset selection
    cli.add_argument("--dataset", type=str, default="cais/hle")
    cli.add_argument("--split", type=str, default="test")
    cli.add_argument("--hf_token", type=str, default=None)
    cli.add_argument("--max_samples", type=int, default=None)

    # Candidate model / API endpoint
    cli.add_argument("--model", type=str, required=True)
    cli.add_argument("--api_key", type=str, default=None)
    cli.add_argument("--base_url", type=str, default=None)
    cli.add_argument("--temperature", type=float, default=0.0)
    cli.add_argument("--max_completion_tokens", type=int, default=8192)
    cli.add_argument("--timeout", type=float, default=600.0)
    cli.add_argument("--num_workers", type=int, default=16)

    # Optional LLM judge
    cli.add_argument(
        "--judge_model",
        type=str,
        default=None,
        help="例如 o3-mini-2025-01-31;不填则用本地 exact/MCQ match",
    )
    cli.add_argument("--judge_api_key", type=str, default=None)
    cli.add_argument("--judge_base_url", type=str, default=None)

    # Output files
    cli.add_argument("--predictions_out", type=str, default="hle_predictions.json")
    cli.add_argument("--judged_out", type=str, default="hle_judged.json")

    return cli.parse_args()
def main():
    """CLI entry point: parse arguments and drive the async evaluator."""
    cli_args = parse_args()
    asyncio.run(HLEEvaluator(cli_args).run())


if __name__ == "__main__":
    main()
安装
bash
pip install datasets openai tqdm numpy
运行示例
1)只做预测 + 本地简单判分
bash
export OPENAI_API_KEY=your_key
python hle_eval_single.py \
--model gpt-4o-2024-11-20 \
--max_samples 50 \
--num_workers 8
2)预测 + judge model 判分
bash
export OPENAI_API_KEY=your_key
python hle_eval_single.py \
--model gpt-4o-2024-11-20 \
--judge_model o3-mini-2025-01-31 \
--max_samples 50 \
--num_workers 8
3)接 vLLM / OpenAI-compatible 本地服务
bash
python hle_eval_single.py \
--model your-model-name \
--base_url http://127.0.0.1:8000/v1 \
--api_key EMPTY \
--judge_model your-judge-model-name \
--judge_base_url http://127.0.0.1:8000/v1 \
--judge_api_key EMPTY \
--max_samples 50
说明
这版脚本是按官方思路整理的"单文件版":
- 预测 prompt 沿用了官方格式
- 支持 question / image / answer / id 字段
- judge 逻辑优先走 LLM judge;不配 judge 时,就退化为本地 exact match / 多选匹配
- 会输出两份文件:hle_predictions.json 和 hle_judged.json
更稳的正式复现实验,还是建议直接跑官方仓库里的两步脚本;官方 README 里给了标准命令。 ([GitHub][1])
参考链接:
[1]: https://github.com/centerforaisafety/hle "GitHub - centerforaisafety/hle: Humanity's Last Exam"