LLM自动化评测

使用的数据集:ceval-exam

以下为 Python 评测脚本:
import re
import time

import ollama
import requests
import tiktoken
from datasets import load_dataset, concatenate_datasets
from ollama import ChatResponse, Options
from tqdm import tqdm


def llm(model, query, temperature=0.6, stream=False, encoding=None, max_tokens=None):
    """Ask an Ollama chat model one single-choice question and return its reply text.

    Parameters:
        model: Ollama model name, e.g. "qwen2.5:3b-instruct-q8_0".
        query: user prompt (question text plus the A-D options).
        temperature: sampling temperature passed through to Ollama.
        stream: if True, consume the streamed response chunk by chunk.
        encoding: tiktoken encoding used to count tokens while streaming.
            Defaults to the "gpt-4" encoding, created lazily on first use
            (the original evaluated it at import time as a default argument,
            which is heavy and shared across calls).
        max_tokens: when streaming, stop accumulating once the reply exceeds
            this many tokens (None = no limit).

    Returns:
        The model's reply with any <think>...</think> reasoning block removed
        and surrounding whitespace stripped.
    """
    options = Options(
        temperature=temperature,
        num_gpu=0,  # num_gpu=0 forces CPU-only inference
        # num_thread=32,
        # num_ctx=4096,  # context window size
    )
    response = ollama.chat(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "你是一个做题专家。请完成下列单项选择题。\n\n## output format\n只能输出一个选项编号字母,不要有解析等其他任何内容。",
            },
            {
                "role": "user",
                "content": query,
            },
        ],
        options=options,
        stream=stream,
        keep_alive=0
    )
    if stream:
        # Build the default encoding lazily, and only when it is actually
        # needed for the token-budget check.
        if encoding is None and max_tokens is not None:
            encoding = tiktoken.encoding_for_model("gpt-4")
        chunks = ""
        # Accumulate the streamed chunks into one reply string.
        for chunk in response:
            chunks += chunk["message"]["content"]
            # print(chunk["message"]["content"], end="", flush=True)
            # Stop early once the accumulated reply exceeds the token budget.
            if max_tokens is not None and len(encoding.encode(chunks)) > max_tokens:
                break
        response = chunks
    else:
        # print(response["message"]["content"])
        response = response["message"]["content"]

    # Strip reasoning traces emitted by "thinking" models (e.g. deepseek-r1),
    # so only the final answer letter remains for extraction.
    if '<think>' in response and '</think>' in response:
        response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
    return response.strip()


# C-Eval benchmark subject identifiers (dataset config names for
# load_dataset("ceval/data", name=...)), in the same order as
# task_chinese_name_list below — the two lists are indexed in parallel.
task_list = [
    "computer_network",
    "operating_system",
    "computer_architecture",
    "college_programming",
    "college_physics",
    "college_chemistry",
    "advanced_mathematics",
    "probability_and_statistics",
    "discrete_mathematics",
    "electrical_engineer",
    "metrology_engineer",
    "high_school_mathematics",
    "high_school_physics",
    "high_school_chemistry",
    "high_school_biology",
    "middle_school_mathematics",
    "middle_school_biology",
    "middle_school_physics",
    "middle_school_chemistry",
    "veterinary_medicine",
    "college_economics",
    "business_administration",
    "marxism",
    "mao_zedong_thought",
    "education_science",
    "teacher_qualification",
    "high_school_politics",
    "high_school_geography",
    "middle_school_politics",
    "middle_school_geography",
    "modern_chinese_history",
    "ideological_and_moral_cultivation",
    "logic",
    "law",
    "chinese_language_and_literature",
    "art_studies",
    "professional_tour_guide",
    "legal_professional",
    "high_school_chinese",
    "high_school_history",
    "middle_school_history",
    "civil_servant",
    "sports_science",
    "plant_protection",
    "basic_medicine",
    "clinical_medicine",
    "urban_and_rural_planner",
    "accountant",
    "fire_engineer",
    "environmental_impact_assessment_engineer",
    "tax_accountant",
    "physician",
]
# Chinese display names for the C-Eval subjects, indexed in parallel with
# task_list above (used only for human-readable log/print output).
task_chinese_name_list = [
    "计算机网络",
    "操作系统",
    "计算机架构",
    "大学编程",
    "大学物理",
    "大学化学",
    "高等数学",
    "概率与统计",
    "离散数学",
    "电气工程师",
    "计量工程师",
    "高中数学",
    "高中物理",
    "高中化学",
    "高中生物学",
    "中学数学",
    "中学生物学",
    "中学物理",
    "中学化学",
    "兽医学",
    "大学经济学",
    "工商管理",
    "马克思主义",
    "毛泽东思想",
    "教育科学",
    "教师资格",
    "高中政治",
    "高中地理",
    "中学政治",
    "中学地理",
    "现代中国史",
    "思想道德修养",
    "逻辑",
    "法律",
    "汉语与文学",
    "艺术研究",
    "专业旅游指南",
    "法律专业",
    "高中汉语",
    "高中历史",
    "中学历史",
    "公务员",
    "体育科学",
    "植物保护",
    "基础医学",
    "临床医学",
    "城市与农村规划",
    "会计",
    "消防工程师",
    "环境影响评估工程师",
    "税务会计",
    "医生",
]

def test_split(model_name, start_index=26):
    """Evaluate *model_name* on C-Eval subjects and log per-subject accuracy.

    For each subject in task_list[start_index:], the dev and val splits are
    concatenated, every question is posed to the model via llm(), and the
    first A-D letter found in the reply is compared against the gold answer.
    Per-subject and overall accuracy are printed and appended to
    "<sanitized model name>.txt".

    Parameters:
        model_name: Ollama model name; ':' and '/' are replaced with '_' to
            build the result file name.
        start_index: index into task_list to resume from. Defaults to 26 to
            preserve the original script's hard-coded resume point; pass 0
            to evaluate every subject.
    """
    encoding = tiktoken.encoding_for_model("gpt-4")
    model_name_write = model_name.replace(":", "_").replace("/", "_")
    sum_total = 0
    sum_correct = 0
    for i in range(start_index, len(task_list)):
        try:
            dataset_tmp = load_dataset(r"ceval/data", name=task_list[i])
            # Evaluate on dev + val together (test split has no public answers).
            dataset = concatenate_datasets(
                [dataset_tmp["dev"], dataset_tmp["val"]]
            )
            print(f"\nNo.{i}: {task_list[i]}({task_chinese_name_list[i]})数据集加载完成, len(dataset)={len(dataset)}")
        except Exception as e:  # narrowed from bare except; best-effort skip, but log why
            print(f"\nNo.{i}: {task_list[i]}({task_chinese_name_list[i]})数据集加载失败: {e}")
            continue

        correct = 0
        total = len(dataset)
        for item in tqdm(dataset, desc=f"No.{i}: Processing"):
            try:
                # Build the full single-choice prompt from question + options.
                user_prompt = f"{item['question']}\nA. {item['A']}\nB. {item['B']}\nC. {item['C']}\nD. {item['D']}\n答案:"

                # Query the model; stream so the token budget can cut off runaway replies.
                model_answer = llm(model_name, user_prompt, stream=True, encoding=encoding, max_tokens=4096)
                # Extract the first option letter (A/B/C/D) from the reply.
                match = re.search(r"[A-D]", model_answer.upper())
                if match and match.group(0) == item["answer"]:
                    correct += 1
            except Exception as e:  # narrowed from bare except; log the failure instead of hiding it
                print(f"\nerror: {e}")

        sum_total += total
        sum_correct += correct
        # Guard against an empty split to avoid ZeroDivisionError.
        accuracy = correct / total if total else 0.0
        result_line = f"No.{i}: {task_list[i]}({task_chinese_name_list[i]})数据集准确率: {correct}/{total} = {accuracy:.2%}"
        print(result_line)
        with open(f"{model_name_write}.txt", "a", encoding="utf-8") as f:
            f.write(result_line + "\n\n")

    # Overall accuracy; guarded in case every subject failed to load.
    overall = sum_correct / sum_total if sum_total else 0.0
    summary_line = f"总准确率: {sum_correct}/{sum_total} = {overall:.2%}"
    with open(f"{model_name_write}.txt", "a", encoding="utf-8") as f:
        f.write(summary_line + "\n\n")
    print(summary_line)

# Models evaluated previously (kept for reference):
# huihui_ai/qwen2.5-abliterate:7b-instruct-q4_K_M
# qwen2.5:3b-instruct-q8_0
# qwen2.5:7b-instruct-q5_K_M
# deepseek-r1-7b:latest
# test_split(model_name="qwen2.5:3b-instruct-q8_0")
# test_split(model_name="qwen2.5:7b-instruct-q5_K_M")
# test_split(model_name="huihui_ai/qwen2.5-abliterate:7b-instruct-q4_K_M")
# test_split(model_name="qwen2.5:1.5b")
# test_split(model_name="qwen2.5:1.5b-instruct-fp16")
# test_split(model_name="qwen2.5:3b")
# test_split(model_name="gemma3:4b")
# test_split(model_name="qwen2.5:7b")
# test_split(model_name="gemma3:4b-it-q8_0")
# test_split(model_name="qwen2.5:0.5b-instruct-fp16")
# test_split(model_name="qwen2.5:0.5b")

if __name__ == "__main__":
    # Guard the entry point so importing this module does not immediately
    # launch a full (multi-hour) evaluation run.
    test_split(model_name="deepseek-r1:1.5b")
    # test_split(model_name="deepseek-r1:1.5b-qwen-distill-fp16")
    # test_split(model_name="deepseek-r1:7b")
相关推荐
程序趣谈4 小时前
算法随笔_74: 不同路径_1
数据结构·python·算法
朱剑君5 小时前
用Python打造AI玩家:挑战2048,谁与争锋
人工智能·python
JM丫6 小时前
python基础
笔记·python
天才测试猿7 小时前
接口自动化测试用例
自动化测试·软件测试·python·测试工具·测试用例·接口测试·postman
程序设计实验室7 小时前
DeepSeek+Claude强强联手,使用AI驱动DjangoStarter 3.1框架升级
python·django·djangostarter
千里码aicood7 小时前
【2025】基于python+django的驾校招生培训管理系统(源码、万字文档、图文修改、调试答疑)
开发语言·python·django
冷琴19967 小时前
基于python+django+vue.js开发的医院门诊管理系统/医疗管理系统源码+运行
vue.js·python·django
等风来不如迎风去9 小时前
【Pycharm】Pycharm无法复制粘贴,提示系统剪贴板不可用
ide·python·pycharm
郝YH是人间理想9 小时前
python多种数据类型输出为Excel文件
开发语言·python·pandas