https://github.com/modelscope/evalscope
司南(OpenCompass)是目前国内主流的(个人认为)评测框架,包含LLM的客观评测、VLM的客观评测(已迁移到VLMEvalKit)以及长文本评测。
Swift(阿里的)集成了司南,使得评测更为简单。
这里以本地部署的VLM的OpenAI格式接口为例:只需要配置好evalscope环境,然后参照下述代码运行即可。
python
# Copyright (c) Alibaba, Inc. and its affiliates.
"""
1. Installation
EvalScope: pip install evalscope[vlmeval]
2. Deploy judge model
3. Run eval task
"""
from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
from evalscope.run import run_task
from evalscope.summarizer import Summarizer
from evalscope.utils.logger import get_logger
# Module-level logger obtained from evalscope's get_logger helper.
logger = get_logger()
def run_swift_eval():
    """Run a VLMEvalKit-backend evaluation and log the summarized report.

    The task configuration is built in dict form, handed to ``run_task``,
    and then passed to ``Summarizer.get_report_from_cfg`` to collect the
    report list, which is written to the module logger.
    """
    # To inspect what the backend supports, the following can be printed:
    #   VLMEvalKitBackendManager.list_supported_models().keys()
    #   VLMEvalKitBackendManager.list_supported_datasets()

    # Model under evaluation, reached through an OpenAI-style endpoint.
    model_spec = {
        # Placeholder URL: substitute the real host and port of the service.
        'api_base': 'http://ip:端口/v1/chat/completions',
        'key': 'EMPTY',
        'name': 'CustomAPIModel',  # must not be changed
        'temperature': 0.0,
        'type': 'InternVL2-26B',  # model id of the deployed service
    }

    # Optional judge-model settings (left disabled, as in the original):
    #   'LOCAL_LLM': 'qwen2-7b-instruct'                                  # judge model
    #   'OPENAI_API_BASE': 'http://localhost:8866/v1/chat/completions'    # judge model api
    #   'OPENAI_API_KEY': 'EMPTY'
    eval_config = {
        # Dataset(s) to evaluate, e.g. 'COCO_VAL'; the original note says
        # they are downloaded to ~/MDH -- TODO confirm the actual location.
        'data': ["HallusionBench"],
        'limit': -1,  # all samples
        'mode': 'all',
        'model': [model_spec],
        'rerun': True,
        'work_dir': 'eval/vlm/output',  # output path
    }

    # Option 1: dict-format config.
    task_cfg = {
        'eval_backend': 'VLMEvalKit',
        'eval_config': eval_config,
    }
    # Option 2: a yaml file could be used instead, e.g.
    #   task_cfg = "examples/tasks/eval_vlm_swift.yaml"

    # Launch the evaluation.
    run_task(task_cfg=task_cfg)

    # [Optional] Collect the final report through the summarizer.
    logger.info('>> Start to get the report with summarizer ...')
    reports = Summarizer.get_report_from_cfg(task_cfg)
    logger.info(f'\n>> The report list: {reports}')
# Script entry point: run the evaluation only when executed directly.
if __name__ == '__main__':
    run_swift_eval()
很简洁的部署,结果示例:
powershell
"split","aAcc","fAcc","qAcc"
"Overall","55.47368421052632","31.213872832369944","30.54945054945055"
"VD","57.69881556683587","35.21739130434783","28.880866425992778"
"VS","51.81058495821726","23.275862068965516","33.146067415730336"
"VS_ocr","37.735849056603776","19.230769230769234","11.11111111111111"
"VD_illusion","54.166666666666664","37.096774193548384","20.833333333333336"
"VS_map","39.0625","22.727272727272727","15.625"
"VS_chart","46.92307692307692","17.5","36.84210526315789"
"VD_ocr","76.40449438202246","60.46511627906976","55.81395348837209"
"VD_video","50.0","10.416666666666668","10.144927536231885"
"VD_math","50.92592592592593","19.444444444444446","31.48148148148148"
"VS_table","71.42857142857143","35.714285714285715","53.48837209302325"
"VD_figure","68.75","48.78048780487805","43.58974358974359"