OpenCompass
一、环境配置
python
# 创建虚拟环境
conda create --name opencompass python=3.10 -y
# 激活虚拟环境
conda activate opencompass
# 下载最新版本的opencompass
pip install -U opencompass
# 下载推理后端
pip install "opencompass[vllm]"
# 下载modelscope
pip install modelscope
# 测试推理后端
vllm serve Qwen/Qwen2.5-0.5B-Instruct
二、评估
1)准备数据集
jsonl
{"question": "752+361+181+933+235+986=", "answer": "3448"}
{"question": "712+165+223+711=", "answer": "1811"}
{"question": "921+975+888+539=", "answer": "3323"}
{"question": "752+321+388+643+568+982+468+397=", "answer": "4519"}
2)准备模型
python
from modelscope import snapshot_download
snapshot_download("Qwen/Qwen2.5-7B-Instruct",cache_dir="/mnt/workspace/.cache/modelscope/hub")
3)准备评估器
注意:一定把此python文件放到opencompass.evaluator下,如果是pip安装需要自己找对应路径,如果是源码安装,则直接放到对应路径
python
import evaluate
import numpy as np
from opencompass.openicl import BaseEvaluator, JiebaRougeEvaluator
import openai
import os
# NOTE(review): 'xxx' are placeholders — set a real key and endpoint before use.
os.environ['OPENAI_API_KEY'] = 'xxx'
os.environ['OPENAI_BASE_URL'] = 'xxx'
# Place this file under the opencompass.evaluator directory.
# NOTE(review): this module must live under opencompass.evaluator so configs
# can import it as opencompass.evaluator.my_evaluator.
class MyEvaluator(BaseEvaluator):
    """Composite evaluator combining sacrebleu, ROUGE, jieba-ROUGE and an
    embedding-based cosine-similarity score.
    """

    # Local paths of the HuggingFace `evaluate` metric scripts shipped with
    # opencompass. TODO(review): adjust to your install/checkout location.
    _SACREBLEU_PATH = '/mnt/workspace/opencompass/opencompass/openicl/icl_evaluator/hf_metrics/sacrebleu.py'
    _ROUGE_PATH = '/mnt/workspace/opencompass/opencompass/openicl/icl_evaluator/hf_metrics/rouge.py'

    def __init__(self) -> None:
        super().__init__()

    def score(self, predictions, references):
        """Score *predictions* against *references*.

        Args:
            predictions: list of model outputs.
            references: list of gold answers, aligned with predictions.

        Returns:
            dict with the four metric results, or ``{'error': ...}`` when
            the two lists differ in length.
        """
        if len(predictions) != len(references):
            # Typo fix: the original message read 'preds and refrs'.
            return {'error': 'predictions and references have different length'}

        # sacrebleu
        metric = evaluate.load(self._SACREBLEU_PATH)
        sacrebleu_results = metric.compute(predictions=predictions, references=references)
        print(f"sacrebleu的results为{sacrebleu_results}")

        # rouge
        metric = evaluate.load(self._ROUGE_PATH)
        rouge_results = metric.compute(predictions=predictions, references=references)
        print(f"rouge的results为{rouge_results}")

        # jieba-tokenized ROUGE (better suited to Chinese text)
        jiebarouge_results = JiebaRougeEvaluator().score(predictions, references)
        print(f"jb_rouge的results为{jiebarouge_results}")

        # embedding cosine similarity (DashScope-compatible endpoint)
        sim_results = score_similarity_aliyun_2(predictions, references)
        print(f"sim_results的results为{sim_results}")

        return {
            'sacrebleu_results': sacrebleu_results,
            'rouge_results': rouge_results,
            'jiebarouge_results': jiebarouge_results,
            'sim_results': sim_results,
        }
def score_similarity(predictions, references):
    """Mean pairwise cosine similarity via OpenAI ada-002 embeddings.

    Embeds all predictions in one request and all references in another,
    then averages the cosine similarity of aligned pairs.

    Returns:
        ``{'scores': mean_similarity}``; 0.0 for empty input (the original
        raised ZeroDivisionError).
    """
    pred_embs = openai.embeddings.create(
        model="text-embedding-ada-002",
        input=predictions
    ).data
    ref_embs = openai.embeddings.create(
        model="text-embedding-ada-002",
        input=references
    ).data
    sum_similarity = 0
    count = 0
    for p, r in zip(pred_embs, ref_embs):
        sum_similarity += cosine_similarity(p.embedding, r.embedding)
        count += 1
    # Guard against empty input: the original divided by zero here.
    return {
        'scores': sum_similarity / count if count else 0.0
    }
def score_similarity_2(predictions, references):
    """Mean pairwise cosine similarity, one embedding request per pair.

    Each (prediction, reference) pair is embedded together in a single
    request, which keeps the two vectors from the same API call.

    Returns:
        ``{'scores': mean_similarity}``; 0.0 for empty input (the original
        raised ZeroDivisionError).
    """
    sum_similarity = 0
    count = 0
    for pred, ref in zip(predictions, references):
        # Embed the pair in one request.
        embedding_result = openai.embeddings.create(
            model="text-embedding-ada-002",
            input=[pred, ref]
        ).data
        similarity = cosine_similarity(embedding_result[0].embedding, embedding_result[1].embedding)
        sum_similarity += similarity
        count += 1
        print(f'第{count}条数据的相似度为{similarity}')
    # Guard against empty input: the original divided by zero here.
    return {
        'scores': sum_similarity / count if count else 0.0
    }
def score_similarity_aliyun(predictions, references):
    """Mean cosine similarity using Aliyun DashScope text-embedding-v3.

    Embeds all predictions in one request and all references in another,
    then averages the cosine similarity of aligned pairs.

    Returns:
        ``{'scores': mean_similarity}``; 0.0 for empty input (the original
        raised ZeroDivisionError).
    """
    from openai import OpenAI
    # NOTE(review): 'xxx' is a placeholder — supply a real DashScope key.
    client = OpenAI(
        api_key='xxx',
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
    )
    completion_pre = client.embeddings.create(
        model="text-embedding-v3",
        input=predictions,
        dimensions=1024,
        encoding_format="float"
    ).data
    completion_ref = client.embeddings.create(
        model="text-embedding-v3",
        input=references,
        dimensions=1024,
        encoding_format="float"
    ).data
    sum_similarity = 0
    count = 0
    for p, r in zip(completion_pre, completion_ref):
        similarity = cosine_similarity(p.embedding, r.embedding)
        sum_similarity += similarity
        count += 1
        print(f'第{count}条数据的相似度为{similarity}')
    # Guard against empty input: the original divided by zero here.
    return {
        'scores': sum_similarity / count if count else 0.0
    }
def score_similarity_aliyun_2(predictions, references):
    """Mean cosine similarity via DashScope, one embedding request per pair.

    Each (prediction, reference) pair is embedded together in a single
    request. This is the variant MyEvaluator.score actually calls.

    Returns:
        ``{'scores': mean_similarity}``; 0.0 for empty input (the original
        raised ZeroDivisionError).
    """
    from openai import OpenAI
    # NOTE(review): 'xxx' is a placeholder — supply a real DashScope key.
    client = OpenAI(
        api_key='xxx',
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
    )
    sum_similarity = 0
    count = 0
    for pred, ref in zip(predictions, references):
        # Embed the pair in one request.
        embedding_result = client.embeddings.create(
            model="text-embedding-v3",
            input=[pred, ref],
            dimensions=1024,
            encoding_format="float"
        ).data
        similarity = cosine_similarity(embedding_result[0].embedding, embedding_result[1].embedding)
        sum_similarity += similarity
        count += 1
        print(f'第{count}条数据的相似度为{similarity}')
    # Guard against empty input: the original divided by zero here.
    return {
        'scores': sum_similarity / count if count else 0.0
    }
def cosine_similarity(v1, v2):
    """Cosine similarity of two vectors.

    Args:
        v1, v2: equal-length numeric sequences (e.g. embedding vectors).

    Returns:
        float in [-1, 1]; 0.0 when either vector has zero norm (the
        original produced NaN via 0/0 there).
    """
    a = np.asarray(v1, dtype=float)
    b = np.asarray(v2, dtype=float)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        # Zero-norm guard: undefined similarity is reported as 0.0.
        return 0.0
    return float(np.dot(a, b) / denom)
4)准备评估配置文件
-
不带提示词
pythonfrom opencompass.models import VLLMwithChatTemplate from opencompass.openicl import GenInferencer, PromptTemplate, ZeroRetriever from opencompass.datasets import CustomDataset from opencompass.evaluator.my_evaluator import MyEvaluator models = [ dict( type=VLLMwithChatTemplate, abbr='qwen2.5-7b-instruct-vllm', path='/mnt/workspace/.cache/modelscope/hub/Qwen/Qwen2___5-7B-Instruct', model_kwargs=dict(tensor_parallel_size=1, max_model_len=20480), max_out_len=1024, batch_size=2, generation_kwargs=dict(temperature=0), run_cfg=dict(num_gpus=1), ) ] datasets = [ dict( abbr='data', eval_cfg=dict( evaluator=dict( type=MyEvaluator ), pred_role='BOT'), infer_cfg=dict( inferencer=dict( type=GenInferencer ), prompt_template=dict( template=dict( round=[ dict(prompt='{question}', role='HUMAN'), dict(prompt='{answer}', role='BOT'), ]), type=PromptTemplate), retriever=dict( type=ZeroRetriever )), path='/mnt/workspace/data.jsonl', reader_cfg=dict( input_columns=[ 'question', ], output_column='answer'), type=CustomDataset) ] work_dir = '/mnt/workspace/eval_project/outputs/qwen2_5_7b_instruct/'
-
带系统提示词
pythonfrom opencompass.models import VLLMwithChatTemplate from opencompass.openicl import GenInferencer, PromptTemplate, ZeroRetriever from opencompass.datasets import CustomDataset from opencompass.evaluator.my_evaluator import MyEvaluator models = [ dict( type=VLLMwithChatTemplate, abbr='qwen2.5-7b-instruct-vllm', path='/mnt/workspace/.cache/modelscope/hub/Qwen/Qwen2___5-7B-Instruct', model_kwargs=dict(tensor_parallel_size=1, max_model_len=20480), max_out_len=1024, batch_size=2, generation_kwargs=dict(temperature=0), run_cfg=dict(num_gpus=1), meta_template=dict( round=[ dict(role='HUMAN', api_role='HUMAN'), dict(role='BOT', api_role='BOT', generate=True) ], reserved_roles=[ dict(role='SYSTEM', api_role='SYSTEM'), ], ) ) ] datasets = [ dict( abbr='data', eval_cfg=dict( evaluator=dict( type=MyEvaluator ), pred_role='BOT'), infer_cfg=dict( inferencer=dict( type=GenInferencer ), prompt_template=dict( template=dict( begin=[ dict(role='SYSTEM', fallback_role='HUMAN', prompt='你的提示词'), ], round=[ dict(prompt='{question}', role='HUMAN'), dict(prompt='{answer}', role='BOT'), ]), type=PromptTemplate), retriever=dict( type=ZeroRetriever )), path='/mnt/workspace/data.jsonl', reader_cfg=dict( input_columns=[ 'question', ], output_column='answer'), type=CustomDataset) ] work_dir = '/mnt/workspace/eval_project/outputs/qwen2_5_7b_instruct_with_prompt/'
5)开始评估
python
opencompass 第四步评估配置文件.py --debug
6)日志添加
python
添加日志,可以看到具体推理时候的提示词对不对
/opt/conda/envs/opencompass/lib/python3.10/site-packages/opencompass/models/vllm_with_tf_above_v4_33.py下
120行
print(f'messages为:{messages}')
7)查看日志
shell
# 可以看到具体传入的聊天信息。
vim /mnt/workspace/outputs/default/20250210_jian/xxxx/logs/infer/qwen2.5-7b-instruct-vllm/data.out
# 其他日志也在对应文件下,自己找
8)工具
-
提示词可视化
shell
python tools/prompt_viewer.py 评估配置文件.py
9)小技巧
-
opencompass是先用gpu进行推理,然后用cpu进行评估
-
推理和评估可以分开运行,如果推理完成了,评估阶段报错,我们可以解决报错后,单独运行评估部分
shell
opencompass 评估配置文件.py -m eval -r --debug