OpenCompass使用

OpenCompass

一、环境配置

python 复制代码
# 创建虚拟环境
conda create --name opencompass python=3.10 -y

# 激活虚拟环境
conda activate opencompass

# 下载最新版本的opencompass
pip install -U opencompass

# 下载推理后端
pip install "opencompass[vllm]"

# 下载modelscope
pip install modelscope

# 测试推理后端
vllm serve Qwen/Qwen2.5-0.5B-Instruct

二、评估

1)准备数据集
jsonl 复制代码
{"question": "752+361+181+933+235+986=", "answer": "3448"}
{"question": "712+165+223+711=", "answer": "1811"}
{"question": "921+975+888+539=", "answer": "3323"}
{"question": "752+321+388+643+568+982+468+397=", "answer": "4519"}
2)准备模型
python 复制代码
from modelscope import snapshot_download

snapshot_download("Qwen/Qwen2.5-7B-Instruct",cache_dir="/mnt/workspace/.cache/modelscope/hub")
3)准备评估器

注意:一定把此python文件保存为 my_evaluator.py 并放到 opencompass/evaluator 目录下(与第4步配置文件中的 from opencompass.evaluator.my_evaluator import MyEvaluator 对应)。如果是pip安装需要自己找site-packages里的对应路径,如果是源码安装,则直接放到源码树的对应路径

python 复制代码
import evaluate
import numpy as np
from opencompass.openicl import BaseEvaluator, JiebaRougeEvaluator
import openai
import os

os.environ['OPENAI_API_KEY'] = 'xxx'
os.environ['OPENAI_BASE_URL'] = 'xxx'


# 放到opencompass.evaluator 目录下

class MyEvaluator(BaseEvaluator):
    """Composite evaluator reporting SacreBLEU, ROUGE, jieba-ROUGE and an
    embedding-based cosine-similarity score.

    NOTE(review): this file must live under opencompass/evaluator/ (as
    my_evaluator.py) so that configs can import
    ``opencompass.evaluator.my_evaluator.MyEvaluator``.
    """

    def __init__(self) -> None:
        super().__init__()

    def score(self, predictions, references):
        """Score *predictions* against *references*.

        Returns a dict with one entry per metric, or an ``{'error': ...}``
        dict when the two lists differ in length (matching the convention
        OpenCompass evaluators use for invalid input).
        """
        if len(predictions) != len(references):
            # Fixed garbled message (was: 'preds and refrs have different length').
            return {'error': 'predictions and references have different length'}
        # SacreBLEU, loaded from a local metric script to avoid network access.
        metric = evaluate.load('/mnt/workspace/opencompass/opencompass/openicl/icl_evaluator/hf_metrics/sacrebleu.py')
        sacrebleu_results = metric.compute(predictions=predictions, references=references)
        print(f"sacrebleu的results为{sacrebleu_results}")
        # ROUGE (same local-script loading scheme).
        metric = evaluate.load('/mnt/workspace/opencompass/opencompass/openicl/icl_evaluator/hf_metrics/rouge.py')
        rouge_results = metric.compute(predictions=predictions, references=references)
        print(f"rouge的results为{rouge_results}")

        # ROUGE with jieba tokenisation — needed for Chinese text, where
        # whitespace tokenisation would be meaningless.
        jb_rouge = JiebaRougeEvaluator()
        jiebarouge_results = jb_rouge.score(predictions, references)
        print(f"jb_rouge的results为{jiebarouge_results}")

        # Embedding cosine similarity via the Aliyun (DashScope)
        # OpenAI-compatible endpoint; one API request per pair.
        sim_results = score_similarity_aliyun_2(predictions, references)
        print(f"sim_results的results为{sim_results}")

        return {
            'sacrebleu_results': sacrebleu_results,
            'rouge_results': rouge_results,
            'jiebarouge_results': jiebarouge_results,
            'sim_results': sim_results
        }


def score_similarity(predictions, references):
    """Mean pairwise cosine similarity between OpenAI ada-002 embeddings.

    Embeds the whole *predictions* list and the whole *references* list in
    one request each, then averages the pairwise cosine similarities.

    Returns ``{'scores': <float>}``. Empty input returns ``{'scores': 0.0}``
    (the original made a needless API call and then raised ZeroDivisionError).
    """
    # Guard before any API traffic: empty lists previously still hit the
    # embeddings endpoint and crashed on the final division.
    if not predictions or not references:
        return {'scores': 0.0}

    pred_embeddings = openai.embeddings.create(
        model="text-embedding-ada-002",
        input=predictions
    ).data
    ref_embeddings = openai.embeddings.create(
        model="text-embedding-ada-002",
        input=references
    ).data

    sum_similarity = 0
    count = 0
    # Pair up prediction/reference embeddings; zip stops at the shorter list.
    for pred_item, ref_item in zip(pred_embeddings, ref_embeddings):
        similarity = cosine_similarity(pred_item.embedding, ref_item.embedding)
        sum_similarity += similarity
        count += 1

    return {
        'scores': sum_similarity / count
    }


def score_similarity_2(predictions, references):
    """Mean pairwise cosine similarity, embedding one (prediction, reference)
    pair per OpenAI ada-002 request.

    Slower than :func:`score_similarity` (one request per pair instead of
    two total) but keeps each pair in a single request. Returns
    ``{'scores': <float>}``; empty input returns ``{'scores': 0.0}``
    (the original raised ZeroDivisionError on the final average).
    """
    # Guard: with empty input the loop never runs and count stays 0,
    # which previously crashed the division below.
    if not predictions or not references:
        return {'scores': 0.0}

    sum_similarity = 0
    count = 0

    for pred_text, ref_text in zip(predictions, references):
        # Embed the pair together in a single request.
        embedding_result = openai.embeddings.create(
            model="text-embedding-ada-002",
            input=[pred_text, ref_text]
        ).data
        similarity = cosine_similarity(embedding_result[0].embedding, embedding_result[1].embedding)
        sum_similarity += similarity
        count += 1
        print(f'第{count}条数据的相似度为{similarity}')

    return {
        'scores': sum_similarity / count
    }


def score_similarity_aliyun(predictions, references):
    """Mean pairwise cosine similarity using Aliyun DashScope's
    OpenAI-compatible text-embedding-v3 endpoint (1024-dim float vectors).

    Embeds each full list in one request and averages the pairwise cosine
    similarities. Returns ``{'scores': <float>}``; empty input returns
    ``{'scores': 0.0}`` (the original still called the API and then raised
    ZeroDivisionError).
    """
    # Guard before creating the client so empty input makes no API calls.
    if not predictions or not references:
        return {'scores': 0.0}

    from openai import OpenAI
    client = OpenAI(
        # TODO(review): placeholder key — supply a real DashScope key,
        # preferably via an environment variable instead of hard-coding.
        api_key='xxx',
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
    )

    pred_embeddings = client.embeddings.create(
        model="text-embedding-v3",
        input=predictions,
        dimensions=1024,
        encoding_format="float"
    ).data
    ref_embeddings = client.embeddings.create(
        model="text-embedding-v3",
        input=references,
        dimensions=1024,
        encoding_format="float"
    ).data

    sum_similarity = 0
    count = 0
    for pred_item, ref_item in zip(pred_embeddings, ref_embeddings):
        similarity = cosine_similarity(pred_item.embedding, ref_item.embedding)
        sum_similarity += similarity
        count += 1
        print(f'第{count}条数据的相似度为{similarity}')

    return {
        'scores': sum_similarity / count
    }


def score_similarity_aliyun_2(predictions, references):
    """Mean pairwise cosine similarity via Aliyun DashScope
    text-embedding-v3, embedding one (prediction, reference) pair per
    request (this is the variant :class:`MyEvaluator` calls).

    Returns ``{'scores': <float>}``; empty input returns ``{'scores': 0.0}``
    (the original raised ZeroDivisionError on the final average).
    """
    # Guard before creating the client so empty input makes no API calls
    # and cannot divide by zero below.
    if not predictions or not references:
        return {'scores': 0.0}

    from openai import OpenAI
    client = OpenAI(
        # TODO(review): placeholder key — supply a real DashScope key,
        # preferably via an environment variable instead of hard-coding.
        api_key='xxx',
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
    )

    sum_similarity = 0
    count = 0

    for pred_text, ref_text in zip(predictions, references):
        # Embed the pair together in a single request.
        embedding_result = client.embeddings.create(
            model="text-embedding-v3",
            input=[pred_text, ref_text],
            dimensions=1024,
            encoding_format="float"
        ).data
        similarity = cosine_similarity(embedding_result[0].embedding, embedding_result[1].embedding)
        sum_similarity += similarity
        count += 1
        print(f'第{count}条数据的相似度为{similarity}')

    return {
        'scores': sum_similarity / count
    }


def cosine_similarity(v1, v2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Accepts any sequence convertible by ``np.asarray`` (lists, tuples,
    ndarrays). Returns a float in [-1, 1]; returns 0.0 when either vector
    has zero norm (the original divided by zero, yielding nan plus a
    runtime warning).
    """
    a = np.asarray(v1, dtype=float)
    b = np.asarray(v2, dtype=float)
    # Hoist the norm product so the zero case is handled once, explicitly.
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product
4)准备评估配置文件
  • 不带提示词

    python 复制代码
    # OpenCompass evaluation config (no system prompt).
    # Run with: opencompass <this_file>.py --debug
    from opencompass.models import VLLMwithChatTemplate

    from opencompass.openicl import GenInferencer, PromptTemplate, ZeroRetriever
    from opencompass.datasets import CustomDataset

    # Custom evaluator from step 3; the file must be saved as
    # opencompass/evaluator/my_evaluator.py for this import to resolve.
    from opencompass.evaluator.my_evaluator import MyEvaluator

    # Model under test: local Qwen2.5-7B-Instruct snapshot served through vLLM.
    models = [
        dict(
            type=VLLMwithChatTemplate,
            abbr='qwen2.5-7b-instruct-vllm',  # short name used in result tables and log paths
            path='/mnt/workspace/.cache/modelscope/hub/Qwen/Qwen2___5-7B-Instruct',
            model_kwargs=dict(tensor_parallel_size=1, max_model_len=20480),
            max_out_len=1024,  # max tokens generated per sample
            batch_size=2,
            generation_kwargs=dict(temperature=0),  # greedy decoding for reproducible scores
            run_cfg=dict(num_gpus=1),
        )
    ]

    # Dataset: the JSONL file from step 1 ({"question": ..., "answer": ...} per line).
    datasets = [
        dict(
            abbr='data',
            eval_cfg=dict(
                evaluator=dict(
                    type=MyEvaluator  # composite metrics defined in step 3
                ),
                pred_role='BOT'),  # score the BOT turn as the model's prediction
            infer_cfg=dict(
                inferencer=dict(
                    type=GenInferencer  # free-form generation (not multiple-choice)
                ),
                prompt_template=dict(
                    template=dict(
                        round=[
                            dict(prompt='{question}', role='HUMAN'),
                            dict(prompt='{answer}', role='BOT'),
                        ]),
                    type=PromptTemplate),
                retriever=dict(
                    type=ZeroRetriever  # zero-shot: no in-context examples retrieved
                )),
            path='/mnt/workspace/data.jsonl',
            reader_cfg=dict(
                input_columns=[
                    'question',
                ], output_column='answer'),
            type=CustomDataset)

    ]

    # All inference and evaluation outputs are written under this directory.
    work_dir = '/mnt/workspace/eval_project/outputs/qwen2_5_7b_instruct/'
  • 带系统提示词

    python 复制代码
    # OpenCompass evaluation config (with a system prompt).
    # Run with: opencompass <this_file>.py --debug
    from opencompass.models import VLLMwithChatTemplate

    from opencompass.openicl import GenInferencer, PromptTemplate, ZeroRetriever
    from opencompass.datasets import CustomDataset

    # Custom evaluator from step 3; the file must be saved as
    # opencompass/evaluator/my_evaluator.py for this import to resolve.
    from opencompass.evaluator.my_evaluator import MyEvaluator

    # Model under test: local Qwen2.5-7B-Instruct snapshot served through vLLM.
    models = [
        dict(
            type=VLLMwithChatTemplate,
            abbr='qwen2.5-7b-instruct-vllm',  # short name used in result tables and log paths
            path='/mnt/workspace/.cache/modelscope/hub/Qwen/Qwen2___5-7B-Instruct',
            model_kwargs=dict(tensor_parallel_size=1, max_model_len=20480),
            max_out_len=1024,  # max tokens generated per sample
            batch_size=2,
            generation_kwargs=dict(temperature=0),  # greedy decoding for reproducible scores
            run_cfg=dict(num_gpus=1),
            # meta_template maps dataset roles to API roles; reserving SYSTEM
            # is what allows the dataset's `begin` system message to be used.
            meta_template=dict(
                round=[
                    dict(role='HUMAN', api_role='HUMAN'),
                    dict(role='BOT', api_role='BOT', generate=True)  # BOT turn is generated
                ],
                reserved_roles=[
                    dict(role='SYSTEM', api_role='SYSTEM'),
                ],
            )
        )
    ]

    # Dataset: the JSONL file from step 1 ({"question": ..., "answer": ...} per line).
    datasets = [
        dict(
            abbr='data',
            eval_cfg=dict(
                evaluator=dict(
                    type=MyEvaluator  # composite metrics defined in step 3
                ),
                pred_role='BOT'),  # score the BOT turn as the model's prediction
            infer_cfg=dict(
                inferencer=dict(
                    type=GenInferencer  # free-form generation (not multiple-choice)
                ),
                prompt_template=dict(
                    template=dict(
                        # `begin` prepends the system prompt before every round;
                        # fallback_role is used if SYSTEM is not reserved above.
                        begin=[
                            dict(role='SYSTEM', fallback_role='HUMAN',
                                 prompt='你的提示词'),
                        ],
                        round=[
                            dict(prompt='{question}', role='HUMAN'),
                            dict(prompt='{answer}', role='BOT'),
                        ]),
                    type=PromptTemplate),
                retriever=dict(
                    type=ZeroRetriever  # zero-shot: no in-context examples retrieved
                )),
            path='/mnt/workspace/data.jsonl',
            reader_cfg=dict(
                input_columns=[
                    'question',
                ], output_column='answer'),
            type=CustomDataset)

    ]

    # All inference and evaluation outputs are written under this directory.
    work_dir = '/mnt/workspace/eval_project/outputs/qwen2_5_7b_instruct_with_prompt/'
5)开始评估
python 复制代码
opencompass 第四步评估配置文件.py  --debug
6)日志添加
python 复制代码
添加日志,可以看到具体推理时候的提示词对不对
/opt/conda/envs/opencompass/lib/python3.10/site-packages/opencompass/models/vllm_with_tf_above_v4_33.py下
120行
print(f'messages为:{messages}') 
7)查看日志
shell 复制代码
# 可以看到具体传入的聊天信息。
vim /mnt/workspace/outputs/default/20250210_jian/xxxx/logs/infer/qwen2.5-7b-instruct-vllm/data.out

# 其他日志也在对应文件下,自己找
8)工具
  • 提示词可视化

    shell 复制代码
    python tools/prompt_viewer.py 评估配置文件.py
9)小技巧
  • opencompass是先用gpu进行推理,然后用cpu进行评估

  • 推理和评估可以分开运行,如果推理完成了,评估阶段报错,我们可以解决报错后,单独运行评估部分

    shell 复制代码
    opencompass 评估配置文件.py -m eval -r --debug

相关推荐
小西学编程1 天前
如何使用deepseek编程,从 "Hello World" 开始
python·aigc·deepseek
LeeZhao@2 天前
【AIGC魔童】DeepSeek v3提示词Prompt书写技巧
人工智能·语言模型·自然语言处理·面试·prompt·aigc
悟空码字2 天前
一文玩转 DeepSeek,解锁 AI 高效应用新姿势
aigc·deepseek
North_D2 天前
ML.NET库学习004:ML.NET基础知识复盘
人工智能·深度学习·神经网络·目标检测·机器学习·数据挖掘·aigc
春末的南方城市3 天前
单张照片可生成写实3D头部模型!Adobe提出FaceLift,从单一的人脸图像中重建出360度的头部模型。
人工智能·机器学习·计算机视觉·3d·adobe·aigc
老丝瓜、3 天前
stable diffusion安装包与常用模型下载
ai作画·stable diffusion·aigc·ai绘画·ai模型
LittleNyima3 天前
【笔记】扩散模型(一〇):Dreambooth 理论与实现|主题驱动生成
人工智能·笔记·深度学习·aigc·扩散模型
爱研究的小牛3 天前
讯飞智作 AI 配音技术浅析(三):自然语言处理
人工智能·深度学习·机器学习·自然语言处理·aigc
敖天羽3 天前
瞎逼逼:2025 年还没有升级研发工具的你落伍了吗
aigc