from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import nltk
# Fetch the NLTK resources this module relies on (needed on the first run).
# NOTE(review): nothing visible in this file uses WordNet; kept for compatibility.
nltk.download('wordnet')
nltk.download('punkt')  # Punkt sentence models used by nltk.word_tokenize
# Recent NLTK releases (3.9+) load the tokenizer from 'punkt_tab' rather than
# the pickled 'punkt' package; without it word_tokenize raises LookupError.
nltk.download('punkt_tab')
def batch_bleu(references, candidates):
    """Compute the corpus-level BLEU-4 score for a batch of sentences.

    Args:
        references: List of lists of reference sentences (each item holds
            the reference sentences for one sample).
        candidates: List of candidate sentences (model outputs), aligned
            with ``references`` by position.

    Returns:
        float: The BLEU-4 score returned by ``corpus_bleu`` for the whole
        batch (a corpus-level score, not a mean of per-sentence scores).
        Returns 0.0 when both inputs are empty.
    """
    references = list(references)
    candidates = list(candidates)
    if not references and not candidates:
        # An empty corpus has no n-grams to score; answer directly instead of
        # depending on how the installed NLTK handles zero counts.
        return 0.0

    smoothing = SmoothingFunction()
    # Tokenize every reference sentence of every sample.
    tokenized_references = [
        [nltk.word_tokenize(sent) for sent in ref] for ref in references
    ]
    # Tokenize every candidate sentence.
    tokenized_candidates = [nltk.word_tokenize(sent) for sent in candidates]
    # Uniform weights over 1- to 4-grams give BLEU-4; method1 smoothing keeps
    # short sentences with missing higher-order n-grams from scoring zero.
    return corpus_bleu(
        tokenized_references,
        tokenized_candidates,
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=smoothing.method1,
    )
def batch_rouge(references, candidates):
    """Compute batch-averaged ROUGE-1, ROUGE-2 and ROUGE-L F1 scores.

    Each reference is scored against the candidate at the same position and
    the per-pair F-measures are averaged per metric.

    Args:
        references: List of reference sentences (one reference per sample).
        candidates: List of candidate sentences.

    Returns:
        dict: ``{'rouge1': f1, 'rouge2': f1, 'rougeL': f1}`` holding the mean
        F-measure of each metric. Every value is 0.0 when there are no
        reference/candidate pairs to score.
    """
    metric_names = ('rouge1', 'rouge2', 'rougeL')
    # zip() pairs items by position and stops at the shorter input.
    pairs = list(zip(references, candidates))
    if not pairs:
        # Guard the averaging step: an empty batch would otherwise evaluate
        # sum([]) / len([]) and raise ZeroDivisionError.
        return {name: 0.0 for name in metric_names}

    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)
    scores = {name: [] for name in metric_names}
    for ref, cand in pairs:
        # RougeScorer.score takes the reference (target) first.
        score = scorer.score(ref, cand)
        for name in metric_names:
            scores[name].append(score[name].fmeasure)
    return {name: sum(values) / len(values) for name, values in scores.items()}
def evaluate_all(references, candidates):
    """Run both the BLEU and the ROUGE batch evaluation in one call.

    Args:
        references: List of reference sentences (one per sample).
        candidates: List of candidate sentences.

    Returns:
        dict: Scores rounded to 4 decimal places under the keys
        ``'BLEU'``, ``'ROUGE-1'``, ``'ROUGE-2'`` and ``'ROUGE-L'``.
    """
    # batch_bleu expects a list of references per sample, so wrap each
    # single reference in its own one-element list.
    wrapped_references = [[ref] for ref in references]
    bleu_value = batch_bleu(wrapped_references, candidates)
    rouge_values = batch_rouge(references, candidates)

    report = {'BLEU': round(bleu_value, 4)}
    label_to_key = (
        ('ROUGE-1', 'rouge1'),
        ('ROUGE-2', 'rouge2'),
        ('ROUGE-L', 'rougeL'),
    )
    for label, key in label_to_key:
        report[label] = round(rouge_values[key], 4)
    return report
# Test code (Python example)
# Example data: each pair is (reference sentence, candidate sentence).
_example_pairs = (
    ("the cat is on the mat", "the cat sat on the mat"),
    ("a dog is playing in the garden", "a dog plays in the garden"),
)
references = [reference for reference, _ in _example_pairs]
candidates = [candidate for _, candidate in _example_pairs]

# Score the batch and print the resulting metrics.
results = evaluate_all(references, candidates)
print("Evaluation Results:", results)