A product team at my company needed to analyze product reviews. My manager asked whether I could automate the work with code to improve efficiency, and the BERT model I had only recently started learning finally came in handy.
1. Review Translation
Amazon product reviews arrive in many languages and need to be translated into Chinese. To save time at this stage we use DeepSeek's paid API for translation; training our own translation model is left for later.
The review data lives in MongoDB, and the translated results are written back to MongoDB as well.
python
import time
import pymongo
from openai import OpenAI

# DeepSeek system prompt
ss = """
[Role]
You are a senior localization expert in cross-border e-commerce, specializing in translating Amazon product reviews into Chinese.
[Core requirement]
Translation direction: foreign language -> Chinese only. Detect all non-Chinese content (including mixed-language text) and translate it accurately.
[Terminology]
Use official Amazon China terminology (e.g. "商品详情页", "Prime会员", "用户体验").
Keep star symbols (★) and rating numbers as in the original.
Use the platform's standard translations for category-specific vocabulary.
[Style]
Natural written Chinese.
Fully preserve the original sentiment (positive / neutral / negative tone).
Handle special sentence patterns: split long Western-style sentences into shorter Chinese clauses.
[Formatting]
Preserve the original paragraph and line-break structure; do not merge paragraphs.
[Rigor]
Keep numbers and units as-is (e.g. "6.5 inch" -> 6.5英寸).
Keep brand and model names in the original language.
Add a translator's note for uncertain slang.
[Output]
Return only the final translation, with no explanations or annotations.
"""

if __name__ == "__main__":
    today = time.strftime('%Y-%m-%d')
    mc = pymongo.MongoClient('mongodb://root:xxxxxxxx@xxx.xxx.xxx.xxx:27017/')
    # Translate with DeepSeek's OpenAI-compatible API
    client = OpenAI(api_key="xxxxxxxxxxxxxxxxxxxx", base_url="https://api.deepseek.com")
    # Fetch today's reviews that have not been translated yet
    reviews = list(mc['NLP']['reviews'].find(
        {'insertDate': today, 'translate': None},
        {'_id': 0, 'ilink': 1, 'rtitle': 1, 'rbody': 1, 'iclass': 1}
    ))
    for one in reviews:
        # Join the title and the non-empty body lines into one block of text
        rbody = (one['rtitle'].strip() + '.' + '.'.join([i.strip() for i in one['rbody'].split('\n') if i.strip()])).strip()
        print(rbody)
        # The prompt contains no format placeholder, so it is passed as-is
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[{"role": "system", "content": ss},
                      {"role": "user", "content": rbody}],
            stream=False
        )
        result = response.choices[0].message.content
        result = '\n'.join([i.strip() for i in result.split('\n') if i.strip()])
        print(result)
        # Write the translation back to MongoDB
        mc['NLP']['reviews'].update_one({'ilink': one['ilink']}, {'$set': {'translate': result, 'user': 'DeepSeek'}})
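The API request is a network call and occasionally fails or gets rate-limited, so in practice I would wrap it in a small retry loop. A minimal sketch (the `translate_one` helper and the backoff values are illustrative, not part of the production script):
python
import time

def translate_one(client, system_prompt, review_text, retries=3):
    """Call the chat API with simple exponential backoff (illustrative helper)."""
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=[{"role": "system", "content": system_prompt},
                          {"role": "user", "content": review_text}],
                stream=False
            )
            return response.choices[0].message.content
        except Exception:
            if attempt == retries - 1:
                raise
            time.sleep(2 ** attempt)  # back off: 1s, 2s, 4s, ...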
2. Model Selection
At first I planned to simply count the frequency of certain words in the reviews with TF-IDF, but the error rate turned out to be far too high: the same issue is phrased in many different ways across reviews.
I read two books, *Natural Language Processing with Python* and 《BERT基础教程:Transformer大模型实战》 ("BERT Basics: Hands-On Transformers"), and ended up using the approach from the second one.
I tried three models in turn: bert-base-chinese, chinese-bert-wwm, and chinese-bert-wwm-ext; chinese-bert-wwm-ext is the one currently in use.
Since DeepSeek already handles translation, why not use it for review analysis too? Because DeepSeek is a general-purpose large language model: its performance in this narrow domain fell short and could not meet the company's requirements.
3. Preparing the Data
trains.json (nearly 6,000 reviews were annotated in total; only a small sample is shown here)
json
[
{
"text": "外壳适用于Galaxy Tab S9FE+,但尺寸不合。外壳实在太紧。我担心如果强行将平板塞进去会损坏设备...可惜了,其他方面看起来不错,但因此决定退货。",
"labels": [
{
"entity": "尺寸不合。外壳实在太紧",
"type": "尺寸/贴合问题"
}
]
},
{
"text": "尺寸不合。按键无法正常使用。自我提醒:下次购买前一定要先看差评!!",
"labels": [
{
"entity": "尺寸不合",
"type": "尺寸/贴合问题"
},
{
"entity": "按键无法正常使用",
"type": "按键按压困难"
}
]
},
...
]
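Before converting the annotations to BIO tags, it is worth verifying that every annotated entity string actually occurs in its review text; otherwise the substring search in `to_bio` below will silently skip it. A small sanity check (assuming trains.json is in the working directory):
python
import json

with open('trains.json', 'r', encoding='utf-8') as fp:
    datas = json.load(fp)

# Report annotations whose entity string is not a substring of the review text
for idx, review in enumerate(datas):
    for entity in review['labels']:
        if entity['entity'].strip() not in review['text']:
            print(f"sample {idx}: entity not found in text -> {entity['entity']!r}")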
4. Adding Tokens
tokens.json
json
[
"ipad", "iphone", "magsafe", "pro", "apple",
"pencil", "pen", "galaxy", "max", "ultra", "air", "mini",
"tab", "s24", "plus", "s9", "s23", "pixel", "12.9", "2024",
"fe", "tpu", "prime", "flip", "s10", "id", "logo", "a9"
]
5. Writing the Code
(1) Import the required libraries
python
# -*- coding: utf-8 -*-
import json
import torch
import numpy as np
from functools import reduce
from datasets import Dataset
from sklearn.metrics import classification_report
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
(2) Load the tokenizer and add the new tokens
python
tokenizer = BertTokenizerFast.from_pretrained('chinese-bert-wwm-ext')
with open('tokens.json', 'r', encoding='utf-8-sig') as fp:
    brand_names = json.loads(fp.read())
# Add product/brand words so they are not split into subword pieces
tokenizer.add_tokens(brand_names)
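A quick way to see what `add_tokens` changes: tokenize a product word before and after the call. Before, a word like "magsafe" is split into WordPiece fragments (or [UNK]); afterwards it survives as a single token. An illustrative check (the exact pre-add split depends on the vocabulary):
python
# After add_tokens, product words from tokens.json come out as single tokens
print(tokenizer.tokenize("magsafe兼容性很好"))
# expected to contain 'magsafe' as one piece, e.g. ['magsafe', '兼', '容', '性', '很', '好']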
(3) Load and process the data
python
label_counts = Counter()
# 加载数据集
with open('trains.json', 'r', encoding='utf-8') as fp:
datas = json.loads(fp.read())
# 获取标注数据里的所有标签
label_list = ["O"]
label_ = sorted(list(set([i['type'].replace("/", "_") for i in reduce(lambda x, y: x + y, [i['labels'] for i in datas])])))
label_list += [f'B-{i}'for i in label_]
label_list += [f'I-{i}'for i in label_]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}
def to_bio(review): # 将中文评价和标注数据转换为BERT模型能读取的格式
text = review['text']
tokens = tokenizer.tokenize(text)
labels = ['O'] * len(tokens)
# 获取字符级偏移量
encoded = tokenizer.encode_plus(
text,
return_offsets_mapping=True,
add_special_tokens=False
)
# 更高效地构建字符到token的映射
char_to_token = {}
for token_idx, (char_start, char_end) in enumerate(encoded['offset_mapping']):
for char_pos in range(char_start, char_end):
char_to_token[char_pos] = token_idx
for entity in review['labels']:
entity_text = entity['entity'].strip()
start = 0
while True:
# 查找所有匹配的实体
start = text.find(entity_text, start)
if start == -1:
break
end = start + len(entity_text)
# 找到对应的token索引
token_indices = []
for char_pos in range(start, end):
if char_pos in char_to_token:
token_idx = char_to_token[char_pos]
if not token_indices or token_idx != token_indices[-1]:
token_indices.append(token_idx)
if token_indices:
labels[token_indices[0]] = f'B-{entity["type"].replace("/", "_")}'
for idx in token_indices[1:]:
labels[idx] = f'I-{entity["type"].replace("/", "_")}'
start = end # 继续查找下一个匹配
return {'tokens': tokens, 'labels': labels}
bio_data = [to_bio(i) for i in datas]
bio_data = [i for i in bio_data if len(i['tokens']) <= 256]
# 转换为Hugging Face Dataset格式
dataset = Dataset.from_dict({
'tokens': [d['tokens'] for d in bio_data],
'labels': [d['labels'] for d in bio_data]
})
# 数据编码语对齐
def tokenize_and_align(examples):
tokenized = tokenizer(
examples['tokens'],
truncation=True,
padding='max_length',
max_length=256,
is_split_into_words=True
)
labels = []
for i, label in enumerate(examples['labels']):
word_ids = tokenized.word_ids(batch_index=i)
label_ids = []
for word_idx in word_ids:
if word_idx is None:
label_ids.append(-100)
else:
label_ids.append(label2id[label[word_idx]])
labels.append(label_ids)
tokenized['labels'] = labels
return tokenized
# 最终处理好的数据
tokenized_dataset = dataset.map(tokenize_and_align, batched=True)
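To sanity-check the whole pipeline, it helps to decode one processed sample and print each non-O token next to its label (this assumes the variables defined above are in scope):
python
sample = tokenized_dataset[0]
tokens = tokenizer.convert_ids_to_tokens(sample['input_ids'])
for token, label_id in zip(tokens, sample['labels']):
    if label_id != -100 and label_id != label2id['O']:
        print(token, id2label[label_id])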
(4) Load the model
python
# Initialize the model with one output class per BIO label
model = BertForTokenClassification.from_pretrained(
    'chinese-bert-wwm-ext',
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)
model.resize_token_embeddings(len(tokenizer))  # grow the embedding matrix for the newly added tokens
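A quick check that the embedding matrix actually grew: the number of embedding rows should now equal `len(tokenizer)`, i.e. the base vocabulary plus the entries from tokens.json.
python
print(len(tokenizer))                             # vocab size including added tokens
print(model.get_input_embeddings().weight.shape)  # row count should match len(tokenizer)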
(5) Set the training arguments
python
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=250,
    eval_strategy='epoch',
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=False,
    max_grad_norm=1.0
)
(6) Define the evaluation metrics
python
# Token-level evaluation metrics
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_labels = []
    true_preds = []
    for pred_row, label_row in zip(predictions, labels):
        for pred_id, label_id in zip(pred_row, label_row):
            if label_id != -100:  # skip special tokens and padding
                true_labels.append(label_id)
                true_preds.append(pred_id)
    # Only report labels that actually occur in the evaluation set
    unique_labels = set(true_labels)
    valid_labels = [i for i in range(len(label_list)) if i in unique_labels]
    target_names = [label_list[i] for i in valid_labels]
    report = classification_report(
        true_labels,
        true_preds,
        labels=valid_labels,
        target_names=target_names,
        output_dict=True,
        zero_division=0
    )
    return {
        "precision": report["macro avg"]["precision"],
        "recall": report["macro avg"]["recall"],
        "f1": report["macro avg"]["f1-score"],
    }
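Note that these metrics are token-level: a prediction earns credit even when it covers only part of an entity. For stricter entity-level scores, the seqeval package is the usual choice. A minimal sketch (seqeval is an extra dependency, not used in the original script; the tag sequences are made up for illustration):
python
from seqeval.metrics import classification_report as seq_report

# seqeval expects lists of BIO tag strings, one inner list per sentence
y_true = [["B-尺寸_贴合问题", "I-尺寸_贴合问题", "O"]]
y_pred = [["B-尺寸_贴合问题", "O", "O"]]
print(seq_report(y_true, y_pred))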
(7) Train
python
# Launch training (note: evaluation runs on the training set itself here,
# so the reported metrics are optimistic)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    compute_metrics=compute_metrics
)
trainer.train()
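Because evaluation above runs on the training data, the logged metrics overstate real performance. For an honest estimate of generalization a held-out split is preferable; a minimal sketch using `datasets`' built-in `train_test_split` (the 10% ratio and the seed are arbitrary choices, not from the original setup):
python
split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split['train'],
    eval_dataset=split['test'],
    compute_metrics=compute_metrics
)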
(8) Save the model
python
# Save the fine-tuned model and tokenizer
model.save_pretrained("./chinese-bert-wwm-ext-v1")
tokenizer.save_pretrained("./chinese-bert-wwm-ext-v1")
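Once saved, the model can be loaded back for inference. A minimal sketch using the transformers pipeline with entity aggregation (the sample sentence and the printed output are illustrative):
python
from transformers import pipeline

ner = pipeline(
    "token-classification",
    model="./chinese-bert-wwm-ext-v1",
    tokenizer="./chinese-bert-wwm-ext-v1",
    aggregation_strategy="simple"  # merge B-/I- pieces into whole entities
)
print(ner("尺寸不合,按键无法正常使用。"))
# -> e.g. [{'entity_group': '尺寸_贴合问题', 'word': '尺寸不合', ...}, ...]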
6. Training Log
trainer_state.json (only partially shown)
json
{
"best_metric": 0.9351207045695515,
"best_model_checkpoint": "./results\\checkpoint-7420",
"epoch": 10.0,
"eval_steps": 500,
"global_step": 7420,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
...
{
"epoch": 10.0,
"eval_f1": 0.9351207045695515,
"eval_loss": 0.046907734125852585,
"eval_precision": 0.9431665171223371,
"eval_recall": 0.9369447886021766,
"eval_runtime": 1690.0627,
"eval_samples_per_second": 3.51,
"eval_steps_per_second": 0.439,
"step": 7420
}
],
"logging_steps": 250,
"max_steps": 7420,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7756858384773120.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}