以下是基于 ModelScope 的 bert-base-uncased 模型训练及使用样例，可直接运行。
python
import os
# Default to GPU 6, but keep any CUDA_VISIBLE_DEVICES value the caller already
# exported so the sample can be pointed at another device without editing the
# file. Kept ahead of the torch import so the setting is in place before CUDA
# is initialised.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "6")
from torch.utils.data import Dataset
import torch
from modelscope import AutoModelForSequenceClassification, AutoTokenizer
from modelscope.trainers import build_trainer
from modelscope.msdatasets import MsDataset
from modelscope.utils.hub import read_config
from modelscope.trainers import EpochBasedTrainer
from modelscope.metainfo import Trainers
from modelscope.utils.config import Config
from modelscope.utils.constant import ModeKeys
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import random
import numpy as np
import json
import tempfile
# Seed every random number generator the script touches (Python's `random`,
# PyTorch and NumPy) with the same value so repeated runs are reproducible.
seed = 42
for _seed_fn in (random.seed, torch.manual_seed, np.random.seed):
    _seed_fn(seed)
# Vocabulary used by generate_samples() to assemble synthetic sentences.
# Verbs and nouns fill the "I <verb> <noun>" template; adjectives fill the
# "This is <adjective>" template.
positive_verbs = [
    "love",
    "like",
    "admire",
    "adore",
    "enjoy",
    "appreciate",
]
negative_verbs = [
    "hate",
    "dislike",
    "despise",
    "loathe",
    "detest",
    "abhor",
]
positive_nouns = [
    "Hugging Face",
    "this product",
    "the service",
    "the app",
    "this book",
    "the movie",
    "the experience",
]
negative_nouns = [
    "this",
    "it",
    "the interface",
    "the design",
    "the concept",
    "the approach",
]
positive_adjectives = [
    "fantastic",
    "amazing",
    "wonderful",
    "excellent",
    "superb",
    "outstanding",
    "brilliant",
]
negative_adjectives = [
    "terrible",
    "awful",
    "horrible",
    "dreadful",
    "lousy",
    "poor",
]
def generate_samples():
    """Build a synthetic sentiment corpus from the module-level word lists.

    Produces 50 positive sentences (label 1) followed by 50 negative
    sentences (label 0). Each of the 25 rounds per class emits one
    "I <verb> <noun>" sentence and one "This is <adjective>" sentence;
    positive sentences end with "!" and negative ones with ".".

    Words are drawn with ``random.choice``, so the result depends on the
    current state of the global ``random`` generator.

    Returns:
        A ``(texts, labels)`` tuple of two parallel lists.
    """
    texts, labels = [], []

    # Positive class: 25 rounds x 2 templates = 50 samples.
    for _ in range(25):
        verb, noun = random.choice(positive_verbs), random.choice(positive_nouns)
        adj = random.choice(positive_adjectives)
        texts.extend((f"I {verb} {noun}!", f"This is {adj}!"))
        labels.extend((1, 1))

    # Negative class: 25 rounds x 2 templates = 50 samples.
    for _ in range(25):
        verb, noun = random.choice(negative_verbs), random.choice(negative_nouns)
        adj = random.choice(negative_adjectives)
        texts.extend((f"I {verb} {noun}.", f"This is {adj}."))
        labels.extend((0, 0))

    return texts, labels
# Hand-written seed examples (1 = positive, 0 = negative).
original_texts = [
    "I love Hugging Face!",
    "I hate this.",
    "This is fantastic!",
    "I dislike it.",
]
original_labels = [1, 0, 1, 0]

# Synthetic examples are appended after the hand-written ones.
new_texts, new_labels = generate_samples()
texts = [*original_texts, *new_texts]
labels = [*original_labels, *new_labels]

# Hold out 20% of the samples for validation, using the shared seed.
# NOTE(review): generate_samples() draws from very small word lists, so the
# same sentence can occur several times and may land in both splits; the
# validation accuracy is therefore an optimistic estimate.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=seed,
)
# Load the pretrained tokenizer from the ModelScope hub.
print("加载预训练的ModelScope tokenizer")
tokenizer = AutoTokenizer.from_pretrained('AI-ModelScope/bert-base-uncased')

# Encode both splits with identical settings (truncation and padding enabled,
# max_length of 128).
_encode_kwargs = {'truncation': True, 'padding': True, 'max_length': 128}
train_encodings = tokenizer(train_texts, **_encode_kwargs)
val_encodings = tokenizer(val_texts, **_encode_kwargs)
class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer output with sentiment labels.

    Args:
        encodings: Mapping that provides ``'input_ids'`` and
            ``'attention_mask'``, each indexable by sample position.
        labels: Sequence of class labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict has the keys ``'input_ids'``, ``'attention_mask'`` and
        ``'labels'``.
        """
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            # Class-index targets are expected to be int64; forcing the dtype
            # keeps that true even when labels arrive as bools, floats or
            # narrower integer types instead of plain Python ints.
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)
# Wrap the encoded splits and their labels in datasets for the trainer.
print("创建数据集")
train_dataset, val_dataset = (
    SentimentDataset(train_encodings, train_labels),
    SentimentDataset(val_encodings, val_labels),
)

# Load the pretrained ModelScope model with a two-class classification head.
print("加载预训练的ModelScope模型")
model = AutoModelForSequenceClassification.from_pretrained('AI-ModelScope/bert-base-uncased', num_labels=2)
def compute_metrics(outputs, dataloader):
    """Compute classification accuracy from model outputs and a dataloader.

    Args:
        outputs: Iterable of per-batch outputs; each element is indexed with
            ``'logits'`` and the class is taken as the argmax over axis 1.
        dataloader: Iterable of batches; each batch is indexed with
            ``'labels'`` to obtain the ground-truth classes.

    Returns:
        ``{'accuracy': <accuracy_score of labels vs. predictions>}``.

    Note:
        Predictions and labels are gathered in two separate passes, so this
        assumes ``dataloader`` yields batches in the same order that produced
        ``outputs`` (i.e. no shuffling) -- TODO confirm against the caller.
    """
    # Pass 1: predicted class per sample, flattened across batches.
    all_preds = [
        pred
        for batch_output in outputs
        for pred in np.argmax(batch_output['logits'].detach().cpu().numpy(), axis=1).tolist()
    ]

    # Pass 2: ground-truth class per sample, flattened across batches.
    all_labels = [
        label
        for batch in dataloader
        for label in batch['labels'].cpu().numpy().tolist()
    ]

    return {'accuracy': accuracy_score(all_labels, all_preds)}
# Training configuration; it is serialised to JSON further down and handed to
# the trainer as its config file.
cfg_dict = {
    # Task name for the configuration.
    'task': 'text-classification',
    'model': {
        'type': 'AutoModelForSequenceClassification',
        'model_name_or_path': 'AI-ModelScope/bert-base-uncased',
        'num_labels': 2,
    },
    'train': {
        'work_dir': './out_dirs',
        'dataloader': {
            'batch_size_per_gpu': 2,
            'workers_per_gpu': 1,
        },
        'optimizer': {
            'type': 'AdamW',
            'lr': 5e-5,
        },
        # Linear schedule from factor 1.0 down to 0.0 over 3 scheduler steps.
        'lr_scheduler': {
            'type': 'LinearLR',
            'start_factor': 1.0,
            'end_factor': 0.0,
            'total_iters': 3,
        },
        'hooks': [
            {'type': 'CheckpointHook', 'interval': 1},
            {'type': 'TextLoggerHook', 'interval': 10},
            {'type': 'EvaluationHook', 'interval': 1},
        ],
        'max_epochs': 3,
    },
    'evaluation': {
        'dataloader': {
            'batch_size_per_gpu': 2,
            'workers_per_gpu': 1,
        },
    },
}
# In-memory Config built straight from the dictionary.
cfg = Config(cfg_dict)

# The trainer below is given a config file path, so persist the same
# dictionary as JSON in the current working directory.
with open('config.json', 'w') as f:
    f.write(json.dumps(cfg_dict))

# Build the trainer around the already-loaded model and the two datasets.
# NOTE(review): confirm that the installed ModelScope version's
# EpochBasedTrainer actually consumes a `custom_eval_fn` keyword.
trainer = EpochBasedTrainer(
    model=model,
    cfg_file='config.json',
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    custom_eval_fn=compute_metrics,
)
# Run training.
print("训练开始")
trainer.train()
print("训练结束")

# Evaluate with the trainer's own evaluation dataset and report the accuracy.
eval_results = trainer.evaluate()
print(f"Accuracy: {eval_results['accuracy']:.4f}")

# Save the trained model and the tokenizer into the same directory so it can
# be reloaded as a unit.
for _artifact in (trainer.model, tokenizer):
    _artifact.save_pretrained("./sentiment_model")
# Reload the model and tokenizer from the directory written above.
model = AutoModelForSequenceClassification.from_pretrained("./sentiment_model")
tokenizer = AutoTokenizer.from_pretrained("./sentiment_model")

# Encode a couple of example sentences as PyTorch tensors.
example_texts = ["I love this!", "I hate it."]
inputs = tokenizer(example_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")

# Forward pass without gradient tracking; the predicted class is the index of
# the largest logit in each row.
with torch.no_grad():
    outputs = model(**inputs)
predicted_labels = outputs.logits.argmax(dim=1).tolist()

# Print one line per example (label 1 is reported as positive).
for text, label in zip(example_texts, predicted_labels):
    print(f"Text: {text} -- Predicted Label: {'positive' if label == 1 else 'negative'}")