【AI-ModelScope/bert-base-uncased】模型训练及使用

如下是基于 ModelScope 进行的 bert-base-uncased 模型训练及使用样例，可直接运行。
python 复制代码
import os

# Default to GPU index 6, but respect a device selection the caller already
# exported (e.g. `CUDA_VISIBLE_DEVICES=0 python script.py`). This must run
# before `torch` is imported below so the value is in place when CUDA is
# first initialised.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "6")

from torch.utils.data import Dataset
import torch
from modelscope import AutoModelForSequenceClassification, AutoTokenizer
from modelscope.trainers import build_trainer
from modelscope.msdatasets import MsDataset
from modelscope.utils.hub import read_config
from modelscope.trainers import EpochBasedTrainer
from modelscope.metainfo import Trainers
from modelscope.utils.config import Config
from modelscope.utils.constant import ModeKeys
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import random
import numpy as np
import json
import tempfile

# Fix the random seed for Python's `random`, PyTorch and NumPy. The same
# value is reused later as `random_state` for the train/validation split.
seed = 42
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)



# Word banks used to fill the sentence templates in generate_samples().
positive_verbs = ["love", "like", "admire", "adore", "enjoy", "appreciate"]
negative_verbs = ["hate", "dislike", "despise", "loathe", "detest", "abhor"]
positive_nouns = ["Hugging Face", "this product", "the service", "the app", "this book", "the movie", "the experience"]
negative_nouns = ["this", "it", "the interface", "the design", "the concept", "the approach"]
positive_adjectives = ["fantastic", "amazing", "wonderful", "excellent", "superb", "outstanding", "brilliant"]
negative_adjectives = ["terrible", "awful", "horrible", "dreadful", "lousy", "poor"]


def generate_samples(pairs_per_class: int = 25):
    """Build a synthetic, class-balanced sentiment corpus from templates.

    For each class, ``pairs_per_class`` rounds are run. Every round emits one
    verb-based sentence (``"I <verb> <noun>"``) followed by one
    adjective-based sentence (``"This is <adjective>"``). Positive sentences
    end with ``"!"`` and are labelled 1; negative sentences end with ``"."``
    and are labelled 0. All positive samples come first, then all negative
    ones.

    Words are drawn with ``random.choice``, so the result depends on the
    global ``random`` state and may contain duplicate sentences.

    Args:
        pairs_per_class: Number of (verb sentence, adjective sentence) pairs
            generated per class. The default of 25 yields 50 positive and
            50 negative samples.

    Returns:
        A ``(texts, labels)`` tuple of two equal-length lists.

    Raises:
        ValueError: If ``pairs_per_class`` is negative.
    """
    if pairs_per_class < 0:
        raise ValueError(
            f"pairs_per_class must be non-negative, got {pairs_per_class}"
        )

    # One row per class: word banks, sentence terminator, label.
    class_specs = (
        (positive_verbs, positive_nouns, positive_adjectives, "!", 1),
        (negative_verbs, negative_nouns, negative_adjectives, ".", 0),
    )

    texts = []
    labels = []
    for verbs, nouns, adjectives, terminator, label in class_specs:
        for _ in range(pairs_per_class):
            # The draw order (verb, noun, adjective) is kept fixed so that a
            # given seed keeps producing the same corpus.
            verb = random.choice(verbs)
            noun = random.choice(nouns)
            adjective = random.choice(adjectives)
            texts.append(f"I {verb} {noun}{terminator}")
            texts.append(f"This is {adjective}{terminator}")
            labels.extend((label, label))

    return texts, labels

# Hand-written seed examples.
original_texts = [
    "I love Hugging Face!",
    "I hate this.",
    "This is fantastic!",
    "I dislike it.",
]
original_labels = [1, 0, 1, 0]

# Synthesize additional examples from the sentence templates.
new_texts, new_labels = generate_samples()

# Full corpus: the seed examples first, followed by the generated ones.
texts = [*original_texts, *new_texts]
labels = [*original_labels, *new_labels]


# Hold out 20% of the corpus for validation, using the shared seed.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    random_state=seed,
    test_size=0.2,
)

# Fetch the tokenizer that belongs to the pretrained checkpoint.
print("加载预训练的ModelScope tokenizer")
tokenizer = AutoTokenizer.from_pretrained('AI-ModelScope/bert-base-uncased')

# Both splits are tokenized with identical settings: truncation and padding
# enabled, with a maximum length of 128.
_encode_kwargs = {'truncation': True, 'padding': True, 'max_length': 128}
train_encodings = tokenizer(train_texts, **_encode_kwargs)
val_encodings = tokenizer(val_texts, **_encode_kwargs)

class SentimentDataset(Dataset):
    """Dataset that serves tokenized texts together with their labels.

    Args:
        encodings: Mapping with ``'input_ids'`` and ``'attention_mask'``
            entries, each indexable by sample position.
        labels: Sequence of labels, indexable by sample position. Its
            length defines the length of the dataset.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert the selected row of each encoding field to a tensor, then
        # attach the matching label under the 'labels' key.
        sample = {
            field: torch.tensor(self.encodings[field][idx])
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits and their labels as datasets.
print("创建数据集")
train_dataset = SentimentDataset(encodings=train_encodings, labels=train_labels)
val_dataset = SentimentDataset(encodings=val_encodings, labels=val_labels)

# Load the pretrained checkpoint with two output labels (binary task).
print("加载预训练的ModelScope模型")
model = AutoModelForSequenceClassification.from_pretrained(
    'AI-ModelScope/bert-base-uncased', num_labels=2
)

# 修改评估函数实现
def compute_metrics(outputs, dataloader):
    """Compute classification accuracy from batched model outputs.

    Args:
        outputs: Iterable of per-batch results. Each one is indexed with
            ``'logits'`` and must yield a tensor whose axis 1 holds the
            per-class scores.
        dataloader: Iterable of batches. Each one is indexed with
            ``'labels'`` to obtain the ground-truth tensor.

    Returns:
        A dict with a single ``'accuracy'`` entry, as computed by
        ``accuracy_score``.
    """
    # NOTE(review): predictions and labels are paired purely by position, so
    # this assumes `dataloader` replays batches in the same order that
    # produced `outputs` — confirm against the caller.
    predictions = []
    for result in outputs:
        scores = result['logits'].detach().cpu().numpy()
        predictions += np.argmax(scores, axis=1).tolist()

    targets = []
    for batch in dataloader:
        targets += batch['labels'].cpu().numpy().tolist()

    return {'accuracy': accuracy_score(targets, predictions)}




# Training/evaluation configuration. It is serialised to config.json further
# down and handed to the trainer through `cfg_file`.
# NOTE(review): key names and their semantics follow ModelScope's config
# schema, which is not visible in this file — confirm against its docs.
cfg_dict = {
    "task": "text-classification",  # entry added explicitly, per the original author's note
    'model': {
        'type': 'AutoModelForSequenceClassification',
        'model_name_or_path': 'AI-ModelScope/bert-base-uncased',
        'num_labels': 2
    },
    'train': {
        'work_dir': './out_dirs',
        'dataloader': {
            'batch_size_per_gpu': 2,
            'workers_per_gpu': 1
        },
        'optimizer': {
            'type': 'AdamW',
            'lr': 5e-5
        },
        # NOTE(review): total_iters equals max_epochs (3); presumably the
        # schedule is meant to be stepped once per epoch — verify.
        'lr_scheduler': {
            'type': 'LinearLR',
            'start_factor': 1.0,
            'end_factor': 0.0,
            'total_iters': 3
        },
        'hooks': [
            {'type': 'CheckpointHook', 'interval': 1},
            {'type': 'TextLoggerHook', 'interval': 10},
            {'type': 'EvaluationHook', 'interval': 1}
        ],
        'max_epochs': 3
    },
    'evaluation': {
        'dataloader': {
            'batch_size_per_gpu': 2,
            'workers_per_gpu': 1
        }
    }
}

# Build a Config object directly from the dictionary.
# NOTE(review): `cfg` is not referenced again in the visible code; the
# trainer below is configured through the JSON file instead.
cfg = Config(cfg_dict)

# Write the configuration to config.json in the current working directory.
# Despite the original "temporary file" wording, this is a regular file: it
# overwrites any existing config.json and is not removed afterwards.
with open('config.json', 'w') as f:
    json.dump(cfg_dict, f)

# Create the trainer from the in-memory model, the config file written above
# and the train/validation datasets.
trainer = EpochBasedTrainer(
    model=model,
    cfg_file='config.json',
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    custom_eval_fn=compute_metrics  # the function object itself is passed in
)


# Run training; the prints mark the start and end of the run.
print("训练开始")
trainer.train()
print("训练结束")

# Evaluate the model.
# Alternative kept from the original author: trainer.evaluate(val_dataset)
# NOTE(review): assumes evaluate() returns a mapping with an 'accuracy' key
# (the key produced by compute_metrics) — confirm.
eval_results = trainer.evaluate()
print(f"Accuracy: {eval_results['accuracy']:.4f}")

# Save the trained model and the tokenizer side by side in one directory.
trainer.model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")

# Reload the model and tokenizer from the directory they were saved to.
model = AutoModelForSequenceClassification.from_pretrained("./sentiment_model")
tokenizer = AutoTokenizer.from_pretrained("./sentiment_model")

# Tokenize a couple of example sentences as PyTorch tensors.
example_texts = ["I love this!", "I hate it."]
inputs = tokenizer(
    example_texts,
    return_tensors="pt",
    max_length=128,
    truncation=True,
    padding=True,
)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs)
    predicted_labels = outputs.logits.argmax(dim=1).tolist()

# Report each prediction: label 1 is shown as positive, anything else as
# negative.
for text, label in zip(example_texts, predicted_labels):
    sentiment = 'positive' if label == 1 else 'negative'
    print(f"Text: {text} -- Predicted Label: {sentiment}")