【AI-ModelScope/bert-base-uncased】模型训练及使用

如下是基于 ModelScope 的 bert-base-uncased 模型训练及使用样例，可直接运行。

Python 代码：
import os
# Default to GPU 6, but keep any CUDA_VISIBLE_DEVICES value the caller has
# already exported so the sample can be pointed at another device without
# editing the file. This runs before torch is imported below, so the setting
# is in place by the time CUDA is initialised.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "6")

from torch.utils.data import Dataset
import torch
from modelscope import AutoModelForSequenceClassification, AutoTokenizer
from modelscope.trainers import build_trainer
from modelscope.msdatasets import MsDataset
from modelscope.utils.hub import read_config
from modelscope.trainers import EpochBasedTrainer
from modelscope.metainfo import Trainers
from modelscope.utils.config import Config
from modelscope.utils.constant import ModeKeys
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import random
import numpy as np
import json
import tempfile

# Use one fixed seed for NumPy, PyTorch and Python's `random` module so the
# randomly generated sentences and the train/validation split are repeatable.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
random.seed(seed)



# Vocabulary pools used to synthesise training sentences.
# Verbs and nouns fill the "I <verb> <noun>" template; adjectives fill the
# "This is <adjective>" template.
positive_verbs = [
    "love",
    "like",
    "admire",
    "adore",
    "enjoy",
    "appreciate",
]
negative_verbs = [
    "hate",
    "dislike",
    "despise",
    "loathe",
    "detest",
    "abhor",
]
positive_nouns = [
    "Hugging Face",
    "this product",
    "the service",
    "the app",
    "this book",
    "the movie",
    "the experience",
]
negative_nouns = [
    "this",
    "it",
    "the interface",
    "the design",
    "the concept",
    "the approach",
]
positive_adjectives = [
    "fantastic",
    "amazing",
    "wonderful",
    "excellent",
    "superb",
    "outstanding",
    "brilliant",
]
negative_adjectives = [
    "terrible",
    "awful",
    "horrible",
    "dreadful",
    "lousy",
    "poor",
]

# Synthesise extra training samples from the vocabulary pools
def generate_samples():
    """Build 100 synthetic sentiment sentences from the module-level word pools.

    The first 50 sentences are positive (label 1) and end with "!"; the last
    50 are negative (label 0) and end with ".". Within each half, sentences
    alternate between the "I <verb> <noun>" and "This is <adjective>" forms.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists.
    """
    texts = []

    # 25 rounds x 2 templates = 50 positive sentences
    for _ in range(25):
        verb_sentence = f"I {random.choice(positive_verbs)} {random.choice(positive_nouns)}!"
        adjective_sentence = f"This is {random.choice(positive_adjectives)}!"
        texts.extend((verb_sentence, adjective_sentence))
    positive_total = len(texts)

    # 25 rounds x 2 templates = 50 negative sentences
    for _ in range(25):
        verb_sentence = f"I {random.choice(negative_verbs)} {random.choice(negative_nouns)}."
        adjective_sentence = f"This is {random.choice(negative_adjectives)}."
        texts.extend((verb_sentence, adjective_sentence))
    negative_total = len(texts) - positive_total

    labels = [1] * positive_total + [0] * negative_total
    return texts, labels

# Hand-written seed examples (label 1 is printed as "positive" at the end of
# the script, any other label as "negative")
original_texts = [
    "I love Hugging Face!",
    "I hate this.",
    "This is fantastic!",
    "I dislike it.",
]
original_labels = [1, 0, 1, 0]

# Synthesised examples
new_texts, new_labels = generate_samples()

# Full corpus: hand-written examples first, synthesised ones after
texts = [*original_texts, *new_texts]
labels = [*original_labels, *new_labels]

# Hold out 20% of the corpus for validation; the split is seeded
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    random_state=seed,
    test_size=0.2,
)

# Load the pretrained tokenizer through ModelScope
print("加载预训练的ModelScope tokenizer")
tokenizer = AutoTokenizer.from_pretrained('AI-ModelScope/bert-base-uncased')

# Encode the training split first, then the validation split, with identical
# settings: truncate to at most 128 tokens and pad within each split
train_encodings, val_encodings = (
    tokenizer(split_texts, truncation=True, padding=True, max_length=128)
    for split_texts in (train_texts, val_texts)
)

class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``encodings`` must support ``encodings[key][idx]`` for the keys
    ``'input_ids'`` and ``'attention_mask'``; ``labels`` is an indexable
    sequence with one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines the number of samples
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors: input_ids, attention_mask, labels."""
        sample = {
            field: torch.tensor(self.encodings[field][idx])
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the encoded splits and their labels in SentimentDataset objects
print("创建数据集")
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

# Load the pretrained sequence-classification model through ModelScope
print("加载预训练的ModelScope模型")
model = AutoModelForSequenceClassification.from_pretrained(
    'AI-ModelScope/bert-base-uncased',
    num_labels=2  # binary classification task
)

# Custom evaluation function handed to the trainer
def compute_metrics(outputs, dataloader):
    """Compute classification accuracy from batched model outputs.

    Args:
        outputs: Iterable of per-batch mappings, each holding a ``'logits'``
            tensor; the predicted class is the argmax along axis 1.
        dataloader: Iterable of batches, each holding a ``'labels'`` tensor.

    Returns:
        ``{'accuracy': <score>}`` as computed by ``accuracy_score``.

    Note:
        Predictions and labels are flattened and paired by position, so
        ``dataloader`` must yield samples in the same order as ``outputs``.
    """
    # Flatten the per-batch predictions into one list
    all_preds = [
        pred
        for batch_output in outputs
        for pred in np.argmax(
            batch_output['logits'].detach().cpu().numpy(), axis=1
        ).tolist()
    ]

    # Flatten the ground-truth labels the same way
    all_labels = [
        label
        for batch in dataloader
        for label in batch['labels'].cpu().numpy().tolist()
    ]

    return {'accuracy': accuracy_score(all_labels, all_preds)}




# Trainer configuration: task name, model section, training settings
# (work dir, dataloader, optimizer, LR scheduler, hooks, epochs) and the
# evaluation dataloader. Key order is preserved because the dictionary is
# later serialised to JSON.
cfg_dict = dict(
    task="text-classification",
    model=dict(
        type='AutoModelForSequenceClassification',
        model_name_or_path='AI-ModelScope/bert-base-uncased',
        num_labels=2,
    ),
    train=dict(
        work_dir='./out_dirs',
        dataloader=dict(batch_size_per_gpu=2, workers_per_gpu=1),
        optimizer=dict(type='AdamW', lr=5e-5),
        lr_scheduler=dict(
            type='LinearLR',
            start_factor=1.0,
            end_factor=0.0,
            total_iters=3,
        ),
        hooks=[
            dict(type='CheckpointHook', interval=1),
            dict(type='TextLoggerHook', interval=10),
            dict(type='EvaluationHook', interval=1),
        ],
        max_epochs=3,
    ),
    evaluation=dict(
        dataloader=dict(batch_size_per_gpu=2, workers_per_gpu=1),
    ),
)

# Build a Config object directly from the dictionary
cfg = Config(cfg_dict)

# Serialise the same dictionary to config.json in the current working
# directory; the trainer below is given this path. The file is not removed
# afterwards.
config_path = 'config.json'
with open(config_path, 'w') as f:
    f.write(json.dumps(cfg_dict))

# Create the trainer from the in-memory model, the config file, both dataset
# splits and the custom evaluation function
trainer = EpochBasedTrainer(
    model=model,
    cfg_file=config_path,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    custom_eval_fn=compute_metrics,
)


# Run training
print("训练开始")
trainer.train()
print("训练结束")

# Evaluate the model; called without arguments, so presumably on the
# eval_dataset given to the trainer (verify against the trainer API).
# The result is expected to contain an 'accuracy' entry.
eval_results = trainer.evaluate()
print(f"Accuracy: {eval_results['accuracy']:.4f}")

# Save the fine-tuned model and the tokenizer to the same directory
trainer.model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")

# Reload the model and tokenizer from the directory they were just saved to
model = AutoModelForSequenceClassification.from_pretrained("./sentiment_model")
tokenizer = AutoTokenizer.from_pretrained("./sentiment_model")

# Run predictions on a few example texts
example_texts = ["I love this!", "I hate it."]
inputs = tokenizer(
    example_texts,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# Inference only: gradients are disabled, and the predicted class is the
# argmax over the logits along dim 1
with torch.no_grad():
    outputs = model(**inputs)
    predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

# Print the predictions (label 1 is shown as positive, anything else as negative)
for text, label in zip(example_texts, predicted_labels):
    print(f"Text: {text} -- Predicted Label: {'positive' if label == 1 else 'negative'}")
相关推荐
zskj_zhyl2 分钟前
银发经济时代:科技赋能养老,温情守护晚年,让老人不再孤独无助
大数据·人工智能·科技·生活
Qforepost3 分钟前
智汇河套,量子“风暴”:量子科技未来产业发展论坛深度研讨加速产业成果转化
人工智能·量子计算·量子
coding者在努力5 分钟前
从零开始:用PyTorch实现线性回归模型
人工智能·pytorch·线性回归
Giser探索家11 分钟前
低空智航平台技术架构深度解析:如何用AI +空域网格破解黑飞与安全管控难题
大数据·服务器·前端·数据库·人工智能·安全·架构
静心问道11 分钟前
CacheBlend:结合缓存知识融合的快速RAG大语言模型推理服务
人工智能·语言模型·模型加速
云卓SKYDROID14 分钟前
无人机智能返航模块技术分析
人工智能·数码相机·无人机·高科技·云卓科技
独行soc40 分钟前
2025年大模型安全岗的面试汇总(题目+回答)
android·人工智能·安全·面试·职场和发展·渗透测试
CONDIMENTTTT1 小时前
[机器学习]07-基于多层感知机的鸢尾花数据集分类
人工智能·机器学习
数据知道1 小时前
机器翻译:Hugging Face库详解
人工智能·自然语言处理·机器翻译
Blossom.1181 小时前
把大模型当“温度计”——基于 LLM 的分布式系统异常根因定位实战
人工智能·python·深度学习·机器学习·自然语言处理·分类·bert