[Text Classification] Binary Classification with BERT

python
import os
import torch
from torch.optim import AdamW  # AdamW now lives in torch.optim; the transformers copy is deprecated/removed
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# Custom dataset wrapping raw texts and labels
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)
        }
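
# Note (a sketch, not used in this script): padding every sample to max_length
# wastes compute on short texts. If __getitem__ returned unpadded encodings
# (drop padding= and return_tensors=), transformers' DataCollatorWithPadding
# could pad each batch only to its longest member:
#
#   from transformers import DataCollatorWithPadding
#   collator = DataCollatorWithPadding(tokenizer=tokenizer)
#   loader = DataLoader(dataset, batch_size=16, collate_fn=collator)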


# Training function
def train_model(model, train_loader, optimizer, device, num_epochs=3):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{num_epochs}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1} Loss: {total_loss / len(train_loader)}")


# Evaluation function
def evaluate_model(model, val_loader, device):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()

            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions)
    print(f"Validation Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)


# Save the fine-tuned model and tokenizer
def save_model(model, tokenizer, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model saved to {output_dir}")


# Load a saved model and tokenizer
def load_model(output_dir, device):
    tokenizer = BertTokenizer.from_pretrained(output_dir)
    model = BertForSequenceClassification.from_pretrained(output_dir)
    model.to(device)
    print(f"Model loaded from {output_dir}")
    return model, tokenizer


# Inference function
def predict(texts, model, tokenizer, device, max_length=128):
    model.eval()
    encodings = tokenizer(
        texts,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1).cpu().numpy()
        predictions = torch.argmax(logits, dim=1).cpu().numpy()

    return predictions, probabilities
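
# Usage sketch (illustrative values, not from the original script):
#
#   preds, probs = predict(["great phone", "awful battery"], model, tokenizer, device)
#
# For human-readable labels, an id2label mapping can be attached when loading:
#
#   model = BertForSequenceClassification.from_pretrained(
#       path, num_labels=2,
#       id2label={0: "negative", 1: "positive"},
#       label2id={"negative": 0, "positive": 1},
#   )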


# Main entry point
def main():
    # Configuration
    config = {
        "train_batch_size": 16,
        "val_batch_size": 16,
        "learning_rate": 5e-5,
        "num_epochs": 5,
        "max_length": 128,
        "device_id": 7,  # GPU ID to use
        "model_dir": "model",
        "local_model_path": "roberta_tiny_model",  # local model path; if None, the pretrained model below is used
        "pretrained_model_name": "uer/chinese_roberta_L-12_H-128",  # pretrained model name (Hugging Face Hub)
    }

    # Select device (falls back to CPU when CUDA is unavailable)
    device = torch.device(f"cuda:{config['device_id']}" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load tokenizer and model (fall back to the pretrained name when no local path is set)
    model_path = config["local_model_path"] or config["pretrained_model_name"]
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)
    model.to(device)

    # Toy example data (replace with a real dataset in the model's language)
    train_texts = ["This is a great product!", "I hate this service."]
    train_labels = [1, 0]
    val_texts = ["Awesome experience.", "Terrible product."]
    val_labels = [1, 0]

    # Build datasets and data loaders
    train_dataset = CustomDataset(train_texts, train_labels, tokenizer, config["max_length"])
    val_dataset = CustomDataset(val_texts, val_labels, tokenizer, config["max_length"])
    train_loader = DataLoader(train_dataset, batch_size=config["train_batch_size"], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config["val_batch_size"])

    # Define the optimizer
    optimizer = AdamW(model.parameters(), lr=config["learning_rate"])

    # Train the model
    train_model(model, train_loader, optimizer, device, num_epochs=config["num_epochs"])

    # Evaluate the model
    evaluate_model(model, val_loader, device)

    # Save the model
    save_model(model, tokenizer, config["model_dir"])

    # Reload the saved checkpoint (onto the CPU here) to verify the save/load round trip
    loaded_model, loaded_tokenizer = load_model(config["model_dir"], "cpu")

    # Run inference with the reloaded model
    new_texts = ["I love this!", "It's the worst."]
    predictions, probabilities = predict(new_texts, loaded_model, loaded_tokenizer, "cpu")
    for text, pred, prob in zip(new_texts, predictions, probabilities):
        print(f"Text: {text}")
        print(f"Predicted Label: {pred} (Probability: {prob})")


if __name__ == "__main__":
    main()
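
Dependencies: pip install torch transformers scikit-learn tqdm (any reasonably recent versions should work).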