[Text Classification] Binary Classification with BERT

The self-contained script below fine-tunes a BERT-style classifier for a two-class task: it defines a dataset wrapper, a training loop, an evaluation pass, save/load helpers, and an inference function.
import os
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW  # transformers' AdamW is deprecated; use the PyTorch implementation
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# Custom dataset: tokenizes each text to a fixed max_length on access
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)
        }
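
# --- Optional: dynamic padding per batch (a sketch, not part of the original tutorial) ---
# Padding every example to max_length wastes compute on mostly-short texts.
# An alternative is to tokenize without padding and let transformers'
# DataCollatorWithPadding pad each batch to its longest member. The
# UnpaddedDataset name below is hypothetical.
from transformers import DataCollatorWithPadding

class UnpaddedDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # No padding here; the collator pads per batch.
        encoding = self.tokenizer(self.texts[idx], max_length=self.max_length, truncation=True)
        encoding["label"] = self.labels[idx]
        return encoding

# Usage sketch:
#   collator = DataCollatorWithPadding(tokenizer=tokenizer)
#   loader = DataLoader(UnpaddedDataset(...), batch_size=16, collate_fn=collator)
# Note: the collator renames "label" to "labels", so a training loop using it
# would read batch["labels"].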


# Training loop: one optimizer step per batch, reports mean loss per epoch
def train_model(model, train_loader, optimizer, device, num_epochs=3):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{num_epochs}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1} Loss: {total_loss / len(train_loader)}")


# Evaluation: accuracy and per-class precision/recall on the validation set
def evaluate_model(model, val_loader, device):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()

            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions)
    print(f"Validation Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)


# Save model weights and tokenizer files to a directory
def save_model(model, tokenizer, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model saved to {output_dir}")


# Load a saved model and tokenizer from a directory
def load_model(output_dir, device):
    tokenizer = BertTokenizer.from_pretrained(output_dir)
    model = BertForSequenceClassification.from_pretrained(output_dir)
    model.to(device)
    print(f"Model loaded from {output_dir}")
    return model, tokenizer


# Inference: predicted labels and softmax probabilities for a list of texts
def predict(texts, model, tokenizer, device, max_length=128):
    model.eval()
    encodings = tokenizer(
        texts,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1).cpu().numpy()
        predictions = torch.argmax(logits, dim=1).cpu().numpy()

    return predictions, probabilities
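
# --- Optional: batched inference (a sketch, not in the original) ---
# predict() encodes all texts in one go, which can exhaust GPU memory for
# large input lists. This hypothetical wrapper chunks the inputs instead.
def predict_in_batches(texts, model, tokenizer, device, batch_size=32, max_length=128):
    all_preds, all_probs = [], []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        preds, probs = predict(chunk, model, tokenizer, device, max_length)
        all_preds.extend(preds)
        all_probs.extend(probs)
    return all_preds, all_probs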


# Main entry point
def main():
    # Configuration
    config = {
        "train_batch_size": 16,
        "val_batch_size": 16,
        "learning_rate": 5e-5,
        "num_epochs": 5,
        "max_length": 128,
        "device_id": 7,  # 指定 GPU ID
        "model_dir": "model",
        "local_model_path": "roberta_tiny_model",  # 指定本地模型路径,如果为 None 则使用预训练模型
        "pretrained_model_name": "uer/chinese_roberta_L-12_H-128",  # 预训练模型名称
    }

    # Select device (falls back to CPU if CUDA is unavailable)
    device = torch.device(f"cuda:{config['device_id']}" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load tokenizer and model (fall back to the Hub checkpoint if no local path is given)
    model_path = config["local_model_path"] or config["pretrained_model_name"]
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)
    model.to(device)

    # Toy example data (English sentences with a Chinese checkpoint, for demonstration only; replace with a real dataset)
    train_texts = ["This is a great product!", "I hate this service."]
    train_labels = [1, 0]
    val_texts = ["Awesome experience.", "Terrible product."]
    val_labels = [1, 0]

    # Build datasets and data loaders
    train_dataset = CustomDataset(train_texts, train_labels, tokenizer, config["max_length"])
    val_dataset = CustomDataset(val_texts, val_labels, tokenizer, config["max_length"])
    train_loader = DataLoader(train_dataset, batch_size=config["train_batch_size"], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config["val_batch_size"])

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=config["learning_rate"])

    # Train
    train_model(model, train_loader, optimizer, device, num_epochs=config["num_epochs"])

    # Evaluate
    evaluate_model(model, val_loader, device)

    # Save
    save_model(model, tokenizer, config["model_dir"])

    # Reload the saved model (on CPU here) to verify the save/load round trip
    loaded_model, loaded_tokenizer = load_model(config["model_dir"], "cpu")

    # Run inference on new texts with the reloaded model
    new_texts = ["I love this!", "It's the worst."]
    predictions, probabilities = predict(new_texts, loaded_model, loaded_tokenizer, "cpu")
    for text, pred, prob in zip(new_texts, predictions, probabilities):
        print(f"Text: {text}")
        print(f"Predicted Label: {pred} (Probability: {prob})")


if __name__ == "__main__":
    main()
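
The toy data above is just two sentences per split. To fine-tune on a real dataset, the texts and labels would typically be read from disk; here is a minimal sketch assuming a hypothetical CSV file with "text" and "label" columns (pandas is not used in the original script):

import pandas as pd

def load_csv_dataset(path):
    # Expects a header row with "text" (string) and "label" (0/1) columns.
    df = pd.read_csv(path)
    return df["text"].tolist(), df["label"].tolist()

# train_texts, train_labels = load_csv_dataset("train.csv")  # hypothetical file
# val_texts, val_labels = load_csv_dataset("val.csv")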