NLP常用工具包

✨对NLP项目常用工具的使用做一次拆解

1. tokenizer

python 复制代码
from torchtext.data.utils import get_tokenizer

# torchtext's built-in rule-based English tokenizer:
# it lowercases the text, then splits on whitespace and punctuation.
tokenizer = get_tokenizer('basic_english')

text_sample = "We're going on an adventure! The weather is really nice today."
print(tokenizer(text_sample))

['we', "'", 're', 'going', 'on', 'an', 'adventure', '!', 'the', 'weather', 'is', 'really', 'nice', 'today', '.']

2. vocab

python 复制代码
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

# Rule-based English tokenizer.
tokenizer = get_tokenizer('basic_english')

# Two sentences are enough to demonstrate vocabulary construction.
test_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Hello world! This is a test for building vocabulary.",
]

# Build the vocab from the tokenized sentences; the specials <unk>/<pad>
# are inserted first, so they receive indices 0 and 1.
vocab = build_vocab_from_iterator(
    map(tokenizer, test_sentences),
    specials=['<unk>', '<pad>'],
    min_freq=1,  # keep every token that appears at least once
)

# Any token not in the vocab falls back to the <unk> index.
vocab.set_default_index(vocab['<unk>'])

print("词表大小:", len(vocab))
print("'fox'的索引:", vocab['fox'])

词表大小: 21

'fox'的索引: 10

3. Dataloader(示例1)

python 复制代码
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

# 1. Rule-based English tokenizer.
tokenizer = get_tokenizer('basic_english')

# 2. Toy corpora (train/test share the same sentences here).
train_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Hello world! This is a test for building vocabulary.",
    # Add more training sentences here as needed.
]
test_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Hello world! This is a test for building vocabulary.",
]

# 3. Vocabulary built from the tokenized training sentences only;
#    the specials <unk>/<pad> take indices 0 and 1.
vocab = build_vocab_from_iterator(
    map(tokenizer, train_sentences),
    specials=['<unk>', '<pad>'],
    min_freq=1,
)
# Out-of-vocabulary tokens map to <unk>.
vocab.set_default_index(vocab['<unk>'])

print("词表大小:", len(vocab))
print("'fox'的索引:", vocab['fox'])

# 4. Dataset mapping each raw sentence to a 1-D LongTensor of token indices.
class TextDataset(Dataset):
    """Map-style dataset over raw sentences.

    Each item is the sentence at that index, tokenized and converted to a
    tensor of vocabulary indices (variable length; padding happens later
    in the collate function).
    """

    def __init__(self, sentences, vocab, tokenizer):
        self.sentences = sentences  # list of raw strings
        self.vocab = vocab          # token -> index lookup
        self.tokenizer = tokenizer  # str -> list[str]

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        ids = [self.vocab[tok] for tok in self.tokenizer(self.sentences[idx])]
        return torch.tensor(ids, dtype=torch.long)

# 5. One Dataset per split; both share the same vocab and tokenizer.
train_dataset = TextDataset(train_sentences, vocab, tokenizer)
test_dataset = TextDataset(test_sentences, vocab, tokenizer)

# 6. Collate function: right-pad a batch of variable-length index tensors
#    with the <pad> index so they stack into one (batch, max_len) tensor.
def collate_fn(batch):
    """Pad a list of 1-D LongTensors to equal length, batch-first."""
    pad_idx = vocab['<pad>']
    return pad_sequence(batch, batch_first=True, padding_value=pad_idx)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

# 7. Inspect one padded batch of indices from each loader.
print("\n=== Train Batch Indices ===")
print(next(iter(train_loader)))

print("\n=== Test Batch Indices ===")
print(next(iter(test_loader)))

=== Train Batch Indices ===
tensor([[11, 20,  4, 18, 12,  5, 17,  9,  7, 19,  2],
        [ 3, 16,  6, 10, 13, 15,  3, 14,  8,  2,  1]])

=== Test Batch Indices ===
tensor([[ 3, 16,  6, 10, 13, 15,  3, 14,  8,  2,  1],
        [11, 20,  4, 18, 12,  5, 17,  9,  7, 19,  2]])

4. Dataloader(示例2)

python 复制代码
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

# 1. Rule-based English tokenizer.
tokenizer = get_tokenizer('basic_english')

# 2. Labelled toy data: (sentence, label) pairs.
train_data = [
    ("The quick brown fox jumps over the lazy dog.", 1),  # positive sentiment
    ("Hello world! This is a test for building vocabulary.", 0),  # negative sentiment
    # Add more (sentence, label) pairs here as needed.
]
test_data = [
    ("The quick brown fox jumps over the lazy dog.", 1),
    ("Hello world! This is a test for building vocabulary.", 0),
]

# 3. Vocabulary from the training sentences only (labels are ignored).
vocab = build_vocab_from_iterator(
    (tokenizer(sent) for sent, _ in train_data),
    specials=['<unk>', '<pad>'],
    min_freq=1,
)
# Unknown tokens fall back to the <unk> index.
vocab.set_default_index(vocab['<unk>'])

print("词表大小:", len(vocab))
print("'fox'的索引:", vocab['fox'])

# 4. Dataset yielding (token-index tensor, label tensor) per example.
class TextDataset(Dataset):
    """Map-style dataset over (sentence, label) pairs.

    Each item is a pair: a variable-length 1-D LongTensor of vocabulary
    indices for the tokenized sentence, and a scalar LongTensor label.
    """

    def __init__(self, data, vocab, tokenizer):
        self.data = data            # list of (sentence, label)
        self.vocab = vocab          # token -> index lookup
        self.tokenizer = tokenizer  # str -> list[str]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence, label = self.data[idx]
        ids = [self.vocab[tok] for tok in self.tokenizer(sentence)]
        return torch.tensor(ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)

# 5. Collate function: unzip the (sequence, label) pairs, pad the
#    sequences to a common length, and stack the scalar labels.
def collate_fn(batch):
    """Return (padded_sequences, labels) for a batch of dataset items."""
    sequences, labels = zip(*batch)
    padded = pad_sequence(sequences, batch_first=True, padding_value=vocab['<pad>'])
    return padded, torch.stack(labels)

# 6. Datasets and DataLoaders (shuffle only during training).
train_dataset = TextDataset(train_data, vocab, tokenizer)
test_dataset = TextDataset(test_data, vocab, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

# 7. Inspect one (sequences, labels) batch from each loader.
print("\n=== Train Batch ===")
seq_batch, label_batch = next(iter(train_loader))
print("Sequences:", seq_batch)
print("Labels:   ", label_batch)

print("\n=== Test Batch ===")
seq_batch, label_batch = next(iter(test_loader))
print("Sequences:", seq_batch)
print("Labels:   ", label_batch)
相关推荐
caiyueloveclamp19 小时前
ChatPPT:AI PPT生成领域的“六边形战士“
人工智能·powerpoint·ai生成ppt·aippt·免费aippt
paperxie_xiexuo19 小时前
学术与职场演示文稿的结构化生成机制探析:基于 PaperXie AI PPT 功能的流程解构与适用性研究
大数据·数据库·人工智能·powerpoint
算家计算20 小时前
Meta第三代“分割一切”模型——SAM 3本地部署教程:首支持文本提示分割,400万概念、30毫秒响应,检测分割追踪一网打尽
人工智能·meta
CNRio20 小时前
生成式AI技术栈全解析:从模型架构到落地工程化
人工智能·架构
算家计算20 小时前
编程AI新王Claude Opus 4.5正式发布!编程基准突破80.9%,成本降三分之二
人工智能·ai编程·claude
青瓷程序设计20 小时前
鱼类识别系统【最新版】Python+TensorFlow+Vue3+Django+人工智能+深度学习+卷积神经网络算法
人工智能·python·深度学习
央链知播20 小时前
第二届中国数据产业发展大会暨2025元宇宙AI数据要素“金杏奖”颁奖盛典在广州隆重举行
人工智能·业界资讯·数据产业
GEO_NEWS20 小时前
解析华为Flex:ai的开源棋局
人工智能·华为·开源
扑棱蛾子20 小时前
手摸手教你两分钟搞定Antigravity
人工智能
WWZZ202520 小时前
快速上手大模型:深度学习13(文本预处理、语言模型、RNN、GRU、LSTM、seq2seq)
人工智能·深度学习·算法·语言模型·自然语言处理·大模型·具身智能