Tokenizer
Natural language must first pass through a tokenizer and an embedding layer before it can be fed into a neural network.
A tokenizer splits a natural language input into the smallest units the model can understand (tokens), then maps each token to a fixed index. For example, suppose the vocabulary has three entries; the input "我喜欢你" ("I like you") is processed as follows:
```plain
input: 我
output: 0
input: 喜欢
output: 1
input: 你
output: 2
```
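Concretely, this step is just a lookup in a vocabulary table. A minimal sketch, assuming the three-entry vocabulary above (the `vocab` dict and `tokens` list are illustrative):

```python
# Hypothetical vocabulary matching the example above
vocab = {"我": 0, "喜欢": 1, "你": 2}

tokens = ["我", "喜欢", "你"]        # result of the splitting step
ids = [vocab[t] for t in tokens]     # look up each token's index
print(ids)                           # [0, 1, 2]
```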
Common tokenization approaches include (a sketch comparing the three follows the list):
1. Word-level
2. Subword-level (e.g., BPE, WordPiece)
3. Character-level
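To make the three granularities concrete, here is a minimal sketch. The subword split is illustrative of what a trained BPE/WordPiece tokenizer might produce, not the output of any specific model:

```python
text = "unbelievable results"

# 1. Word-level: split on whitespace; every surface form is its own token,
#    so the vocabulary must be large and rare words become <UNK>
print(text.split())   # ['unbelievable', 'results']

# 2. Character-level: every character is a token; tiny vocabulary,
#    but sequences get very long
print(list(text))     # ['u', 'n', 'b', 'e', 'l', ...]

# 3. Subword-level (illustrative): a trained BPE/WordPiece tokenizer splits
#    rare words into frequent pieces, e.g. ['un', '##believ', '##able', 'results'],
#    balancing vocabulary size against sequence length
```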
Implementing a Simple Tokenizer
```python
import re
from collections import Counter

class SimpleTokenizer:
    def __init__(self, vocab_size=10000):
        self.vocab_size = vocab_size
        # Reserve indices 0-3 for special tokens
        self.word2idx = {"<PAD>": 0, "<UNK>": 1, "<SOS>": 2, "<EOS>": 3}
        self.idx2word = {v: k for k, v in self.word2idx.items()}

    def train(self, texts):
        # Build the vocabulary from the most frequent words in the corpus
        words = []
        for text in texts:
            words.extend(self._tokenize(text))
        word_freq = Counter(words)
        # Keep (vocab_size - 4) words; 4 slots are taken by special tokens
        most_common = word_freq.most_common(self.vocab_size - 4)
        for word, _ in most_common:
            idx = len(self.word2idx)
            self.word2idx[word] = idx
            self.idx2word[idx] = word

    def _tokenize(self, text):
        # Lowercase, strip punctuation, then split on whitespace (word-level)
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        return text.split()

    def encode(self, text):
        # Map each token to its index; unknown words fall back to <UNK>
        tokens = self._tokenize(text)
        return [self.word2idx.get(token, self.word2idx["<UNK>"]) for token in tokens]

    def decode(self, indices):
        # Map indices back to words and rejoin with spaces
        return " ".join([self.idx2word.get(idx, "<UNK>") for idx in indices])

# Test the tokenizer
def test_tokenizer():
    texts = [
        "Hello world, this is a test.",
        "Machine learning is fascinating.",
        "Natural language processing is important."
    ]
    tokenizer = SimpleTokenizer(vocab_size=1000)
    tokenizer.train(texts)

    test_text = "Hello, this is machine learning."
    encoded = tokenizer.encode(test_text)
    decoded = tokenizer.decode(encoded)

    print(f"Original text: {test_text}")
    print(f"Encoded: {encoded}")
    print(f"Decoded: {decoded}")
    print(f"Vocabulary size: {len(tokenizer.word2idx)}")

test_tokenizer()
```
Using a HuggingFace Tokenizer
```python
from transformers import AutoTokenizer

# Load a pretrained tokenizer (WordPiece, as used by BERT)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def test_huggingface_tokenizer():
    text = "Hello, this is a test of tokenization."

    # Tokenize into subword pieces
    tokens = tokenizer.tokenize(text)
    print(f"Tokens: {tokens}")

    # Convert tokens to vocabulary IDs
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    print(f"Input IDs: {input_ids}")

    # Full encoding (adds special tokens such as [CLS] and [SEP])
    encoded = tokenizer(text, return_tensors="pt")
    print(f"Full encoding: {encoded}")

    # Decode IDs back to text
    decoded = tokenizer.decode(encoded["input_ids"][0])
    print(f"Decoded: {decoded}")

# test_huggingface_tokenizer()
```
Embedding
The indices produced by the tokenizer are then passed to the embedding layer. Its input is therefore a (batch_size, seq_len) tensor of token indices, where batch_size is the number of sequences processed in one batch, seq_len is the length of each sequence, and each entry is the index the tokenizer assigned to a token. For example, after tokenization the sentence "我喜欢你" enters the embedding layer as:
```plain
[[0, 1, 2]]
```
Here batch_size is 1, seq_len is 3, and the values are the indices from the example above.
Internally, the embedding layer is a trainable weight matrix of shape (vocab_size, embedding_dim): every entry in the vocabulary corresponds to one row, a vector of dimension embedding_dim. Each input index selects its row from this matrix, and the selected vectors are assembled into an output of shape (batch_size, seq_len, embedding_dim).
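A quick way to see that the lookup really is row selection from the weight matrix, as a minimal sketch (the small vocab_size and embedding_dim are illustrative):

```python
import torch
import torch.nn as nn

emb = nn.Embedding(4, 8)           # vocab_size=4, embedding_dim=8
ids = torch.tensor([[0, 1, 2]])    # e.g. "我喜欢你" as indices, shape (1, 3)

out = emb(ids)                     # shape (1, 3, 8)
rows = emb.weight[ids]             # direct row indexing into the weight matrix
assert torch.equal(out, rows)      # the embedding lookup is exactly row selection
```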
Implementing an Embedding Layer
```python
import torch
import torch.nn as nn

class EmbeddingLayer(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Trainable (vocab_size, embedding_dim) lookup table
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, input_ids):
        # (batch_size, seq_len) -> (batch_size, seq_len, embedding_dim)
        return self.embedding(input_ids)

def test_embedding():
    vocab_size = 10000
    embedding_dim = 512
    batch_size = 2
    seq_len = 10

    # Simulated input: random token indices
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))

    # Embedding layer
    embedding_layer = EmbeddingLayer(vocab_size, embedding_dim)
    embeddings = embedding_layer(input_ids)

    print(f"Input ID shape: {input_ids.shape}")
    print(f"Embedding output shape: {embeddings.shape}")
    print(f"Embedding dimension: {embedding_dim}")

# test_embedding()
```