|------|---|---|---|---|---|---|---|---|---|---|---|
| 输入序列 | 清 | 华 | 大 | 学 | 座 | 落 | 于 | 首 | 都 | 北 | 京 |
| 输出标注 | B | I | I | I | O | O | O | O | O | B | I |
import mindspore as ms
import mindspore.nn as nn
import mindspore.ops as ops
import mindspore.numpy as mnp
from mindspore.common.initializer import initializer, Uniform
def sequence_mask(seq_length, max_length, batch_first=False):
"""根据序列实际长度和最大长度生成mask矩阵"""
range_vector = mnp.arange(0, max_length, 1, seq_length.dtype)
result = range_vector < seq_length.view(seq_length.shape + (1,))
if batch_first:
return result.astype(ms.int64)
return result.astype(ms.int64).swapaxes(0, 1)
class CRF(nn.Cell):
def __init__(self, num_tags: int, batch_first: bool = False, reduction: str = 'sum') -> None:
if num_tags <= 0:
raise ValueError(f'invalid number of tags: {num_tags}')
super().__init__()
if reduction not in ('none', 'sum', 'mean', 'token_mean'):
raise ValueError(f'invalid reduction: {reduction}')
self.num_tags = num_tags
self.batch_first = batch_first
self.reduction = reduction
self.start_transitions = ms.Parameter(initializer(Uniform(0.1), (num_tags,)), name='start_transitions')
self.end_transitions = ms.Parameter(initializer(Uniform(0.1), (num_tags,)), name='end_transitions')
self.transitions = ms.Parameter(initializer(Uniform(0.1), (num_tags, num_tags)), name='transitions')
def construct(self, emissions, tags=None, seq_length=None):
if tags is None:
return self._decode(emissions, seq_length)
return self._forward(emissions, tags, seq_length)
def _forward(self, emissions, tags=None, seq_length=None):
if self.batch_first:
batch_size, max_length = tags.shape
emissions = emissions.swapaxes(0, 1)
tags = tags.swapaxes(0, 1)
else:
max_length, batch_size = tags.shape
if seq_length is None:
seq_length = mnp.full((batch_size,), max_length, ms.int64)
mask = sequence_mask(seq_length, max_length)
# shape: (batch_size,)
numerator = compute_score(emissions, tags, seq_length-1, mask, self.transitions, self.start_transitions, self.end_transitions)
# shape: (batch_size,)
denominator = compute_normalizer(emissions, mask, self.transitions, self.start_transitions, self.end_transitions)
# shape: (batch_size,)
llh = denominator - numerator
if self.reduction == 'none':
return llh
if self.reduction == 'sum':
return llh.sum()
if self.reduction == 'mean':
return llh.mean()
return llh.sum() / mask.astype(emissions.dtype).sum()
def _decode(self, emissions, seq_length=None):
if self.batch_first:
batch_size, max_length = emissions.shape[:2]
emissions = emissions.swapaxes(0, 1)
else:
batch_size, max_length = emissions.shape[:2]
if seq_length is None:
seq_length = mnp.full((batch_size,), max_length, ms.int64)
mask = sequence_mask(seq_length, max_length)
### viterbi_decode 前述算法
return viterbi_decode(emissions, mask, self.transitions, self.start_transitions, self.end_transitions)
BiLSTM+CRF模型
设计一个双向 LSTM+CRF 的模型来进行命名实体识别任务的训练。
模型结构如下:nn.Embedding -> nn.LSTM -> nn.Dense -> CRF
其中 LSTM 提取序列特征 ,经过 Dense 层变换获得发射概率矩阵 ,最后送入CRF层。
python复制代码
class BiLSTM_CRF(nn.Cell):
def __init__(self,
vocab_size, embedding_dim,
hidden_dim, num_tags,
padding_idx=0):
super().__init__()
self.embedding = nn.Embedding(
vocab_size, ## 词表长度
embedding_dim, ## 每个嵌入向量的大小。
padding_idx=padding_idx)
self.lstm = nn.LSTM(embedding_dim,
hidden_dim // 2,
bidirectional=True,
batch_first=True)
self.hidden2tag = nn.Dense( hidden_dim,
num_tags, ### 标签集长度
'he_uniform')
self.crf = CRF( num_tags,
batch_first=True)
def construct(self, inputs, seq_length, tags=None):
embeds = self.embedding(inputs)
outputs, _ = self.lstm(embeds, seq_length=seq_length)
feats = self.hidden2tag(outputs)
crf_outs = self.crf(feats, tags, seq_length)
return crf_outs
### 生成两句例子和对应的标签,并构造词表和标签表。
embedding_dim = 16
hidden_dim = 32
training_data = [(
"清 华 大 学 坐 落 于 首 都 北 京".split(),
"B I I I O O O O O B I".split()
), (
"重 庆 是 一 个 魔 幻 城 市".split(),
"B I O O O O O O O".split()
)]
word_to_idx = {}
word_to_idx['<pad>'] = 0
for sentence, tags in training_data:
for word in sentence:
if word not in word_to_idx:
word_to_idx[word] = len(word_to_idx)
tag_to_idx = {"B": 0, "I": 1, "O": 2}
模型实例化
由于CRF层已经进行了NLLLoss的计算,因此不需要再设置Loss。
python复制代码
model = BiLSTM_CRF( len(word_to_idx), ## 词表长度
embedding_dim, ## embedding大小
hidden_dim, ## 隐藏层大小
len(tag_to_idx)) ## 标签集长度
optimizer = nn.SGD( model.trainable_params(),
learning_rate=0.01,
weight_decay=1e-4)
### 前向传播和训练步骤定义
grad_fn = ms.value_and_grad(model, None, optimizer.parameters)
def train_step(data, seq_length, label):
loss, grads = grad_fn(data, seq_length, label)
optimizer(grads)
return loss
def prepare_sequence(seqs, word_to_idx, tag_to_idx):
# 将生成的数据打包成Batch,按照序列最大长度,对长度不足的序列进行填充,分别返回输入序列、输出标签和序列长度构成的Tensor。
seq_outputs, label_outputs, seq_length = [], [], []
max_len = max([len(i[0]) for i in seqs])
for seq, tag in seqs:
seq_length.append(len(seq))
idxs = [word_to_idx[w] for w in seq]
labels = [tag_to_idx[t] for t in tag]
idxs.extend([word_to_idx['<pad>'] for i in range(max_len - len(seq))])
labels.extend([tag_to_idx['O'] for i in range(max_len - len(seq))])
seq_outputs.append(idxs)
label_outputs.append(labels)
return ms.Tensor(seq_outputs, ms.int64), \
ms.Tensor(label_outputs, ms.int64), \
ms.Tensor(seq_length, ms.int64)
设计的模型查看:
输出的模型实例化例子查看:
模型训练
python复制代码
from tqdm import tqdm
steps = 500
with tqdm(total=steps) as t:
for i in range(steps):
loss = train_step(data, seq_length, label)
t.set_postfix(loss=loss)
t.update(1)
500步长的训练过程查看:
模型预测
python复制代码
score, history = model(data, seq_length)
print(score)
predict = post_decode(score, history, seq_length)
print(predict)
#########################
## 最后将预测的index序列转换为标签序列,打印输出结果,查看效果。
idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}
def sequence_to_tag(sequences, idx_to_tag):
outputs = []
for seq in sequences:
outputs.append([idx_to_tag[i] for i in seq])
return outputs