Emotional chatbots are usually open-domain: users can talk to the bot about all kinds of topics. Microsoft XiaoIce and the early AnswerBus are chatbots of this type. Retrieval-based open-domain chatbots require large amounts of corpus data, and their development process is similar to that of task-oriented chatbots, whereas generative chatbots based on deep learning have a natural advantage in the open domain. Among them, chit-chat bots built on the Seq2Seq model rely on an architecture that has already been applied successfully in machine translation.
Seq2Seq is a classic NLP model, originally developed at Google for machine translation. It is built on RNNs and supports applications including, but not limited to, language translation, human-machine dialogue, and content generation. As the name suggests, Seq2Seq takes one sequence as input and produces another sequence as output, and the key property of this structure is that the input and output sequences can have different, variable lengths. Seq2Seq belongs to the broader Encoder-Decoder family: the network consists of an encoder and a decoder. The encoder converts the input items into a hidden vector that captures their features; the decoder reverses the process, turning that vector back into output items, each time using its previous output as the next input. This process repeats until an end-of-sequence symbol is produced.
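To make that encode-then-decode loop concrete, here is a minimal, framework-free sketch (not the chatbot's actual model): encode and decode_step are hypothetical stubs standing in for a trained encoder and decoder, and the loop simply feeds each output back in until the end symbol appears.

```python
EOS = "<eos>"

def encode(source_tokens):
    # A real encoder (e.g. an RNN) would compress the input into a hidden vector;
    # this stub just returns a copy of the tokens as a stand-in "state".
    return list(source_tokens)

def decode_step(state, previous_token):
    # A real decoder would predict the next token from its state and the previous
    # output; this stub simply echoes the input tokens and then emits EOS.
    return state.pop(0) if state else EOS

def seq2seq(source_tokens, max_len=20):
    state = encode(source_tokens)
    outputs, token = [], "<go>"
    for _ in range(max_len):
        token = decode_step(state, token)   # the previous output feeds the next step
        if token == EOS:                    # stop at the end-of-sequence symbol
            break
        outputs.append(token)
    return outputs

print(seq2seq(["你", "好"]))   # -> ['你', '好'] with this echo stub
```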
1. The Development Workflow of a Seq2Seq-Based Chatbot
In this section we walk through the development workflow of a Seq2Seq-based chatbot using the TensorFlow deep learning framework.
1.1 Corpus Preparation
The first step is corpus preparation: the model is trained on an open-domain chat corpus. In our chat corpus, the odd lines are questions and the even lines are the corresponding answers.
```text
1 聊点什么好呢?              (What shall we chat about?)
2 那我们随便聊聊吧            (Then let's just chat about anything)
3 你是什么人?                (Who are you?)
4 我是智能客服                (I am an intelligent customer-service assistant)
5 有人在吗                   (Is anyone there?)
6 小宝一直会在这里诚心为您服务  (Xiaobao will always be here to serve you wholeheartedly)
```

An open-domain generative chatbot needs a sufficiently large chat corpus covering most everyday topics; otherwise the answers will lack diversity and the generated sentences will not be fluent. We then preprocess the whole chat corpus and build vocabulary statistics (a word-frequency dictionary).
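As a small illustration of this corpus layout, the helper below (an assumption for this example, not part of the project code; the file name chat_corpus.txt is hypothetical) pairs each odd line with the even line that follows it:

```python
def load_qa_pairs(corpus_path):
    # Read the corpus and pair odd lines (questions) with even lines (answers).
    with open(corpus_path, encoding="utf-8") as f:
        lines = [line.strip() for line in f if line.strip()]
    return list(zip(lines[0::2], lines[1::2]))

pairs = load_qa_pairs("chat_corpus.txt")   # hypothetical file name
for question, answer in pairs[:3]:
    print(question, "->", answer)
```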
```python
import re

import tensorflow as tf
from tensorflow.python.platform import gfile

# Special tokens and the digit pattern follow the TensorFlow translate tutorial.
_START_VOCAB = [b"_PAD", b"_GO", b"_EOS", b"_UNK"]
_DIGIT_RE = re.compile(br"\d")

def basic_tokenizer(sentence):
    # Very simple tokenizer: split the (byte) sentence on whitespace.
    return sentence.strip().split()

def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
    if not gfile.Exists(vocabulary_path):
        print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
        vocab = {}
        with gfile.GFile(data_path, mode="rb") as f:
            counter = 0
            for line in f:
                counter += 1
                if counter % 100000 == 0:
                    print("processing line %d" % counter)
                line = tf.compat.as_bytes(line)
                tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
                for w in tokens:
                    # Optionally map every digit to "0" so all numbers share tokens.
                    word = _DIGIT_RE.sub(b"0", w) if normalize_digits else w
                    if word in vocab:
                        vocab[word] += 1
                    else:
                        vocab[word] = 1
        # Special tokens first, then words sorted by descending frequency.
        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]
        with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
            for w in vocab_list:
                vocab_file.write(w + b"\n")
```
Using the word frequencies and the vocabulary, we convert the chat corpus into token ids. For example, for the sentence "聊点什么好呢", the position of each word in the vocabulary, ["聊": 0, "点": 1, "什么": 2, "好": 3, "呢": 4], lets us represent the sentence as [0, 1, 2, 3, 4].
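The data_to_token_ids function below relies on a sentence_to_token_ids helper. A sketch of it, modeled on the TensorFlow translate tutorial and reusing basic_tokenizer and _DIGIT_RE from the vocabulary code above, might look like this:

```python
UNK_ID = 3  # index of the unknown-word token in _START_VOCAB

def sentence_to_token_ids(sentence, vocabulary, tokenizer=None, normalize_digits=True):
    # Split the sentence into words, then look each word up in the vocabulary,
    # falling back to UNK_ID for out-of-vocabulary words.
    words = tokenizer(sentence) if tokenizer else basic_tokenizer(sentence)
    if not normalize_digits:
        return [vocabulary.get(w, UNK_ID) for w in words]
    # Normalize digits to "0" before the lookup, matching create_vocabulary above.
    return [vocabulary.get(_DIGIT_RE.sub(b"0", w), UNK_ID) for w in words]
```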
```python
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
    """Tokenize a data file and convert it into token ids using the given vocabulary.

    This function loads the file at data_path line by line, calls
    sentence_to_token_ids (see the sketch above), and saves the result to
    target_path. See the comments of sentence_to_token_ids for details of the
    token-id format.

    Args:
        data_path (str): path to the data file, one sentence per line.
        target_path (str): path where the token-id file will be created.
        vocabulary_path (str): path to the vocabulary file.
        tokenizer: function used to tokenize each sentence; if None,
            basic_tokenizer is used.
        normalize_digits (bool): if True, all digits are replaced by "0".
    """
    if not gfile.Exists(target_path):
        print("Tokenizing data in {}".format(data_path))
        # initialize_vocabulary (from the same data_utils module) returns both the
        # word->id dict and the reversed id->word list; we only need the dict here.
        vocab, _ = initialize_vocabulary(vocabulary_path)
        with gfile.GFile(data_path, mode="rb") as data_file:
            with gfile.GFile(target_path, mode="w") as tokens_file:
                counter = 0
                for line in data_file:
                    try:
                        line = line.decode('utf8', 'ignore')
                    except Exception as e:
                        print(e, line)
                        continue
                    counter += 1
                    if counter % 100000 == 0:
                        print("Tokenizing line {}".format(counter))
                    token_ids = sentence_to_token_ids(tf.compat.as_bytes(line), vocab,
                                                      tokenizer, normalize_digits)
                    tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
```
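Putting the two preprocessing steps together, a typical call sequence (with hypothetical file names) would be:

```python
# Build the vocabulary first, then convert the corpus into token-id files.
create_vocabulary("vocab40000.txt", "chat_corpus.txt", max_vocabulary_size=40000)
data_to_token_ids("chat_corpus.txt", "chat_corpus.ids", "vocab40000.txt")
```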
1.2 Defining the Encoder and Decoder
Following the Seq2Seq structure, we first define the RNN cell, choosing a GRU or LSTM cell and setting its size. We then build the encoder and decoder with the tf_seq2seq.embedding_attention_seq2seq function. In training mode, the decoder's input is the ground-truth target sequence.
```python
# Excerpt from the model-building code: size, num_layers, use_lstm, forward_only,
# output_projection, softmax_loss_function, etc. come from the model's constructor.
def single_cell():
    # Choose the RNN cell type and size used by both the encoder and the decoder.
    return (tf.contrib.rnn.BasicLSTMCell(size) if use_lstm
            else tf.contrib.rnn.GRUCell(size))

cell = single_cell()
if num_layers > 1:
    cell = tf.contrib.rnn.MultiRNNCell([single_cell() for _ in range(num_layers)])

# The seq2seq function: we use embedding for the input and attention.
def seq2seq_f(encoder_inputs, decoder_inputs, feed_previous):
    return tf_seq2seq.embedding_attention_seq2seq(
        encoder_inputs, decoder_inputs, cell,
        num_encoder_symbols=source_vocab_size, num_decoder_symbols=target_vocab_size,
        embedding_size=size, output_projection=output_projection,
        feed_previous=feed_previous, dtype=dtype)

# Training outputs and losses.
if forward_only:
    # Inference mode: the decoder feeds its previous output back as the next input.
    self.outputs, self.losses, self.encoder_state = tf_seq2seq.model_with_buckets(
        self.encoder_inputs, self.decoder_inputs, targets, self.target_weights,
        buckets, lambda x, y: seq2seq_f(x, y, True),
        softmax_loss_function=softmax_loss_function)
    # If we use output projection, we need to project outputs for decoding.
    if output_projection is not None:
        for b in range(len(buckets)):
            self.outputs[b] = [
                tf.matmul(output, output_projection[0]) + output_projection[1]
                for output in self.outputs[b]
            ]
else:
    # Training mode: the decoder is fed the ground-truth target sequence.
    self.outputs, self.losses, self.encoder_state = tf_seq2seq.model_with_buckets(
        self.encoder_inputs, self.decoder_inputs, targets, self.target_weights,
        buckets, lambda x, y: seq2seq_f(x, y, False),
        softmax_loss_function=softmax_loss_function)
```
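A note on two names that appear above but are defined elsewhere in the model class: output_projection and softmax_loss_function are conventionally created together when a sampled-softmax loss is used, so that training does not have to compute logits over the full vocabulary at every step; the projection then maps the decoder's hidden outputs back to full-vocabulary logits at decoding time. Treat this as an assumption about the surrounding model code (it follows the TensorFlow Seq2Seq tutorial convention) rather than something shown here.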
1.3 Model Training and Evaluation
For the training stage, we first define the encoder and decoder network structure (Section 12.3.2), then preprocess the input (Section 12.3.1), and finally split the data into batches with get_batch and train them in a Session. In addition, at every epoch we compute the perplexity of the sentences the model generates, to evaluate whether the generated responses are fluent.
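The perplexity check mentioned above is simply the exponential of the average cross-entropy loss accumulated between checkpoints; a minimal sketch (the 300 cutoff guards against overflow, as in the TensorFlow translate tutorial):

```python
import math

def perplexity(avg_loss):
    # Perplexity is exp(average cross-entropy loss); cap very large losses.
    return math.exp(avg_loss) if avg_loss < 300 else float("inf")

print(perplexity(2.3))   # ~9.97; lower perplexity indicates more fluent output
```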
```python
import os
import time

import numpy as np
import tensorflow as tf

# data_utils, seq2seq_model_utils and setup_workpath are modules of the chatbot project.
def train(args):
    print("[%s] Preparing dialog data in %s" % (args.model_name, args.data_dir))
    setup_workpath(workspace=args.workspace)
    train_data, dev_data, _ = data_utils.prepare_dialog_data(args.data_dir, args.vocab_size)

    if args.reinforce_learn:
        args.batch_size = 1  # we decode one sentence at a time in reinforcement-learning mode

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_usage)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        # Create the model.
        print("Creating %d layers of %d units." % (args.num_layers, args.size))
        model = seq2seq_model_utils.create_model(sess, args, forward_only=False)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)." % args.max_train_data_size)
        dev_set = data_utils.read_data(dev_data, args.buckets, reversed=args.rev_model)
        train_set = data_utils.read_data(train_data, args.buckets, args.max_train_data_size,
                                         reversed=args.rev_model)

        # Relative sizes of the buckets, used to sample batches in proportion to the data.
        train_bucket_sizes = [len(train_set[b]) for b in range(len(args.buckets))]
        train_total_size = float(sum(train_bucket_sizes))
        train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                               for i in range(len(train_bucket_sizes))]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        previous_losses = []  # losses at each checkpoint, used later for learning-rate decay

        # Load vocabularies.
        vocab_path = os.path.join(args.data_dir, "vocab%d.in" % args.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        while True:
            # Pick a bucket at random, weighted by the bucket sizes computed above.
            random_number = np.random.random_sample()
            bucket_id = min([i for i in range(len(train_buckets_scale))
                             if train_buckets_scale[i] > random_number])

            # Get a batch and make a training step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(train_set, bucket_id)
            if args.reinforce_learn:
                # Step with the reinforcement-learning objective.
                step_loss = model.step_rf(args, sess, encoder_inputs, decoder_inputs,
                                          target_weights, bucket_id, rev_vocab)
            else:
                # Step with the default cross-entropy loss.
                step_loss = model.step(sess, encoder_inputs, decoder_inputs, target_weights,
                                       bucket_id=bucket_id, forward_only=False)

            # Accumulate timing and loss statistics for this step.
            step_time += (time.time() - start_time) / args.steps_per_checkpoint
            loss += step_loss / args.steps_per_checkpoint
            # ... (checkpointing and perplexity evaluation on dev_set follow in the full loop)
```
1.4 Model Inference and Beam Search
In the inference module, which generates the actual dialogue responses, we use beam search to look for the best output; controlling the beam size preserves the diversity of the generated sentences. We can also bring in reinforcement learning: human feedback on the bot's different answers is collected promptly and used to keep improving the model.
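Before looking at the project's decoder code, here is a minimal, framework-free sketch of the beam search idea, assuming a hypothetical step_fn that returns log-probabilities for the next token given a prefix:

```python
import heapq
import numpy as np

EOS_ID = 2  # hypothetical end-of-sentence id for this sketch

def beam_search(step_fn, start_ids, beam_size=5, max_len=20):
    """Generic beam search: step_fn(prefix) -> log-probabilities over the vocabulary."""
    beams = [(0.0, list(start_ids))]       # (cumulative log-prob, token prefix)
    finished = []
    for _ in range(max_len):
        candidates = []
        for logp, prefix in beams:
            if prefix[-1] == EOS_ID:        # keep finished hypotheses aside
                finished.append((logp, prefix))
                continue
            next_logp = step_fn(prefix)     # log P(next token | prefix)
            for tok in np.argsort(next_logp)[::-1][:beam_size]:
                candidates.append((logp + next_logp[tok], prefix + [int(tok)]))
        if not candidates:
            break
        beams = heapq.nlargest(beam_size, candidates)  # keep the best beam_size prefixes
    finished.extend(beams)
    return sorted(finished, reverse=True)[:beam_size]
```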
```python
# Excerpt from the decoding routine: model_step, dict_lookup, args, rev_vocab,
# input_token_ids, debug, return_raw, heapq, random() and numpy (np) are provided
# by the surrounding module.
# Get output logits for the sentence: beam search with an optional anti-LM term.
beams, new_beams, results = [
    (1.0, 0.0, {'eos': 0, 'dec_inp': decoder_inputs, 'prob': 1.0, 'prob_ts': 1.0, 'prob_t': 1.0})
], [], []
# Dummy (empty) source sentence used to estimate the language-model term.
dummy_encoder_inputs = [np.array([data_utils.PAD_ID]) for _ in range(len(encoder_inputs))]

for dptr in range(len(decoder_inputs) - 1):
    if dptr > 0:
        beams, new_beams = new_beams[:args.beam_size], []
    heapq.heapify(new_beams)  # we always keep only the best beam_size partial paths
    for prob, _, cand in beams:
        if cand['eos']:
            # This candidate already produced EOS; keep it as a finished result.
            results += [(prob, 0.0, cand)]
            continue
        if debug:
            print(' '.join([dict_lookup(rev_vocab, w) for w in cand['dec_inp']]))

        # Probability from the normal seq2seq model.
        all_prob_ts = model_step(encoder_inputs, cand['dec_inp'], dptr, target_weights, bucket_id)
        if args.antilm:
            # Anti-LM term: the same decoder fed the dummy source sentence.
            all_prob_t = model_step(dummy_encoder_inputs, cand['dec_inp'], dptr, target_weights, bucket_id)
            # Adjusted probability: penalize generic, input-independent responses.
            all_prob = all_prob_ts - args.antilm * all_prob_t
        else:
            all_prob_t = [0] * len(all_prob_ts)
            all_prob = all_prob_ts
        if args.n_bonus != 0:
            all_prob = all_prob + args.n_bonus * dptr  # length bonus for longer replies

        # Suppress copy-cat responses (answering with the same tokens as the input).
        if dptr < len(input_token_ids):
            all_prob[input_token_ids[dptr]] = all_prob[input_token_ids[dptr]] * 0.01
        if return_raw:
            return all_prob, all_prob_ts, all_prob_t

        # Beam search: expand this candidate with the top beam_size next tokens.
        for c in np.argsort(all_prob)[::-1][:args.beam_size]:
            new_cand = {
                'eos': (c == data_utils.EOS_ID),
                'dec_inp': [(np.array([c]) if i == (dptr + 1) else k)
                            for i, k in enumerate(cand['dec_inp'])],
                'prob_ts': cand['prob_ts'] * all_prob_ts[c],
                'prob_t': cand['prob_t'] * all_prob_t[c],
                'prob': cand['prob'] * all_prob[c],
            }
            # Stuff a random tiebreaker so heapq never has to compare the dicts.
            new_cand = (new_cand['prob'], random(), new_cand)
            try:
                if len(new_beams) < args.beam_size:
                    heapq.heappush(new_beams, new_cand)
                elif new_cand[0] > new_beams[0][0]:
                    heapq.heapreplace(new_beams, new_cand)
            except Exception as e:
                print("[Error]", e)
                print("-----[new_beams]-----\n", new_beams)
                print("-----[new_cand]-----\n", new_cand)

results += new_beams  # flush the candidates from the last step

# Post-process: turn token ids back into words and keep the best beam_size answers.
res_cands = []
for prob, _, cand in sorted(results, key=lambda r: r[0], reverse=True):
    cand['dec_inp'] = " ".join([dict_lookup(rev_vocab, w) for w in cand['dec_inp']])
    res_cands.append(cand)
return res_cands[:args.beam_size]
```
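Two details of the decoding code above are worth calling out. The line all_prob = all_prob_ts - args.antilm * all_prob_t subtracts a language-model score, obtained by running the same decoder on a dummy (empty) source sentence, from the normal Seq2Seq score; this is an anti-LM (MMI-style) adjustment that penalizes generic responses that would be likely regardless of the user's input, such as "我不知道". The copy-cat suppression simply down-weights tokens that would echo the user's input position by position, so the bot does not just repeat the question back.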