import hashlib
import os
from paddlenlp.data import DataCollatorWithPadding
import random
import numpy as np
import paddle
import json
from paddlenlp.utils.log import logger
from paddlenlp.transformers import (AutoModelForSequenceClassification, AutoTokenizer)
from paddlenlp.datasets import MapDataset
# Update the dropout probability of the model (p=0.0 disables dropout).
def update_model_dropout(model, p=0.0):
model.base_model.embeddings.dropout.p = p
for i in range(len(model.base_model.encoder.layers)):
model.base_model.encoder.layers[i].dropout.p = p
model.base_model.encoder.layers[i].dropout1.p = p
model.base_model.encoder.layers[i].dropout2.p = p
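
# Hedged usage sketch (not part of the original utilities): how update_model_dropout
# might be called after loading a pretrained model; the model name and dropout value
# below are illustrative assumptions, not requirements.
def _example_update_dropout(pretrained_name="ernie-3.0-medium-zh", p=0.1):
    model = AutoModelForSequenceClassification.from_pretrained(pretrained_name, num_classes=2)
    update_model_dropout(model, p=p)  # overwrite dropout p in the embeddings and every encoder layer
    return model
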
# The generated neg_title has a certain probability of being identical to the positive
# title, so the newly generated samples still need to be filtered to drop the identical ones.
def gen_pair(dataset, pool_size=100):
    if len(dataset) < pool_size:
        pool_size = len(dataset)
    new_examples = []   # newly generated examples
    pool = []           # temporary container holding each example's title
    tmp_examples = []   # temporary container holding the examples themselves
    for example in dataset:
        label = example["label"]
        # We only generate neg_title here: for label == 0 examples the title already is a
        # negative title, and the positive title they would need cannot be generated, so skip them.
        if label == 0:
            continue
        tmp_examples.append(example)
        pool.append(example["title"])  # the pool stores the title of each example
        if len(pool) >= pool_size:  # a full pool forms one batch
            np.random.shuffle(pool)  # shuffle the order of the titles
            # Walk through every example held in the temporary container.
            for idx, example in enumerate(tmp_examples):
                # Use another example's (shuffled) title as this example's neg_title.
                example["neg_title"] = pool[idx]
                # Add the modified example to the new sample set.
                new_examples.append(example)
            tmp_examples = []  # reset to hold the next batch
            pool = []
    if len(pool) > 0:
        np.random.shuffle(pool)
        for idx, example in enumerate(tmp_examples):
            # Use another example's (shuffled) title as this example's neg_title.
            example["neg_title"] = pool[idx]
            # Add the modified example to the new sample set.
            new_examples.append(example)
    return MapDataset(new_examples)
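
# Hedged usage sketch (assumption): `train_ds` is an iterable/MapDataset of dicts with
# "query", "title" and "label" fields, which is what gen_pair above expects; the
# pool_size value is illustrative.
def _example_gen_pair(train_ds):
    # Only label == 1 examples receive a shuffled neg_title drawn from other examples.
    return gen_pair(train_ds, pool_size=100)
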
def cal_md5(text):
    text = text.decode("utf-8", "ignore").encode("utf-8", "ignore")
    return hashlib.md5(text).hexdigest()
def create_dataloader(dataset,
batch_size=1,
batchify_fn=None,
trans_fn=None,
return_list=True,
mode='train'):
if trans_fn:
dataset = dataset.map(trans_fn)
shuffle = True if mode == 'train' else False
batch_sampler = paddle.io.BatchSampler(dataset,
batch_size=batch_size,
shuffle=shuffle)
if not return_list:
return paddle.io.DataLoader(dataset=dataset,
batch_sampler=batch_sampler,
collate_fn=batchify_fn)
return paddle.io.DataLoader(dataset=dataset,
batch_sampler=batch_sampler,
collate_fn=batchify_fn,
return_list=return_list)
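
# Hedged usage sketch (assumption): builds a padded train dataloader for text-pair
# classification. The tokenizer name, max_seq_len and the "query"/"title" field names
# are illustrative; trans_fn must return fields that DataCollatorWithPadding can pad.
def _example_create_dataloader(train_ds, batch_size=32, max_seq_len=128):
    tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-medium-zh")

    def trans_fn(example):
        return tokenizer(example["query"], text_pair=example["title"], max_seq_len=max_seq_len)

    batchify_fn = DataCollatorWithPadding(tokenizer)
    return create_dataloader(train_ds, batch_size=batch_size,
                             batchify_fn=batchify_fn, trans_fn=trans_fn, mode="train")
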
# Word repetition strategy (token-level data augmentation).
def word_repetition(input_ids, token_type_ids, dup_rate=0.32):
    """Word Repetition strategy."""
    input_ids = input_ids.numpy().tolist()
    token_type_ids = token_type_ids.numpy().tolist()
    batch_size, seq_len = len(input_ids), len(input_ids[0])
    repeated_input_ids = []       # batch of input_ids after word repetition
    repeated_token_type_ids = []  # batch of token_type_ids after word repetition
    rep_seq_len = seq_len         # maximum sequence length of the batch after repetition
    for batch_id in range(batch_size):
        cur_input_id = input_ids[batch_id]
        actual_len = np.count_nonzero(cur_input_id)  # number of non-padding tokens
        dup_word_index = []
        # Skip repetition if the actual sequence length is 5 or less.
        if actual_len > 5:
            # The number of repeated tokens is drawn from [0, max(2, int(dup_rate * actual_len))].
            dup_len = random.randint(a=0, b=max(2, int(dup_rate * actual_len)))
            # Exclude [CLS] and [SEP], then randomly sample dup_len positions to repeat.
            dup_word_index = random.sample(list(range(1, actual_len - 1)), k=dup_len)
        r_input_id = []
        r_token_type_id = []
        for idx, word_id in enumerate(cur_input_id):
            # If idx was sampled into dup_word_index, this token is appended here once ...
            if idx in dup_word_index:
                r_input_id.append(word_id)
                r_token_type_id.append(token_type_ids[batch_id][idx])
            # ... and once more below, so normal tokens are added once while sampled
            # tokens are added twice, which implements the word repetition strategy.
            r_input_id.append(word_id)
            r_token_type_id.append(token_type_ids[batch_id][idx])
        after_dup_len = len(r_input_id)  # length of this sample after repetition
        repeated_input_ids.append(r_input_id)
        repeated_token_type_ids.append(r_token_type_id)
        # Update rep_seq_len so it tracks the longest sequence in the batch.
        if after_dup_len > rep_seq_len:
            rep_seq_len = after_dup_len
    # Pad every sample in the batch to the same sequence length.
    for batch_id in range(batch_size):
        after_dup_len = len(repeated_input_ids[batch_id])  # length of the i-th sample in this batch
        pad_len = rep_seq_len - after_dup_len              # amount of padding needed
        repeated_input_ids[batch_id] += [0] * pad_len
        repeated_token_type_ids[batch_id] += [0] * pad_len
    # Return the batch after applying the word repetition strategy.
    return paddle.to_tensor(repeated_input_ids, dtype="int64"), paddle.to_tensor(
        repeated_token_type_ids, dtype="int64")
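
# Hedged usage sketch (assumption): how word_repetition is typically applied to both the
# query side and the title side of a SimCSE training batch; the 4-tuple batch layout is
# an illustrative assumption.
def _example_word_repetition(batch, dup_rate=0.32):
    query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch
    query_input_ids, query_token_type_ids = word_repetition(
        query_input_ids, query_token_type_ids, dup_rate)
    title_input_ids, title_token_type_ids = word_repetition(
        title_input_ids, title_token_type_ids, dup_rate)
    return query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids
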
def set_seed(seed):
random.seed(seed)
np.random.seed(seed)
paddle.seed(seed)
def read_by_lines(path):
result = []
with open(path, "r", encoding="utf8") as f:
for line in f:
result.append(line.strip())
return result
def write_by_lines(path, data):
with open(path, "w", encoding="utf8") as f:
        for d in data:
            f.write(d + "\n")
def write_text(path,data):
with open(path,mode='w',encoding='utf-8') as fout:
for i in data:
fout.write('{}\n'.format(i.strip()))
import json
import os
import time

import hnswlib
import numpy as np
import paddle
from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman
from paddlenlp.utils.log import logger
@paddle.no_grad()
def evaluate_glue(model, loss_fct, metric, data_loader):
model.eval()
metric.reset()
for batch in data_loader:
input_ids, labels = batch
logits = model(input_ids)
loss = loss_fct(logits,labels)
correct = metric.compute(logits,labels)
metric.update(correct)
res = metric.accumulate()
if isinstance(metric, AccuracyAndF1):
print(
"eval loss: %f, acc: %s, precision: %s, recall: %s, f1: %s, acc and f1: %s, "
% (loss.item(),*res))
elif isinstance(metric, Mcc):
print("eval loss: %f, mcc: %s, " % (loss.item(),res[0]))
elif isinstance(metric, PearsonAndSpearman):
print(
"eval loss: %f, pearson: %s, spearman: %s, pearson and spearman: %s, "
% (loss.item(),*res),
end='')
else:
print("eval loss: %f, acc: %s, " % (loss.item(), res))
metric.reset()
model.train()
    return res[0] if isinstance(res, (list, tuple)) else res
@paddle.no_grad()
def evaluate_clue(model, loss_fct, metric, data_loader):
model.eval()
metric.reset()
    for batch in data_loader:
        labels = batch.pop("labels")  # pop the labels out of the batch dict
        logits = model(**batch)       # ** unpacks the remaining inputs as keyword arguments
        loss = loss_fct(logits, labels)
        correct = metric.compute(logits, labels)  # number of correct predictions
        metric.update(correct)
    res = metric.accumulate()  # accumulated evaluation result
    logger.info("eval loss: %f, acc: %s, " % (loss.item(), res))
    metric.reset()
    model.train()
    return res
@paddle.no_grad()
def do_evaluate2(model, tokenizer, data_loader, label_normalize_dict):
model.eval()
total_num = 0
correct_num = 0
    normed_labels = [
        normalized_label
        for origin_label, normalized_label in label_normalize_dict.items()
    ]
    label_length = len(normed_labels[0])  # token length of each normalized label
for batch in data_loader:
src_ids, token_type_ids, masked_positions, masked_lm_labels = batch
        # prediction_probs: [batch_size * label_length, vocab_size]
prediction_probs = model.predict(input_ids=src_ids,
token_type_ids=token_type_ids,
masked_positions=masked_positions)
batch_size = len(src_ids)
vocab_size = prediction_probs.shape[1]
        # reshape to [batch_size, label_length, vocab_size]
prediction_probs = paddle.reshape(prediction_probs,
shape=[batch_size, -1,
vocab_size]).numpy()
        # label_ids: [label_num, label_length]
label_ids = np.array(
[tokenizer(label)["input_ids"][1:-1] for label in normed_labels])
y_pred = np.ones(shape=[batch_size, len(label_ids)])
        # Compute the joint probability of each candidate label.
for index in range(label_length):
y_pred *= prediction_probs[:, index, label_ids[:, index]]
        # Get the index of the label with the highest probability.
y_pred_index = np.argmax(y_pred, axis=-1)
y_true_index = []
for masked_lm_label in masked_lm_labels.numpy():
label_text = "".join(
tokenizer.convert_ids_to_tokens(list(masked_lm_label)))
label_index = normed_labels.index(label_text)
y_true_index.append(label_index)
y_true_index = np.array(y_true_index)
total_num += len(y_true_index)
correct_num += (y_true_index == y_pred_index).sum()
model.train()
return 100 * correct_num / total_num, total_num
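
# Hedged usage sketch (assumption): label_normalize_dict maps each original label to a
# normalized label whose tokens are predicted at the [MASK] positions; all normalized
# labels must share the same token length. The concrete labels are made up for illustration.
def _example_do_evaluate2(model, tokenizer, dev_data_loader):
    label_normalize_dict = {"positive": "很好", "negative": "不好"}
    acc, total_num = do_evaluate2(model, tokenizer, dev_data_loader, label_normalize_dict)
    return acc, total_num
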
@paddle.no_grad()
def do_evaluate_chid(model, tokenizer, data_loader, label_normalize_dict):
"""
FCLUE `chid` 数据集在评估时具有特殊性:输入槽中包含额外的 `candidate_label_ids`,
因此需要自定义评估函数。
"""
model.eval()
total_num = 0
correct_num = 0
    normed_labels = [
        normalized_label
        for origin_label, normalized_label in label_normalize_dict.items()
    ]
    label_length = len(normed_labels[0])
for batch in data_loader:
src_ids, token_type_ids, masked_positions, masked_lm_labels, candidate_label_ids = batch
        # prediction_probs: [batch_size * label_length, vocab_size]
prediction_probs = model.predict(input_ids=src_ids,
token_type_ids=token_type_ids,
masked_positions=masked_positions)
batch_size = len(src_ids)
vocab_size = prediction_probs.shape[1]
        # reshape to [batch_size, label_length, vocab_size]
prediction_probs = paddle.reshape(prediction_probs,
shape=[batch_size, -1,
vocab_size]).numpy()
candidate_num = candidate_label_ids.shape[1]
        # y_pred: [batch_size, candidate_num(7)]
y_pred = np.ones(shape=[batch_size, candidate_num])
for label_idx in range(candidate_num):
            # single_candidate_label_ids: [batch_size, label_length(4)]
single_candidate_label_ids = candidate_label_ids[:, label_idx, :]
            # Compute the joint probability of each candidate label.
for index in range(label_length):
                # slice_word_ids: [batch_size,]
slice_word_ids = single_candidate_label_ids[:, index].numpy()
batch_single_token_prob = []
for bs_index in range(batch_size):
                    # single_token_prob: [1, 1]
single_token_prob = prediction_probs[
bs_index, index, slice_word_ids[bs_index]]
batch_single_token_prob.append(single_token_prob)
y_pred[:, label_idx] *= np.array(batch_single_token_prob)
        # Get the index of the label with the highest probability.
y_pred_index = np.argmax(y_pred, axis=-1)
y_true_index = []
for index, masked_lm_label in enumerate(masked_lm_labels.numpy()):
            # tmp_candidate_label_ids: [candidate_num, label_length]
tmp_candidate_label_ids = candidate_label_ids[index, :, :]
for idx, label_ids in enumerate(tmp_candidate_label_ids.numpy()):
if np.equal(label_ids, masked_lm_label).all():
y_true_index.append(idx)
continue
y_true_index = np.array(y_true_index)
total_num += len(y_true_index)
correct_num += (y_true_index == y_pred_index).sum()
model.train()
return 100 * correct_num / total_num, total_num
@paddle.no_grad()
def do_evaluate(model, tokenizer, data_loader, task_label_description):
model.eval()
total_num = 0
correct_num = 0
class_num = len(task_label_description) # 15
    # all_prediction_probs: [total_num * class_num, 2]
all_prediction_probs = []
    # all_labels: [total_num * class_num]
all_labels = []
for batch in data_loader:
src_ids, token_type_ids, true_labels = batch
prediction_probs = model(input_ids=src_ids,
token_type_ids=token_type_ids).numpy()
all_prediction_probs.append(prediction_probs)
all_labels.append(true_labels.numpy())
all_labels = np.concatenate(all_labels, axis=0)
all_prediction_probs = np.concatenate(all_prediction_probs, axis=0)
all_prediction_probs = np.reshape(all_prediction_probs, (-1, class_num, 2))
    # all_prediction_probs now has shape (total_num, class_num, 2); along the last axis,
    # index 1 is the positive-pair score and index 0 the negative-pair score.
prediction_pos_probs = all_prediction_probs[:, :, 1]
prediction_pos_probs = np.reshape(prediction_pos_probs, (-1, class_num))
    # For each of the total_num samples, pick the class with the highest positive-pair
    # score; y_pred_index takes values in [0, class_num - 1].
y_pred_index = np.argmax(prediction_pos_probs, axis=-1)
    # Every class_num consecutive rows pair the same original sample with different
    # prompts, so one true label per original sample suffices; idx % class_num == 0
    # keeps exactly one index per group, and that index holds the true label.
y_true_index = np.array([
true_label_index for idx, true_label_index in enumerate(all_labels)
if idx % class_num == 0
])
    total_num = len(y_true_index)                        # total number of evaluated samples
    correct_num = (y_pred_index == y_true_index).sum()   # number of correct predictions
model.train()
return 100 * correct_num / total_num, total_num
hnsw_max_elements=1000000
hnsw_ef=100
hnsw_m=100
def build_index(data_loader, model,output_emb_size):
index = hnswlib.Index(
space='ip',
dim=output_emb_size if output_emb_size > 0 else 768)
index.init_index(max_elements=hnsw_max_elements,
ef_construction=hnsw_ef,
M=hnsw_m)
index.set_ef(hnsw_ef)
index.set_num_threads(6)
logger.info("start build index..........")
all_embeddings = []
for text_embeddings in model.get_semantic_embedding(data_loader):
all_embeddings.append(text_embeddings.numpy())
all_embeddings = np.concatenate(all_embeddings, axis=0)
index.add_items(all_embeddings)
logger.info("Total index number:{}".format(index.get_current_count()))
return index
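
# Hedged usage sketch (assumption): builds the HNSW index over the corpus embeddings and
# queries it directly; output_emb_size=256 and k=10 are illustrative, and similarity is
# recovered as 1 - inner-product distance, mirroring write_recall_file below.
def _example_build_and_query_index(model, corpus_data_loader, query_embeddings, output_emb_size=256):
    index = build_index(corpus_data_loader, model, output_emb_size)
    recalled_idx, distances = index.knn_query(query_embeddings, k=10)
    return recalled_idx, 1.0 - distances
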
def write_recall_file(model,query_data_loader,final_index,text_list,
id2corpus,recall_result_file,recall_num=20):
query_embedding = model.get_semantic_embedding(query_data_loader)
with open(recall_result_file, 'w', encoding='utf-8') as f:
for batch_index, batch_query_embedding in enumerate(query_embedding):
recalled_idx, cosine_sims = final_index.knn_query(\
batch_query_embedding.numpy(),recall_num)
batch_size = len(cosine_sims)
            for row_index in range(batch_size):
                text_index = batch_size * batch_index + row_index  # position of this query in text_list
                # Write the original query, each of the recall_num recalled corpus entries,
                # and their similarity (1 - inner-product distance) to the recall file.
                for idx, doc_idx in enumerate(recalled_idx[row_index]):
                    f.write("{}\t{}\t{}\n".format(
                        text_list[text_index]["text"], id2corpus[doc_idx],
                        1.0 - cosine_sims[row_index][idx]))
@paddle.no_grad()
def evaluate(model, corpus_data_loader, query_data_loader, recall_result_file, output_emb_size,
             text_list, text2similar, id2corpus, recall_num=20, final_index=None):
    model.eval()

    def recall(rs, N=10):
        recall_flags = [np.sum(r[:N]) for r in rs]  # a single 1 within the top N counts as a hit
        return np.mean(recall_flags)                # average top-N recall over all queries

    # The index has to be (re)built here rather than outside the function: the encoder
    # changes during training, and vectors extracted by a new model cannot be searched
    # against an index built from an old model's vectors. Passing a prebuilt index in via
    # `final_index` is only safe when the model is frozen (evaluation-only).
    if final_index is None:
        final_index = build_index(corpus_data_loader, model, output_emb_size)
    write_recall_file(model, query_data_loader, final_index, text_list,
                      id2corpus, recall_result_file, recall_num)
    rs = []
    with open(recall_result_file, 'r', encoding='utf-8') as f:
        relevance_labels = []
        for index, line in enumerate(f):
            # relevance_labels marks, for one query, which of its recalled texts is the
            # ground-truth similar text (1) and which is not (0).
            if index % recall_num == 0 and index != 0:
                rs.append(relevance_labels)
                relevance_labels = []  # one query's recalls are complete; start collecting the next
            text, recalled_text, cosine_sim = line.rstrip().split("\t")  # query, recalled text, distance
            # Recall simulates an irregular user query retrieving titles from the corpus. If a
            # recalled text equals the similar text paired with the query, the recall succeeded
            # and this position is marked 1; at most one of the recall_num marks can be 1, and
            # recall@N measures the average success rate over all queries. Note that the similar
            # text recorded in text2similar must actually exist in the corpus, otherwise it can
            # never be recalled, all marks stay 0, and the evaluation is distorted; such pairs
            # should be filtered out before evaluating.
            if text2similar[text] == recalled_text:
                relevance_labels.append(1)  # successful recall
            else:
                relevance_labels.append(0)  # not the ground-truth similar text
        if relevance_labels:
            rs.append(relevance_labels)  # keep the last query's recalls as well
    recall_N = []  # recall accuracy per cut-off; high recall at small N is best
    recall_nums = [1, 5, 10, 20]
    for topN in recall_nums:  # iterate over the recall cut-offs
        R = round(100 * recall(rs, N=topN), 3)
        recall_N.append(R)
    # `recall_result_dir` and `evaluate_result` are expected to be defined at module level
    # (e.g. parsed from command-line arguments).
    evaluate_result_file = os.path.join(recall_result_dir, evaluate_result)
    with open(evaluate_result_file, 'a') as result:
        res = []
        timestamp = time.strftime('%Y%m%d-%H%M%S', time.localtime())
        res.append(timestamp)
        for key, val in zip(recall_nums, recall_N):
            print('recall@{}={}'.format(key, val))
            res.append(str(val))
        result.write('\t'.join(res) + '\n')
    print(res)
    model.train()
    score = recall_N[0] * 0.3 + recall_N[1] * 0.3 + recall_N[2] * 0.2 + recall_N[3] * 0.2
    return score
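
# Hedged usage sketch (assumption): a plain evaluation call where the index is rebuilt
# from the current model; the file name and output_emb_size are illustrative. The returned
# score is the weighted sum 0.3*recall@1 + 0.3*recall@5 + 0.2*recall@10 + 0.2*recall@20
# computed above.
def _example_recall_evaluate(model, corpus_data_loader, query_data_loader,
                             text_list, text2similar, id2corpus):
    return evaluate(model, corpus_data_loader, query_data_loader,
                    recall_result_file="recall_result.tsv", output_emb_size=256,
                    text_list=text_list, text2similar=text2similar,
                    id2corpus=id2corpus, recall_num=20)
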
def predict_rank_predict(model, data_loader):
all_probs = []
model.eval()
with paddle.no_grad():
for batch_data in data_loader:
input_ids, token_type_ids = batch_data
            batch_prob = model.predict(input_ids=input_ids,  # per-batch similarity scores
                                       token_type_ids=token_type_ids).numpy()
all_probs.append(batch_prob)
    if len(all_probs) == 1:  # only a single batch was predicted
        all_probs = np.array(all_probs)
    else:
        all_probs = np.concatenate(all_probs, axis=0)  # merge all batches
return all_probs
@paddle.no_grad()
def evaluate_rank_auc(model, metric, data_loader, phase="dev"):
model.eval()
metric.reset()
for idx, batch in enumerate(data_loader):
input_ids, token_type_ids, labels = batch
        # Get the predicted probability of being a positive pair.
        pos_probs = model.predict(input_ids=input_ids,
                                  token_type_ids=token_type_ids)
        neg_probs = 1.0 - pos_probs  # probability of being a negative pair
        # Stack predictions as (neg_probs, pos_probs) columns for the AUC metric.
        preds = np.concatenate((neg_probs, pos_probs), axis=1)
metric.update(preds=preds, labels=labels)
auc=metric.accumulate()
print("eval_{} auc:{:.3}".format(phase,auc))
metric.reset()
model.train()
return auc
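
# Hedged usage sketch (assumption): the metric is expected to behave like paddle.metric.Auc,
# i.e. metric.update(preds, labels) where each preds row holds (negative, positive)
# probabilities, which matches how preds is built above.
def _example_rank_auc(model, dev_data_loader):
    metric = paddle.metric.Auc()
    return evaluate_rank_auc(model, metric, dev_data_loader, phase="dev")
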
@paddle.no_grad()
def evaluate_seq_classification(model, criterion, metric, data_loader):
    model.eval()    # switch to evaluation mode
    metric.reset()  # reset the metric
    losses = []     # per-batch losses
for batch in data_loader:
input_ids,token_type_ids,labels=batch
logits = model(input_ids,token_type_ids)
        loss = criterion(logits, labels)  # compute the loss
        losses.append(loss.item())
        correct = metric.compute(logits, labels)  # number of correct predictions
        metric.update(correct)                    # update the metric
    acc = metric.accumulate()                     # mean accuracy over the dataset
    logger.info("eval loss: %.5f, acc: %.5f" % (np.mean(losses), acc))
metric.reset()
model.train()
return np.mean(losses),acc
def predict_sims(model, data_loader):
cosine_sims = []
model.eval()
with paddle.no_grad():
for batch_data in data_loader:
query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch_data
batch_cosine_sim = model.cosine_sim( # [n]
query_input_ids=query_input_ids,
title_input_ids=title_input_ids,
query_token_type_ids=query_token_type_ids,
title_token_type_ids=title_token_type_ids).numpy()
cosine_sims.append(batch_cosine_sim)
cosine_sims = np.concatenate(cosine_sims, axis=0)
return cosine_sims
import json
import os

import pandas as pd
import paddle
from paddlenlp.datasets import MapDataset, load_dataset
from paddlenlp.prompt import InputExample
from paddlenlp.utils.log import logger
from tqdm import tqdm
def read_json_file(file):
with open(file, 'r', encoding='utf-8') as f:
for line in f.readlines():
line = line.strip()
            # Skip lines with empty content.
if not line:
continue
line = json.loads(line)
yield line
def read_text_pair(data_path, is_test=False):
with open(data_path, 'r', encoding='utf-8') as f:
for line in f:
data = line.rstrip().split("\t")
if not is_test:
if len(data) != 3:
continue
query1,query2,label=data
if not query1.strip() or not query2.strip() or not label.strip():
continue
yield {'query1': query1, 'query2': query2, 'label': label}
else:
if not data[0].strip() or not data[1].strip():
continue
yield {'query1': data[0], 'query2': data[1]}
def read_pair_data(data_path):
with open(data_path, 'r', encoding='utf-8') as f:
for line in f.readlines():
word, label= line.strip().split('\t')
if not word.strip() or not str(label).strip():
continue
yield {'text': word, 'label': label}
def get_label_dict(path):
with open(path, "r", encoding="utf-8") as f:
labels= [i.strip() for i in f.readlines()]
label2id=dict(zip(labels,range(len(labels))))
id2label=dict(zip(label2id.values(),label2id.keys()))
return label2id,id2label
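
# Illustrative example (the file content is an assumption): if label.txt contains the two
# lines "news" and "sports", then:
#   >>> label2id, id2label = get_label_dict("label.txt")
#   >>> label2id
#   {'news': 0, 'sports': 1}
#   >>> id2label
#   {0: 'news', 1: 'sports'}
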
def read_pair_by_pd(src_path):
df=pd.read_csv(src_path, sep='\t',header=None)
for index, row in df.iterrows():
query,title = row
yield {'query': str(query), 'title': str(title)}
# Some irregular data triples contain quotes, and there may be tab characters inside the
# quotes, so read those files with pandas instead of splitting lines on tabs manually.
def read_data_by_pd(src_path, is_predict=False):
data = pd.read_csv(src_path, sep='\t')
for index, row in tqdm(data.iterrows()):
query = row['query']
title = row['title']
neg_title = row['neg_title']
yield {'query': query, 'title': title, 'neg_title': neg_title}
def read_data_by_pd_test(src_path, is_predict=False):
data = pd.read_csv(src_path, sep='\t')
for index, row in tqdm(data.iterrows()):
query = row['query']
title = row['title']
label = row['label']
yield {'query': query, 'title': title, 'label': label}
def read_texts(data_path, is_test=False):
with open(data_path, 'r', encoding='utf-8') as f:
for line in f:
data = line.rstrip().split("\t")
if not is_test:
if len(data) != 3:
continue
if len(data[0].strip()) == 0 or len(data[1].strip()) == 0 or len(data[2].strip()) == 0:
continue
yield {'text_a': data[0], 'text_b': data[1], 'label': data[2]}
else:
if len(data) != 2:
continue
if len(data[0].strip()) == 0 or len(data[1].strip()) == 0:
continue
yield {'text_a': data[0], 'text_b': data[1]}
def read_simcse_text(data_path):
with open(data_path,encoding='utf-8') as f:
for line in f:
data=line.strip().split('\t')
if len(data) != 2:
continue
query,title=data
if not query.strip() or not title.strip():
continue
yield {"query": query.strip(), "title": title.strip()}
def gen_text_file(similar_text_pair_file):
text2similar_text = {}
texts = []
with open(similar_text_pair_file, 'r', encoding='utf-8') as f:
for line in f:
splited_line = line.rstrip().split("\t")
if len(splited_line) != 2:
continue
text, similar_text = splited_line
if len(text.strip())==0 or len(similar_text.strip())==0:
continue
text2similar_text[text.strip()] = similar_text.strip()
texts.append({"text": text.strip()})
return texts, text2similar_text
def read_text_label(data_path):
with open(data_path,encoding='utf-8') as f:
for line in f:
split_line=line.rstrip().split('\t')
if len(split_line) !=2:
continue
            text = split_line[0]
            if ' ' in text:
                # Replace spaces so the character list stays aligned with the label list.
                text = text.replace(' ', '_')
            text = list(text)
            label = split_line[1].split(' ')
assert len(text)==len(label), f'{text},{label}'
yield {"text": text, "label": label}
def load_dataset(datafiles):
def read(data_path):
with open(data_path, 'r', encoding='utf-8') as fp:
next(fp) # Skip header
for line in fp.readlines():
words, labels = line.strip('\n').split('\t')
words = words.split('\002')
labels = labels.split('\002')
yield words, labels
if isinstance(datafiles, str):
return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, (list, tuple)):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]
def gen_id2corpus(corpus_file):
id2corpus = {}
with open(corpus_file, 'r', encoding='utf-8') as f:
for idx, line in enumerate(f):
id2corpus[idx] = line.rstrip()
return id2corpus
def read_single_data(data_path):
with open(data_path, 'r', encoding='utf-8') as f:
for line in f.readlines():
word =line.rstrip()
if not word.strip():
continue
yield {'text': word}
def read_text_single(data_path):
with open(data_path, "r", encoding="utf-8") as f:
for line in f:
data = line.rstrip()
if not data:
continue
yield {"text_a": data, "text_b": data}