保险问答系统提前特征,插入milvus

import os

import sys

from tqdm import tqdm

import numpy as np

from scipy.special import softmax

import paddle

from paddle import inference

from paddlenlp.transformers import AutoTokenizer

from paddlenlp.data import Stack, Tuple, Pad

from paddlenlp.utils.log import logger

import paddle.nn as nn

class SimCSE(nn.Layer):

def init(self,

pretrained_model,

dropout=None,

margin=0.0,

scale=20,

output_emb_size=None):

super().init()

self.ptm = pretrained_model

self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)

self.output_emb_size = output_emb_size

if output_emb_size > 0:

weight_attr = paddle.ParamAttr(

initializer=paddle.nn.initializer.TruncatedNormal(std=0.02))

self.emb_reduce_linear = paddle.nn.Linear(768,

output_emb_size,

weight_attr=weight_attr)

self.margin = margin

self.sacle = scale

self.classifier = nn.Linear(output_emb_size, 2)

self.rdrop_loss = paddlenlp.losses.RDropLoss()

@paddle.jit.to_static(input_spec=[

paddle.static.InputSpec(shape=[None, None], dtype='int64'),

paddle.static.InputSpec(shape=[None, None], dtype='int64')

])

def get_pooled_embedding(self,

input_ids,

token_type_ids=None,

position_ids=None,

attention_mask=None,

with_pooler=True):

sequence_output, cls_embedding = self.ptm(input_ids, token_type_ids,

position_ids, attention_mask)

if with_pooler == False:

cls_embedding = sequence_output[:, 0, :]

if self.output_emb_size > 0:

cls_embedding = self.emb_reduce_linear(cls_embedding)

cls_embedding = self.dropout(cls_embedding)

cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)

return cls_embedding

def get_semantic_embedding(self, data_loader):

self.eval()

with paddle.no_grad():

for batch_data in data_loader:

input_ids, token_type_ids = batch_data

text_embeddings = self.get_pooled_embedding(

input_ids, token_type_ids=token_type_ids)

yield text_embeddings

def cosine_sim(self,

query_input_ids,

title_input_ids,

query_token_type_ids=None,

query_position_ids=None,

query_attention_mask=None,

title_token_type_ids=None,

title_position_ids=None,

title_attention_mask=None,

with_pooler=True):

query_cls_embedding = self.get_pooled_embedding(query_input_ids,

query_token_type_ids,

query_position_ids,

query_attention_mask,

with_pooler=with_pooler)

title_cls_embedding = self.get_pooled_embedding(title_input_ids,

title_token_type_ids,

title_position_ids,

title_attention_mask,

with_pooler=with_pooler)

cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding,

axis=-1)

return cosine_sim

def forward(self,

query_input_ids,

title_input_ids,

query_token_type_ids=None,

query_position_ids=None,

query_attention_mask=None,

title_token_type_ids=None,

title_position_ids=None,

title_attention_mask=None):

query_cls_embedding = self.get_pooled_embedding(query_input_ids,

query_token_type_ids,

query_position_ids,

query_attention_mask)

title_cls_embedding = self.get_pooled_embedding(title_input_ids,

title_token_type_ids,

title_position_ids,

title_attention_mask)

logits1 = self.classifier(query_cls_embedding)

logits2 = self.classifier(title_cls_embedding)

kl_loss = self.rdrop_loss(logits1, logits2)

cosine_sim = paddle.matmul(query_cls_embedding,

title_cls_embedding,

transpose_y=True)

margin_diag = paddle.full(shape=[query_cls_embedding.shape[0]],

fill_value=self.margin,

dtype=paddle.get_default_dtype())

cosine_sim = cosine_sim - paddle.diag(margin_diag)

cosine_sim *= self.sacle

labels = paddle.arange(0, query_cls_embedding.shape[0], dtype='int64')

labels = paddle.reshape(labels, shape=[-1, 1])

loss = F.cross_entropy(input=cosine_sim, label=labels)

return loss, kl_loss

def convert_example(example,

tokenizer,

max_seq_length=512,

pad_to_max_seq_len=False):

result = []

for key, text in example.items():

encoded_inputs = tokenizer(text=text,

max_seq_len=max_seq_length,

pad_to_max_seq_len=pad_to_max_seq_len)

input_ids = encoded_inputs["input_ids"]

token_type_ids = encoded_inputs["token_type_ids"]

result += [input_ids, token_type_ids]

return result

model_dir='./output/bxqa/'

corpus_file='./datasets/bxqa/corpus.csv'

max_seq_length=64

batch_size=32

device='gpu'

class Predictor(object):

def init(self,

model_dir,

device="gpu",

max_seq_length=128,

batch_size=32,

use_tensorrt=False,

precision="fp32",

cpu_threads=10,

enable_mkldnn=False):

self.max_seq_length = max_seq_length

self.batch_size = batch_size

model_file = model_dir + "inference.get_pooled_embedding.pdmodel"

params_file = model_dir + "inference.get_pooled_embedding.pdiparams"

if not os.path.exists(model_file):

raise ValueError("not find model file path {}".format(model_file))

if not os.path.exists(params_file):

raise ValueError("not find params file path {}".format(params_file))

config = paddle.inference.Config(model_file, params_file)

if device == "gpu":

config.enable_use_gpu(100, 0)

precision_map = {

"fp16": inference.PrecisionType.Half,

"fp32": inference.PrecisionType.Float32,

"int8": inference.PrecisionType.Int8

}

precision_mode = precision_map[precision]

if use_tensorrt:

config.enable_tensorrt_engine(max_batch_size=batch_size,

min_subgraph_size=30,

precision_mode=precision_mode)

elif device == "cpu":

config.disable_gpu()

if args.enable_mkldnn:

cache 10 different shapes for mkldnn to avoid memory leak

config.set_mkldnn_cache_capacity(10)

config.enable_mkldnn()

config.set_cpu_math_library_num_threads(args.cpu_threads)

elif device == "xpu":

config.enable_xpu(100)

config.switch_use_feed_fetch_ops(False)

self.predictor = paddle.inference.create_predictor(config)

self.input_handles = [

self.predictor.get_input_handle(name)

for name in self.predictor.get_input_names()

]

self.output_handle = self.predictor.get_output_handle(

self.predictor.get_output_names()[0])

def predict(self, data, tokenizer):

batchify_fn = lambda samples, fn=Tuple(

Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'), # input

Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'

), # segment

): fn(samples)

all_embeddings = []#存放所有数据

examples = []#用来临时存放一个批次数据

for idx, text in enumerate(tqdm(data)):

input_ids, segment_ids = convert_example(

text,

tokenizer,

max_seq_length=self.max_seq_length,

pad_to_max_seq_len=True)

examples.append((input_ids, segment_ids))

if (len(examples) >= self.batch_size):#够一个批次就做下面的事

input_ids, segment_ids = batchify_fn(examples)

self.input_handles[0].copy_from_cpu(input_ids)

self.input_handles[1].copy_from_cpu(segment_ids)

self.predictor.run()

logits = self.output_handle.copy_to_cpu()

all_embeddings.append(logits)

examples = []

if (len(examples) > 0):#处理最后一个批次数据

input_ids, segment_ids = batchify_fn(examples)

self.input_handles[0].copy_from_cpu(input_ids)

self.input_handles[1].copy_from_cpu(segment_ids)

self.predictor.run()

logits = self.output_handle.copy_to_cpu()

all_embeddings.append(logits)

all_embeddings = np.concatenate(all_embeddings, axis=0)

np.save('bxqa_corpus_embedding', all_embeddings)

def read_text(file_path):

file = open(file_path)

id2corpus = {}

for idx, data in enumerate(file.readlines()):

id2corpus[idx] = data.strip()

return id2corpus

predictor = Predictor(model_dir, device, max_seq_length,

batch_size)

tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh')

id2corpus = read_text(corpus_file)

corpus_list = [{idx: text} for idx, text in id2corpus.items()]

predictor.predict(corpus_list, tokenizer)

from pymilvus import *

import pymilvus

MILVUS_HOST = '127.0.0.1'

MILVUS_PORT =19530

data_dim = 256

top_k = 10

collection_name = 'bxqa'

partition_tag = 'partition_1'

embedding_name = 'embeddings'

index_config = {

"index_type": "IVF_FLAT",

"metric_type": "L2",

"params": {

"nlist": 500

},

}

search_params = {

"metric_type": "L2",

"params": {

"nprobe": 20

},

}

fmt = "\n=== {:30} ===\n"

text_max_len = 1000

fields = [

FieldSchema(name="pk",

dtype=DataType.INT64,

is_primary=True,

auto_id=False,

max_length=100),

FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=text_max_len),

FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=data_dim)

]

schema = CollectionSchema(fields, "bxqa Index")

class VecToMilvus():

def init(self):

print(fmt.format("start connecting to Milvus"))

connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)

self.collection = None

def has_collection(self, collection_name):

try:

has = utility.has_collection(collection_name)

print(f"Does collection {collection_name} exist in Milvus: {has}")

return has

except Exception as e:

print("Milvus has_table error:", e)

def creat_collection(self, collection_name):

try:

print(fmt.format("Create collection {}".format(collection_name)))

self.collection = Collection(collection_name,

schema,

consistency_level="Strong")

except Exception as e:

print("Milvus create collection error:", e)

def drop_collection(self, collection_name):

try:

utility.drop_collection(collection_name)

except Exception as e:

print("Milvus delete collection error:", e)

def create_index(self, index_name):

try:

print(fmt.format("Start Creating index"))

self.collection.create_index(index_name, index_config)

print(fmt.format("Start loading"))

self.collection.load()

except Exception as e:

print("Milvus create index error:", e)

def has_partition(self, partition_tag):

try:

result = self.collection.has_partition(partition_tag)

return result

except Exception as e:

print("Milvus has partition error: ", e)

def create_partition(self, partition_tag):

try:

self.collection.create_partition(partition_tag)

print('create partition {} successfully'.format(partition_tag))

except Exception as e:

print('Milvus create partition error: ', e)

def insert(self, entities, collection_name, index_name, partition_tag=None):

try:

if not self.has_collection(collection_name):#没有集合就创建

self.creat_collection(collection_name)

self.create_index(index_name)

else:

self.collection = Collection(collection_name)

if (partition_tag #没有分区就创建

is not None) and (not self.has_partition(partition_tag)):

self.create_partition(partition_tag)

self.collection.insert(entities, partition_name=partition_tag)

print(

f"Number of entities in Milvus: {self.collection.num_entities}"

) # check the num_entites

except Exception as e:

print("Milvus insert error:", e)

embeddings = np.load('bxqa_corpus_embedding.npy')

embedding_ids = [i for i in range(embeddings.shape[0])]

client = VecToMilvus()

collection_name = 'faq_finance'

client.has_collection(collection_name)

partition_tag = 'partition_1'

data_size = len(embedding_ids)

corpus_list=[list(i.values())[0] for i in corpus_list]

batch_size = 10000

for i in tqdm(range(0, data_size, batch_size)):

cur_end = i + batch_size

if (cur_end > data_size):

cur_end = data_size

batch_emb = embeddings[np.arange(i, cur_end)]

entities = [

j for j in range(i, cur_end, 1)\], \[corpus_list\[j\]\[:text_max_len - 1\] for j in range(i, cur_end, 1)\], batch_emb # field embeddings, supports numpy.ndarray and list

client.insert(collection_name=collection_name,

entities=entities,

index_name=embedding_name,

partition_tag=partition_tag)

相关推荐
hjs_deeplearning5 分钟前
论文写作篇#8:双栏的格式里怎么插入横跨两栏的图片和表格
人工智能·深度学习·学习·yolo·机器学习·论文写作·论文排版
Helios@28 分钟前
CNN 中感受野/权值共享是什么意思?
人工智能·深度学习·计算机视觉
冰蓝蓝1 小时前
TensorBoard
人工智能·深度学习
视觉AI1 小时前
研究下适合部署在jeston上的深度学习类单目标跟踪算法
深度学习·算法·目标跟踪
誉鏐1 小时前
RNN模型与NLP应用——(8/9)Attention(注意力机制)
人工智能·rnn·自然语言处理
AndrewHZ1 小时前
【图像处理基石】什么是AWB?
图像处理·深度学习·isp算法·awb·ai awb·isp芯片
Listennnn12 小时前
神经网络能不能完全拟合y=x² ???
人工智能·深度学习·神经网络
WhyNot?13 小时前
深度学习入门(三):神经网络的学习
深度学习·神经网络·学习
odoo中国14 小时前
深度学习 Deep Learning 第16章 结构化概率模型
人工智能·深度学习·结构化模型
摸鱼仙人~14 小时前
为什么有的深度学习训练,有训练集、验证集、测试集3个划分,有的只是划分训练集和测试集?
人工智能·深度学习