# 保险问答系统提取特征,插入milvus

import os
import sys

import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import paddlenlp
from paddle import inference
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import AutoTokenizer
from paddlenlp.utils.log import logger
from scipy.special import softmax
from tqdm import tqdm

class SimCSE(nn.Layer):
    """Sentence encoder trained with an in-batch contrastive loss plus R-Drop.

    The layer wraps a pretrained backbone, optionally projects its sentence
    vector to a smaller size, and L2-normalises the result so that dot
    products between embeddings are cosine similarities.

    Args:
        pretrained_model: Backbone invoked as
            ``ptm(input_ids, token_type_ids, position_ids, attention_mask)``
            and expected to return ``(sequence_output, pooled_output)``.
        dropout: Dropout probability applied to the sentence vector;
            ``None`` selects 0.1.
        margin: Value subtracted from the diagonal (positive pair) entries of
            the similarity matrix in :meth:`forward`.
        scale: Factor the similarity matrix is multiplied by before the
            cross-entropy loss.
        output_emb_size: Size of the projected embedding. ``None`` or ``0``
            disables the projection and keeps the backbone vector.
    """

    def __init__(self,
                 pretrained_model,
                 dropout=None,
                 margin=0.0,
                 scale=20,
                 output_emb_size=None):
        super().__init__()

        self.ptm = pretrained_model
        self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)

        # Normalise None to 0 ("no projection") so the ``> 0`` checks here and
        # in get_pooled_embedding cannot raise a TypeError.
        self.output_emb_size = output_emb_size if output_emb_size is not None else 0

        # Backbone vector width; hard-coded exactly as the projection layer
        # below has always assumed.
        hidden_size = 768

        if self.output_emb_size > 0:
            weight_attr = paddle.ParamAttr(
                initializer=paddle.nn.initializer.TruncatedNormal(std=0.02))
            self.emb_reduce_linear = paddle.nn.Linear(hidden_size,
                                                      self.output_emb_size,
                                                      weight_attr=weight_attr)
            emb_dim = self.output_emb_size
        else:
            # Without a projection the classifier consumes the backbone vector.
            emb_dim = hidden_size

        self.margin = margin
        self.scale = scale
        # Misspelled attribute name kept as an alias for existing callers.
        self.sacle = scale

        # Two-way classifier whose logits feed the R-Drop loss in forward().
        self.classifier = nn.Linear(emb_dim, 2)
        self.rdrop_loss = paddlenlp.losses.RDropLoss()

    # The static-graph export declares exactly two dynamic-shape int64 inputs
    # (input_ids and token_type_ids).
    @paddle.jit.to_static(input_spec=[
        paddle.static.InputSpec(shape=[None, None], dtype='int64'),
        paddle.static.InputSpec(shape=[None, None], dtype='int64')
    ])
    def get_pooled_embedding(self,
                             input_ids,
                             token_type_ids=None,
                             position_ids=None,
                             attention_mask=None,
                             with_pooler=True):
        """Return the L2-normalised sentence embedding for a batch.

        Args:
            input_ids: Token ids passed to the backbone.
            token_type_ids: Optional segment ids passed to the backbone.
            position_ids: Optional position ids passed to the backbone.
            attention_mask: Optional attention mask passed to the backbone.
            with_pooler: When true, use the backbone's pooled output;
                otherwise use the sequence output at position 0.

        Returns:
            The embedding after optional projection, dropout and L2
            normalisation along the last axis.
        """
        sequence_output, cls_embedding = self.ptm(input_ids, token_type_ids,
                                                  position_ids, attention_mask)

        if not with_pooler:
            cls_embedding = sequence_output[:, 0, :]

        if self.output_emb_size > 0:
            cls_embedding = self.emb_reduce_linear(cls_embedding)

        cls_embedding = self.dropout(cls_embedding)
        cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)

        return cls_embedding

    def get_semantic_embedding(self, data_loader):
        """Yield embeddings batch by batch in eval mode without gradients.

        Args:
            data_loader: Iterable yielding ``(input_ids, token_type_ids)``.

        Yields:
            The output of :meth:`get_pooled_embedding` for each batch.
        """
        self.eval()

        with paddle.no_grad():
            for batch_data in data_loader:
                input_ids, token_type_ids = batch_data
                text_embeddings = self.get_pooled_embedding(
                    input_ids, token_type_ids=token_type_ids)

                yield text_embeddings

    def cosine_sim(self,
                   query_input_ids,
                   title_input_ids,
                   query_token_type_ids=None,
                   query_position_ids=None,
                   query_attention_mask=None,
                   title_token_type_ids=None,
                   title_position_ids=None,
                   title_attention_mask=None,
                   with_pooler=True):
        """Return the pairwise cosine similarity of aligned query/title rows.

        Both sides are embedded with :meth:`get_pooled_embedding`; because
        those embeddings are L2-normalised, the row-wise sum of their
        element-wise product is the cosine similarity.
        """
        query_cls_embedding = self.get_pooled_embedding(query_input_ids,
                                                        query_token_type_ids,
                                                        query_position_ids,
                                                        query_attention_mask,
                                                        with_pooler=with_pooler)

        title_cls_embedding = self.get_pooled_embedding(title_input_ids,
                                                        title_token_type_ids,
                                                        title_position_ids,
                                                        title_attention_mask,
                                                        with_pooler=with_pooler)

        cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding,
                                axis=-1)
        return cosine_sim

    def forward(self,
                query_input_ids,
                title_input_ids,
                query_token_type_ids=None,
                query_position_ids=None,
                query_attention_mask=None,
                title_token_type_ids=None,
                title_position_ids=None,
                title_attention_mask=None):
        """Compute the contrastive loss and the R-Drop loss for a batch.

        Returns:
            tuple: ``(loss, kl_loss)`` where ``loss`` is the cross-entropy
            over the scaled query-by-title similarity matrix with the
            diagonal as targets, and ``kl_loss`` is the R-Drop loss between
            the classifier logits of the two sides.
        """
        query_cls_embedding = self.get_pooled_embedding(query_input_ids,
                                                        query_token_type_ids,
                                                        query_position_ids,
                                                        query_attention_mask)

        title_cls_embedding = self.get_pooled_embedding(title_input_ids,
                                                        title_token_type_ids,
                                                        title_position_ids,
                                                        title_attention_mask)

        logits1 = self.classifier(query_cls_embedding)
        logits2 = self.classifier(title_cls_embedding)
        kl_loss = self.rdrop_loss(logits1, logits2)

        # Full query-by-title similarity matrix.
        cosine_sim = paddle.matmul(query_cls_embedding,
                                   title_cls_embedding,
                                   transpose_y=True)

        # Subtract the margin from the diagonal (the matching pairs) only.
        margin_diag = paddle.full(shape=[query_cls_embedding.shape[0]],
                                  fill_value=self.margin,
                                  dtype=paddle.get_default_dtype())
        cosine_sim = cosine_sim - paddle.diag(margin_diag)

        cosine_sim *= self.scale

        # Row i's target is column i.
        labels = paddle.arange(0, query_cls_embedding.shape[0], dtype='int64')
        labels = paddle.reshape(labels, shape=[-1, 1])

        loss = F.cross_entropy(input=cosine_sim, label=labels)

        return loss, kl_loss

def convert_example(example,
                    tokenizer,
                    max_seq_length=512,
                    pad_to_max_seq_len=False):
    """Tokenize every text in ``example`` and flatten the encodings.

    Args:
        example: Mapping whose values are the texts to encode; the keys are
            not used.
        tokenizer: Callable invoked as ``tokenizer(text=..., max_seq_len=...,
            pad_to_max_seq_len=...)`` that returns a mapping containing
            ``"input_ids"`` and ``"token_type_ids"``.
        max_seq_length: Forwarded to the tokenizer as ``max_seq_len``.
        pad_to_max_seq_len: Forwarded to the tokenizer unchanged.

    Returns:
        list: ``[input_ids, token_type_ids, ...]`` with one pair per text,
        in the mapping's iteration order. Empty when ``example`` is empty.
    """
    result = []

    for text in example.values():
        encoded_inputs = tokenizer(text=text,
                                   max_seq_len=max_seq_length,
                                   pad_to_max_seq_len=pad_to_max_seq_len)
        result.extend(
            (encoded_inputs["input_ids"], encoded_inputs["token_type_ids"]))

    return result

# Run configuration for the embedding-extraction step.

# Input corpus, read line by line.
corpus_file = './datasets/bxqa/corpus.csv'

# Directory holding the exported inference model; note the trailing slash.
model_dir = './output/bxqa/'

# Device name handed to Predictor.
device = 'gpu'

# Sequence length and batch size handed to Predictor.
max_seq_length = 64
batch_size = 32

class Predictor(object):
    """Run the exported ``get_pooled_embedding`` model with Paddle Inference.

    Args:
        model_dir: Directory containing
            ``inference.get_pooled_embedding.pdmodel`` and ``.pdiparams``.
        device: ``"gpu"``, ``"cpu"`` or ``"xpu"``.
        max_seq_length: Sequence length passed to ``convert_example``.
        batch_size: Number of texts per inference call; also used as the
            TensorRT maximum batch size.
        use_tensorrt: Enable the TensorRT engine (GPU only).
        precision: ``"fp32"``, ``"fp16"`` or ``"int8"``; looked up on GPU.
        cpu_threads: Math-library thread count (CPU only).
        enable_mkldnn: Enable MKL-DNN (CPU only).

    Raises:
        ValueError: If the model or parameter file does not exist.
        KeyError: If ``device == "gpu"`` and ``precision`` is not one of the
            supported names.
    """

    def __init__(self,
                 model_dir,
                 device="gpu",
                 max_seq_length=128,
                 batch_size=32,
                 use_tensorrt=False,
                 precision="fp32",
                 cpu_threads=10,
                 enable_mkldnn=False):
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size

        # os.path.join works whether or not model_dir ends with a separator.
        model_file = os.path.join(model_dir,
                                  "inference.get_pooled_embedding.pdmodel")
        params_file = os.path.join(model_dir,
                                   "inference.get_pooled_embedding.pdiparams")

        if not os.path.exists(model_file):
            raise ValueError("not find model file path {}".format(model_file))
        if not os.path.exists(params_file):
            raise ValueError("not find params file path {}".format(params_file))

        config = paddle.inference.Config(model_file, params_file)

        if device == "gpu":
            # Arguments are (initial memory pool size in MB, device id) per
            # the Paddle Inference Config documentation.
            config.enable_use_gpu(100, 0)

            precision_map = {
                "fp16": inference.PrecisionType.Half,
                "fp32": inference.PrecisionType.Float32,
                "int8": inference.PrecisionType.Int8
            }
            precision_mode = precision_map[precision]

            if use_tensorrt:
                config.enable_tensorrt_engine(max_batch_size=batch_size,
                                              min_subgraph_size=30,
                                              precision_mode=precision_mode)
        elif device == "cpu":
            config.disable_gpu()
            # Use the constructor arguments; there is no global ``args``
            # object in this script.
            if enable_mkldnn:
                # cache 10 different shapes for mkldnn to avoid memory leak
                config.set_mkldnn_cache_capacity(10)
                config.enable_mkldnn()
            config.set_cpu_math_library_num_threads(cpu_threads)
        elif device == "xpu":
            config.enable_xpu(100)

        config.switch_use_feed_fetch_ops(False)
        self.predictor = paddle.inference.create_predictor(config)

        self.input_handles = [
            self.predictor.get_input_handle(name)
            for name in self.predictor.get_input_names()
        ]
        # Only the first model output is read.
        self.output_handle = self.predictor.get_output_handle(
            self.predictor.get_output_names()[0])

    def _embed_batch(self, examples, batchify_fn):
        """Collate one batch, run the predictor and return its output array."""
        input_ids, segment_ids = batchify_fn(examples)
        self.input_handles[0].copy_from_cpu(input_ids)
        self.input_handles[1].copy_from_cpu(segment_ids)
        self.predictor.run()
        return self.output_handle.copy_to_cpu()

    def predict(self, data, tokenizer):
        """Embed every item of ``data`` and save the result to disk.

        Args:
            data: Iterable of examples accepted by ``convert_example``; each
                example must produce exactly one
                ``(input_ids, token_type_ids)`` pair.
            tokenizer: Tokenizer passed to ``convert_example``; its
                ``pad_token_id`` is used as the padding value.

        Raises:
            ValueError: If ``data`` yields no examples.

        Side effects:
            Writes the concatenated embeddings with
            ``np.save('bxqa_corpus_embedding', ...)``.
        """
        # Collate function: pads input ids and segment ids along axis 0.
        # NOTE(review): both fields pad with tokenizer.pad_token_id; samples
        # are tokenized with pad_to_max_seq_len=True below, so presumably no
        # further padding happens here - confirm.
        batchify_fn = Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # segment
        )

        all_embeddings = []  # one output array per batch
        examples = []  # samples collected for the current batch

        for text in tqdm(data):
            input_ids, segment_ids = convert_example(
                text,
                tokenizer,
                max_seq_length=self.max_seq_length,
                pad_to_max_seq_len=True)
            examples.append((input_ids, segment_ids))

            # Run inference as soon as a full batch has been collected.
            if len(examples) >= self.batch_size:
                all_embeddings.append(self._embed_batch(examples, batchify_fn))
                examples = []

        # Flush the final, possibly smaller, batch.
        if examples:
            all_embeddings.append(self._embed_batch(examples, batchify_fn))

        if not all_embeddings:
            # np.concatenate would raise an opaque ValueError on an empty list.
            raise ValueError("predict() received no data to embed")

        all_embeddings = np.concatenate(all_embeddings, axis=0)
        np.save('bxqa_corpus_embedding', all_embeddings)

def read_text(file_path):
    """Read a text file into a ``{line_index: stripped_line}`` dict.

    Args:
        file_path: Path of the corpus file, one entry per line.

    Returns:
        dict: Zero-based line number mapped to the line with leading and
        trailing whitespace removed. Blank lines are kept as empty strings;
        an empty file yields an empty dict.
    """
    # The context manager closes the handle even if reading fails.
    # NOTE(review): UTF-8 is assumed for the corpus - confirm against the
    # actual data file.
    with open(file_path, encoding="utf-8") as fin:
        return {idx: line.strip() for idx, line in enumerate(fin)}

# --- Step 1: embed the corpus and write the vectors to disk ---------------

# Inference wrapper around the exported model.
predictor = Predictor(model_dir=model_dir,
                      device=device,
                      max_seq_length=max_seq_length,
                      batch_size=batch_size)

tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh')

# Line index -> text, then one single-entry dict per line, which is the
# example shape convert_example iterates over.
id2corpus = read_text(corpus_file)
corpus_list = [{line_no: id2corpus[line_no]} for line_no in id2corpus]

# Saves the embeddings to 'bxqa_corpus_embedding.npy' as a side effect.
predictor.predict(corpus_list, tokenizer)

from pymilvus import *

import pymilvus

# --- Milvus connection and collection settings ----------------------------

MILVUS_HOST = '127.0.0.1'
MILVUS_PORT = 19530

# Dimension of the stored vectors.
# NOTE(review): must equal the embedding size the exported model produces -
# confirm against the training configuration.
data_dim = 256
top_k = 10

collection_name = 'bxqa'
partition_tag = 'partition_1'
embedding_name = 'embeddings'

# Index built on the vector field.
index_config = dict(
    index_type="IVF_FLAT",
    metric_type="L2",
    params=dict(nlist=500),
)

# Parameters for searching that index.
search_params = dict(
    metric_type="L2",
    params=dict(nprobe=20),
)

# Banner template for progress messages.
fmt = "\n=== {:30} ===\n"

# Maximum length declared for the VARCHAR "text" field.
text_max_len = 1000

# Collection layout: an explicit integer primary key, the raw text, and the
# embedding vector.
fields = [
    # Primary key supplied by the caller (auto_id is off).
    FieldSchema(
        name="pk",
        dtype=DataType.INT64,
        is_primary=True,
        auto_id=False,
        max_length=100,
    ),
    # Source text, limited to text_max_len.
    FieldSchema(
        name="text",
        dtype=DataType.VARCHAR,
        max_length=text_max_len,
    ),
    # Dense vector of size data_dim.
    FieldSchema(
        name="embeddings",
        dtype=DataType.FLOAT_VECTOR,
        dim=data_dim,
    ),
]

schema = CollectionSchema(fields=fields, description="bxqa Index")

class VecToMilvus():
    """Small best-effort helper for writing vectors into a Milvus collection.

    Every method catches exceptions and prints them instead of raising, so a
    failure is reported on stdout and the caller keeps running.
    """

    def __init__(self):
        """Connect to Milvus at ``MILVUS_HOST:MILVUS_PORT``."""
        print(fmt.format("start connecting to Milvus"))
        connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)
        # Set by creat_collection() or insert().
        self.collection = None

    def has_collection(self, collection_name):
        """Return whether ``collection_name`` exists; ``False`` on error."""
        try:
            has = utility.has_collection(collection_name)
            print(f"Does collection {collection_name} exist in Milvus: {has}")
            return has
        except Exception as e:
            print("Milvus has_table error:", e)
            # Explicit falsy result instead of an implicit None.
            return False

    def creat_collection(self, collection_name):
        """Create ``collection_name`` with the module-level ``schema``."""
        try:
            print(fmt.format("Create collection {}".format(collection_name)))
            self.collection = Collection(collection_name,
                                         schema,
                                         consistency_level="Strong")
        except Exception as e:
            print("Milvus create collection error:", e)

    def drop_collection(self, collection_name):
        """Drop ``collection_name``."""
        try:
            utility.drop_collection(collection_name)
        except Exception as e:
            print("Milvus delete collection error:", e)

    def create_index(self, index_name):
        """Build the index described by ``index_config`` and load the collection.

        Args:
            index_name: Forwarded as the first positional argument of
                ``Collection.create_index``; callers in this file pass the
                vector field's name.
        """
        try:
            print(fmt.format("Start Creating index"))
            self.collection.create_index(index_name, index_config)
            print(fmt.format("Start loading"))
            self.collection.load()
        except Exception as e:
            print("Milvus create index error:", e)

    def has_partition(self, partition_tag):
        """Return whether the partition exists; ``False`` on error."""
        try:
            result = self.collection.has_partition(partition_tag)
            return result
        except Exception as e:
            print("Milvus has partition error: ", e)
            # Explicit falsy result instead of an implicit None.
            return False

    def create_partition(self, partition_tag):
        """Create the partition ``partition_tag`` in the current collection."""
        try:
            self.collection.create_partition(partition_tag)
            print('create partition {} successfully'.format(partition_tag))
        except Exception as e:
            print('Milvus create partition error: ', e)

    def insert(self, entities, collection_name, index_name, partition_tag=None):
        """Insert ``entities``, creating collection, index and partition as needed.

        Args:
            entities: Column-wise data in schema order, passed to
                ``Collection.insert``.
            collection_name: Target collection.
            index_name: Passed to :meth:`create_index` when the collection
                has to be created.
            partition_tag: Optional partition to insert into; created when
                it does not exist.
        """
        try:
            if not self.has_collection(collection_name):
                # Create the collection (and its index) on first use.
                self.creat_collection(collection_name)
                self.create_index(index_name)
            else:
                self.collection = Collection(collection_name)

            # Create the partition on first use.
            if (partition_tag
                    is not None) and (not self.has_partition(partition_tag)):
                self.create_partition(partition_tag)

            self.collection.insert(entities, partition_name=partition_tag)
            print(
                f"Number of entities in Milvus: {self.collection.num_entities}"
            )  # check the num_entites
        except Exception as e:
            print("Milvus insert error:", e)

# --- Step 2: load the saved embeddings and insert them into Milvus --------


def _truncate_utf8(text, max_bytes):
    """Return ``text`` cut so that its UTF-8 encoding is at most ``max_bytes`` bytes."""
    encoded = text.encode('utf-8')
    if len(encoded) <= max_bytes:
        return text
    # errors='ignore' drops a multi-byte character split by the cut.
    return encoded[:max_bytes].decode('utf-8', errors='ignore')


embeddings = np.load('bxqa_corpus_embedding.npy')
embedding_ids = list(range(embeddings.shape[0]))

client = VecToMilvus()

# NOTE(review): this replaces the 'bxqa' collection name configured earlier
# in the file - confirm which collection name is intended.
collection_name = 'faq_finance'
client.has_collection(collection_name)

partition_tag = 'partition_1'
data_size = len(embedding_ids)

# Unwrap the single-entry {index: text} dicts back into plain strings.
corpus_list = [next(iter(item.values())) for item in corpus_list]

batch_size = 10000

for i in tqdm(range(0, data_size, batch_size)):
    cur_end = min(i + batch_size, data_size)
    batch_emb = embeddings[i:cur_end]

    entities = [
        # field pk
        list(range(i, cur_end)),
        # field text: Milvus limits VARCHAR fields by byte length, so cut on
        # UTF-8 bytes rather than characters to keep multi-byte text within
        # the declared max_length.
        [
            _truncate_utf8(corpus_list[j], text_max_len - 1)
            for j in range(i, cur_end)
        ],
        batch_emb  # field embeddings, supports numpy.ndarray and list
    ]

    client.insert(collection_name=collection_name,
                  entities=entities,
                  index_name=embedding_name,
                  partition_tag=partition_tag)

相关推荐
红色的山茶花20 分钟前
YOLOv9-0.1部分代码阅读笔记-loss_tal.py
笔记·深度学习·yolo
一位小说男主1 小时前
编码器与解码器:从‘乱码’到‘通话’
人工智能·深度学习
qq_529025292 小时前
Torch.gather
python·深度学习·机器学习
凯哥是个大帅比2 小时前
人工智能ACA(五)--深度学习基础
人工智能·深度学习
m0_748232923 小时前
DALL-M:基于大语言模型的上下文感知临床数据增强方法 ,补充
人工智能·语言模型·自然语言处理
海棠AI实验室3 小时前
AI的进阶之路:从机器学习到深度学习的演变(三)
人工智能·深度学习·机器学习
AIGC大时代3 小时前
如何使用ChatGPT辅助文献综述,以及如何进行优化?一篇说清楚
人工智能·深度学习·chatgpt·prompt·aigc
人机与认知实验室5 小时前
人、机、环境中各有其神经网络系统
人工智能·深度学习·神经网络·机器学习