保险问答系统提前特征,插入milvus

import os

import sys

from tqdm import tqdm

import numpy as np

from scipy.special import softmax

import paddle

from paddle import inference

from paddlenlp.transformers import AutoTokenizer

from paddlenlp.data import Stack, Tuple, Pad

from paddlenlp.utils.log import logger

import paddle.nn as nn

class SimCSE(nn.Layer):

def init(self,

pretrained_model,

dropout=None,

margin=0.0,

scale=20,

output_emb_size=None):

super().init()

self.ptm = pretrained_model

self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)

self.output_emb_size = output_emb_size

if output_emb_size > 0:

weight_attr = paddle.ParamAttr(

initializer=paddle.nn.initializer.TruncatedNormal(std=0.02))

self.emb_reduce_linear = paddle.nn.Linear(768,

output_emb_size,

weight_attr=weight_attr)

self.margin = margin

self.sacle = scale

self.classifier = nn.Linear(output_emb_size, 2)

self.rdrop_loss = paddlenlp.losses.RDropLoss()

@paddle.jit.to_static(input_spec=[

paddle.static.InputSpec(shape=[None, None], dtype='int64'),

paddle.static.InputSpec(shape=[None, None], dtype='int64')

])

def get_pooled_embedding(self,

input_ids,

token_type_ids=None,

position_ids=None,

attention_mask=None,

with_pooler=True):

sequence_output, cls_embedding = self.ptm(input_ids, token_type_ids,

position_ids, attention_mask)

if with_pooler == False:

cls_embedding = sequence_output[:, 0, :]

if self.output_emb_size > 0:

cls_embedding = self.emb_reduce_linear(cls_embedding)

cls_embedding = self.dropout(cls_embedding)

cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)

return cls_embedding

def get_semantic_embedding(self, data_loader):

self.eval()

with paddle.no_grad():

for batch_data in data_loader:

input_ids, token_type_ids = batch_data

text_embeddings = self.get_pooled_embedding(

input_ids, token_type_ids=token_type_ids)

yield text_embeddings

def cosine_sim(self,

query_input_ids,

title_input_ids,

query_token_type_ids=None,

query_position_ids=None,

query_attention_mask=None,

title_token_type_ids=None,

title_position_ids=None,

title_attention_mask=None,

with_pooler=True):

query_cls_embedding = self.get_pooled_embedding(query_input_ids,

query_token_type_ids,

query_position_ids,

query_attention_mask,

with_pooler=with_pooler)

title_cls_embedding = self.get_pooled_embedding(title_input_ids,

title_token_type_ids,

title_position_ids,

title_attention_mask,

with_pooler=with_pooler)

cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding,

axis=-1)

return cosine_sim

def forward(self,

query_input_ids,

title_input_ids,

query_token_type_ids=None,

query_position_ids=None,

query_attention_mask=None,

title_token_type_ids=None,

title_position_ids=None,

title_attention_mask=None):

query_cls_embedding = self.get_pooled_embedding(query_input_ids,

query_token_type_ids,

query_position_ids,

query_attention_mask)

title_cls_embedding = self.get_pooled_embedding(title_input_ids,

title_token_type_ids,

title_position_ids,

title_attention_mask)

logits1 = self.classifier(query_cls_embedding)

logits2 = self.classifier(title_cls_embedding)

kl_loss = self.rdrop_loss(logits1, logits2)

cosine_sim = paddle.matmul(query_cls_embedding,

title_cls_embedding,

transpose_y=True)

margin_diag = paddle.full(shape=[query_cls_embedding.shape[0]],

fill_value=self.margin,

dtype=paddle.get_default_dtype())

cosine_sim = cosine_sim - paddle.diag(margin_diag)

cosine_sim *= self.sacle

labels = paddle.arange(0, query_cls_embedding.shape[0], dtype='int64')

labels = paddle.reshape(labels, shape=[-1, 1])

loss = F.cross_entropy(input=cosine_sim, label=labels)

return loss, kl_loss

def convert_example(example,

tokenizer,

max_seq_length=512,

pad_to_max_seq_len=False):

result = []

for key, text in example.items():

encoded_inputs = tokenizer(text=text,

max_seq_len=max_seq_length,

pad_to_max_seq_len=pad_to_max_seq_len)

input_ids = encoded_inputs["input_ids"]

token_type_ids = encoded_inputs["token_type_ids"]

result += [input_ids, token_type_ids]

return result

model_dir='./output/bxqa/'

corpus_file='./datasets/bxqa/corpus.csv'

max_seq_length=64

batch_size=32

device='gpu'

class Predictor(object):

def init(self,

model_dir,

device="gpu",

max_seq_length=128,

batch_size=32,

use_tensorrt=False,

precision="fp32",

cpu_threads=10,

enable_mkldnn=False):

self.max_seq_length = max_seq_length

self.batch_size = batch_size

model_file = model_dir + "inference.get_pooled_embedding.pdmodel"

params_file = model_dir + "inference.get_pooled_embedding.pdiparams"

if not os.path.exists(model_file):

raise ValueError("not find model file path {}".format(model_file))

if not os.path.exists(params_file):

raise ValueError("not find params file path {}".format(params_file))

config = paddle.inference.Config(model_file, params_file)

if device == "gpu":

config.enable_use_gpu(100, 0)

precision_map = {

"fp16": inference.PrecisionType.Half,

"fp32": inference.PrecisionType.Float32,

"int8": inference.PrecisionType.Int8

}

precision_mode = precision_map[precision]

if use_tensorrt:

config.enable_tensorrt_engine(max_batch_size=batch_size,

min_subgraph_size=30,

precision_mode=precision_mode)

elif device == "cpu":

config.disable_gpu()

if args.enable_mkldnn:

cache 10 different shapes for mkldnn to avoid memory leak

config.set_mkldnn_cache_capacity(10)

config.enable_mkldnn()

config.set_cpu_math_library_num_threads(args.cpu_threads)

elif device == "xpu":

config.enable_xpu(100)

config.switch_use_feed_fetch_ops(False)

self.predictor = paddle.inference.create_predictor(config)

self.input_handles = [

self.predictor.get_input_handle(name)

for name in self.predictor.get_input_names()

]

self.output_handle = self.predictor.get_output_handle(

self.predictor.get_output_names()[0])

def predict(self, data, tokenizer):

batchify_fn = lambda samples, fn=Tuple(

Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'), # input

Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'

), # segment

): fn(samples)

all_embeddings = []#存放所有数据

examples = []#用来临时存放一个批次数据

for idx, text in enumerate(tqdm(data)):

input_ids, segment_ids = convert_example(

text,

tokenizer,

max_seq_length=self.max_seq_length,

pad_to_max_seq_len=True)

examples.append((input_ids, segment_ids))

if (len(examples) >= self.batch_size):#够一个批次就做下面的事

input_ids, segment_ids = batchify_fn(examples)

self.input_handles[0].copy_from_cpu(input_ids)

self.input_handles[1].copy_from_cpu(segment_ids)

self.predictor.run()

logits = self.output_handle.copy_to_cpu()

all_embeddings.append(logits)

examples = []

if (len(examples) > 0):#处理最后一个批次数据

input_ids, segment_ids = batchify_fn(examples)

self.input_handles[0].copy_from_cpu(input_ids)

self.input_handles[1].copy_from_cpu(segment_ids)

self.predictor.run()

logits = self.output_handle.copy_to_cpu()

all_embeddings.append(logits)

all_embeddings = np.concatenate(all_embeddings, axis=0)

np.save('bxqa_corpus_embedding', all_embeddings)

def read_text(file_path):

file = open(file_path)

id2corpus = {}

for idx, data in enumerate(file.readlines()):

id2corpus[idx] = data.strip()

return id2corpus

predictor = Predictor(model_dir, device, max_seq_length,

batch_size)

tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh')

id2corpus = read_text(corpus_file)

corpus_list = [{idx: text} for idx, text in id2corpus.items()]

predictor.predict(corpus_list, tokenizer)

from pymilvus import *

import pymilvus

MILVUS_HOST = '127.0.0.1'

MILVUS_PORT =19530

data_dim = 256

top_k = 10

collection_name = 'bxqa'

partition_tag = 'partition_1'

embedding_name = 'embeddings'

index_config = {

"index_type": "IVF_FLAT",

"metric_type": "L2",

"params": {

"nlist": 500

},

}

search_params = {

"metric_type": "L2",

"params": {

"nprobe": 20

},

}

fmt = "\n=== {:30} ===\n"

text_max_len = 1000

fields = [

FieldSchema(name="pk",

dtype=DataType.INT64,

is_primary=True,

auto_id=False,

max_length=100),

FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=text_max_len),

FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=data_dim)

]

schema = CollectionSchema(fields, "bxqa Index")

class VecToMilvus():

def init(self):

print(fmt.format("start connecting to Milvus"))

connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)

self.collection = None

def has_collection(self, collection_name):

try:

has = utility.has_collection(collection_name)

print(f"Does collection {collection_name} exist in Milvus: {has}")

return has

except Exception as e:

print("Milvus has_table error:", e)

def creat_collection(self, collection_name):

try:

print(fmt.format("Create collection {}".format(collection_name)))

self.collection = Collection(collection_name,

schema,

consistency_level="Strong")

except Exception as e:

print("Milvus create collection error:", e)

def drop_collection(self, collection_name):

try:

utility.drop_collection(collection_name)

except Exception as e:

print("Milvus delete collection error:", e)

def create_index(self, index_name):

try:

print(fmt.format("Start Creating index"))

self.collection.create_index(index_name, index_config)

print(fmt.format("Start loading"))

self.collection.load()

except Exception as e:

print("Milvus create index error:", e)

def has_partition(self, partition_tag):

try:

result = self.collection.has_partition(partition_tag)

return result

except Exception as e:

print("Milvus has partition error: ", e)

def create_partition(self, partition_tag):

try:

self.collection.create_partition(partition_tag)

print('create partition {} successfully'.format(partition_tag))

except Exception as e:

print('Milvus create partition error: ', e)

def insert(self, entities, collection_name, index_name, partition_tag=None):

try:

if not self.has_collection(collection_name):#没有集合就创建

self.creat_collection(collection_name)

self.create_index(index_name)

else:

self.collection = Collection(collection_name)

if (partition_tag #没有分区就创建

is not None) and (not self.has_partition(partition_tag)):

self.create_partition(partition_tag)

self.collection.insert(entities, partition_name=partition_tag)

print(

f"Number of entities in Milvus: {self.collection.num_entities}"

) # check the num_entites

except Exception as e:

print("Milvus insert error:", e)

embeddings = np.load('bxqa_corpus_embedding.npy')

embedding_ids = [i for i in range(embeddings.shape[0])]

client = VecToMilvus()

collection_name = 'faq_finance'

client.has_collection(collection_name)

partition_tag = 'partition_1'

data_size = len(embedding_ids)

corpus_list=[list(i.values())[0] for i in corpus_list]

batch_size = 10000

for i in tqdm(range(0, data_size, batch_size)):

cur_end = i + batch_size

if (cur_end > data_size):

cur_end = data_size

batch_emb = embeddings[np.arange(i, cur_end)]

entities = [

[j for j in range(i, cur_end, 1)],

[corpus_list[j][:text_max_len - 1] for j in range(i, cur_end, 1)],

batch_emb # field embeddings, supports numpy.ndarray and list

]

client.insert(collection_name=collection_name,

entities=entities,

index_name=embedding_name,

partition_tag=partition_tag)

相关推荐
王哈哈^_^1 小时前
【数据集】【YOLO】【VOC】目标检测数据集,查找数据集,yolo目标检测算法详细实战训练步骤!
人工智能·深度学习·算法·yolo·目标检测·计算机视觉·pyqt
是瑶瑶子啦1 小时前
【深度学习】论文笔记:空间变换网络(Spatial Transformer Networks)
论文阅读·人工智能·深度学习·视觉检测·空间变换
向阳12182 小时前
Bert快速入门
人工智能·python·自然语言处理·bert
wangyue43 小时前
c# 深度模型入门
深度学习
川石课堂软件测试3 小时前
性能测试|docker容器下搭建JMeter+Grafana+Influxdb监控可视化平台
运维·javascript·深度学习·jmeter·docker·容器·grafana
985小水博一枚呀3 小时前
【深度学习滑坡制图|论文解读3】基于融合CNN-Transformer网络和深度迁移学习的遥感影像滑坡制图方法
人工智能·深度学习·神经网络·cnn·transformer
985小水博一枚呀3 小时前
【深度学习滑坡制图|论文解读2】基于融合CNN-Transformer网络和深度迁移学习的遥感影像滑坡制图方法
人工智能·深度学习·神经网络·cnn·transformer·迁移学习
深度学习实战训练营5 小时前
基于CNN-RNN的影像报告生成
人工智能·深度学习
孙同学要努力12 小时前
全连接神经网络案例——手写数字识别
人工智能·深度学习·神经网络
Jina AI13 小时前
RAG 系统的分块难题:小型语言模型如何找到最佳断点?
人工智能·语言模型·自然语言处理