保险问答系统提前特征,插入milvus

import os

import sys

from tqdm import tqdm

import numpy as np

from scipy.special import softmax

import paddle

from paddle import inference

from paddlenlp.transformers import AutoTokenizer

from paddlenlp.data import Stack, Tuple, Pad

from paddlenlp.utils.log import logger

import paddle.nn as nn

class SimCSE(nn.Layer):

def init(self,

pretrained_model,

dropout=None,

margin=0.0,

scale=20,

output_emb_size=None):

super().init()

self.ptm = pretrained_model

self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)

self.output_emb_size = output_emb_size

if output_emb_size > 0:

weight_attr = paddle.ParamAttr(

initializer=paddle.nn.initializer.TruncatedNormal(std=0.02))

self.emb_reduce_linear = paddle.nn.Linear(768,

output_emb_size,

weight_attr=weight_attr)

self.margin = margin

self.sacle = scale

self.classifier = nn.Linear(output_emb_size, 2)

self.rdrop_loss = paddlenlp.losses.RDropLoss()

@paddle.jit.to_static(input_spec=[

paddle.static.InputSpec(shape=[None, None], dtype='int64'),

paddle.static.InputSpec(shape=[None, None], dtype='int64')

])

def get_pooled_embedding(self,

input_ids,

token_type_ids=None,

position_ids=None,

attention_mask=None,

with_pooler=True):

sequence_output, cls_embedding = self.ptm(input_ids, token_type_ids,

position_ids, attention_mask)

if with_pooler == False:

cls_embedding = sequence_output[:, 0, :]

if self.output_emb_size > 0:

cls_embedding = self.emb_reduce_linear(cls_embedding)

cls_embedding = self.dropout(cls_embedding)

cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)

return cls_embedding

def get_semantic_embedding(self, data_loader):

self.eval()

with paddle.no_grad():

for batch_data in data_loader:

input_ids, token_type_ids = batch_data

text_embeddings = self.get_pooled_embedding(

input_ids, token_type_ids=token_type_ids)

yield text_embeddings

def cosine_sim(self,

query_input_ids,

title_input_ids,

query_token_type_ids=None,

query_position_ids=None,

query_attention_mask=None,

title_token_type_ids=None,

title_position_ids=None,

title_attention_mask=None,

with_pooler=True):

query_cls_embedding = self.get_pooled_embedding(query_input_ids,

query_token_type_ids,

query_position_ids,

query_attention_mask,

with_pooler=with_pooler)

title_cls_embedding = self.get_pooled_embedding(title_input_ids,

title_token_type_ids,

title_position_ids,

title_attention_mask,

with_pooler=with_pooler)

cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding,

axis=-1)

return cosine_sim

def forward(self,

query_input_ids,

title_input_ids,

query_token_type_ids=None,

query_position_ids=None,

query_attention_mask=None,

title_token_type_ids=None,

title_position_ids=None,

title_attention_mask=None):

query_cls_embedding = self.get_pooled_embedding(query_input_ids,

query_token_type_ids,

query_position_ids,

query_attention_mask)

title_cls_embedding = self.get_pooled_embedding(title_input_ids,

title_token_type_ids,

title_position_ids,

title_attention_mask)

logits1 = self.classifier(query_cls_embedding)

logits2 = self.classifier(title_cls_embedding)

kl_loss = self.rdrop_loss(logits1, logits2)

cosine_sim = paddle.matmul(query_cls_embedding,

title_cls_embedding,

transpose_y=True)

margin_diag = paddle.full(shape=[query_cls_embedding.shape[0]],

fill_value=self.margin,

dtype=paddle.get_default_dtype())

cosine_sim = cosine_sim - paddle.diag(margin_diag)

cosine_sim *= self.sacle

labels = paddle.arange(0, query_cls_embedding.shape[0], dtype='int64')

labels = paddle.reshape(labels, shape=[-1, 1])

loss = F.cross_entropy(input=cosine_sim, label=labels)

return loss, kl_loss

def convert_example(example,

tokenizer,

max_seq_length=512,

pad_to_max_seq_len=False):

result = []

for key, text in example.items():

encoded_inputs = tokenizer(text=text,

max_seq_len=max_seq_length,

pad_to_max_seq_len=pad_to_max_seq_len)

input_ids = encoded_inputs["input_ids"]

token_type_ids = encoded_inputs["token_type_ids"]

result += [input_ids, token_type_ids]

return result

model_dir='./output/bxqa/'

corpus_file='./datasets/bxqa/corpus.csv'

max_seq_length=64

batch_size=32

device='gpu'

class Predictor(object):

def init(self,

model_dir,

device="gpu",

max_seq_length=128,

batch_size=32,

use_tensorrt=False,

precision="fp32",

cpu_threads=10,

enable_mkldnn=False):

self.max_seq_length = max_seq_length

self.batch_size = batch_size

model_file = model_dir + "inference.get_pooled_embedding.pdmodel"

params_file = model_dir + "inference.get_pooled_embedding.pdiparams"

if not os.path.exists(model_file):

raise ValueError("not find model file path {}".format(model_file))

if not os.path.exists(params_file):

raise ValueError("not find params file path {}".format(params_file))

config = paddle.inference.Config(model_file, params_file)

if device == "gpu":

config.enable_use_gpu(100, 0)

precision_map = {

"fp16": inference.PrecisionType.Half,

"fp32": inference.PrecisionType.Float32,

"int8": inference.PrecisionType.Int8

}

precision_mode = precision_map[precision]

if use_tensorrt:

config.enable_tensorrt_engine(max_batch_size=batch_size,

min_subgraph_size=30,

precision_mode=precision_mode)

elif device == "cpu":

config.disable_gpu()

if args.enable_mkldnn:

cache 10 different shapes for mkldnn to avoid memory leak

config.set_mkldnn_cache_capacity(10)

config.enable_mkldnn()

config.set_cpu_math_library_num_threads(args.cpu_threads)

elif device == "xpu":

config.enable_xpu(100)

config.switch_use_feed_fetch_ops(False)

self.predictor = paddle.inference.create_predictor(config)

self.input_handles = [

self.predictor.get_input_handle(name)

for name in self.predictor.get_input_names()

]

self.output_handle = self.predictor.get_output_handle(

self.predictor.get_output_names()[0])

def predict(self, data, tokenizer):

batchify_fn = lambda samples, fn=Tuple(

Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'), # input

Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'

), # segment

): fn(samples)

all_embeddings = []#存放所有数据

examples = []#用来临时存放一个批次数据

for idx, text in enumerate(tqdm(data)):

input_ids, segment_ids = convert_example(

text,

tokenizer,

max_seq_length=self.max_seq_length,

pad_to_max_seq_len=True)

examples.append((input_ids, segment_ids))

if (len(examples) >= self.batch_size):#够一个批次就做下面的事

input_ids, segment_ids = batchify_fn(examples)

self.input_handles[0].copy_from_cpu(input_ids)

self.input_handles[1].copy_from_cpu(segment_ids)

self.predictor.run()

logits = self.output_handle.copy_to_cpu()

all_embeddings.append(logits)

examples = []

if (len(examples) > 0):#处理最后一个批次数据

input_ids, segment_ids = batchify_fn(examples)

self.input_handles[0].copy_from_cpu(input_ids)

self.input_handles[1].copy_from_cpu(segment_ids)

self.predictor.run()

logits = self.output_handle.copy_to_cpu()

all_embeddings.append(logits)

all_embeddings = np.concatenate(all_embeddings, axis=0)

np.save('bxqa_corpus_embedding', all_embeddings)

def read_text(file_path):

file = open(file_path)

id2corpus = {}

for idx, data in enumerate(file.readlines()):

id2corpus[idx] = data.strip()

return id2corpus

predictor = Predictor(model_dir, device, max_seq_length,

batch_size)

tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh')

id2corpus = read_text(corpus_file)

corpus_list = [{idx: text} for idx, text in id2corpus.items()]

predictor.predict(corpus_list, tokenizer)

from pymilvus import *

import pymilvus

MILVUS_HOST = '127.0.0.1'

MILVUS_PORT =19530

data_dim = 256

top_k = 10

collection_name = 'bxqa'

partition_tag = 'partition_1'

embedding_name = 'embeddings'

index_config = {

"index_type": "IVF_FLAT",

"metric_type": "L2",

"params": {

"nlist": 500

},

}

search_params = {

"metric_type": "L2",

"params": {

"nprobe": 20

},

}

fmt = "\n=== {:30} ===\n"

text_max_len = 1000

fields = [

FieldSchema(name="pk",

dtype=DataType.INT64,

is_primary=True,

auto_id=False,

max_length=100),

FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=text_max_len),

FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=data_dim)

]

schema = CollectionSchema(fields, "bxqa Index")

class VecToMilvus():

def init(self):

print(fmt.format("start connecting to Milvus"))

connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)

self.collection = None

def has_collection(self, collection_name):

try:

has = utility.has_collection(collection_name)

print(f"Does collection {collection_name} exist in Milvus: {has}")

return has

except Exception as e:

print("Milvus has_table error:", e)

def creat_collection(self, collection_name):

try:

print(fmt.format("Create collection {}".format(collection_name)))

self.collection = Collection(collection_name,

schema,

consistency_level="Strong")

except Exception as e:

print("Milvus create collection error:", e)

def drop_collection(self, collection_name):

try:

utility.drop_collection(collection_name)

except Exception as e:

print("Milvus delete collection error:", e)

def create_index(self, index_name):

try:

print(fmt.format("Start Creating index"))

self.collection.create_index(index_name, index_config)

print(fmt.format("Start loading"))

self.collection.load()

except Exception as e:

print("Milvus create index error:", e)

def has_partition(self, partition_tag):

try:

result = self.collection.has_partition(partition_tag)

return result

except Exception as e:

print("Milvus has partition error: ", e)

def create_partition(self, partition_tag):

try:

self.collection.create_partition(partition_tag)

print('create partition {} successfully'.format(partition_tag))

except Exception as e:

print('Milvus create partition error: ', e)

def insert(self, entities, collection_name, index_name, partition_tag=None):

try:

if not self.has_collection(collection_name):#没有集合就创建

self.creat_collection(collection_name)

self.create_index(index_name)

else:

self.collection = Collection(collection_name)

if (partition_tag #没有分区就创建

is not None) and (not self.has_partition(partition_tag)):

self.create_partition(partition_tag)

self.collection.insert(entities, partition_name=partition_tag)

print(

f"Number of entities in Milvus: {self.collection.num_entities}"

) # check the num_entites

except Exception as e:

print("Milvus insert error:", e)

embeddings = np.load('bxqa_corpus_embedding.npy')

embedding_ids = [i for i in range(embeddings.shape[0])]

client = VecToMilvus()

collection_name = 'faq_finance'

client.has_collection(collection_name)

partition_tag = 'partition_1'

data_size = len(embedding_ids)

corpus_list=[list(i.values())[0] for i in corpus_list]

batch_size = 10000

for i in tqdm(range(0, data_size, batch_size)):

cur_end = i + batch_size

if (cur_end > data_size):

cur_end = data_size

batch_emb = embeddings[np.arange(i, cur_end)]

entities = [

[j for j in range(i, cur_end, 1)],

[corpus_list[j][:text_max_len - 1] for j in range(i, cur_end, 1)],

batch_emb # field embeddings, supports numpy.ndarray and list

]

client.insert(collection_name=collection_name,

entities=entities,

index_name=embedding_name,

partition_tag=partition_tag)

相关推荐
莫叫石榴姐10 分钟前
数据科学与SQL:组距分组分析 | 区间分布问题
大数据·人工智能·sql·深度学习·算法·机器学习·数据挖掘
967711 分钟前
对抗样本存在的原因
深度学习
YRr YRr1 小时前
深度学习:神经网络中的损失函数的使用
人工智能·深度学习·神经网络
我爱学Python!1 小时前
大语言模型与图结构的融合: 推荐系统中的新兴范式
人工智能·语言模型·自然语言处理·langchain·llm·大语言模型·推荐系统
日出等日落1 小时前
Windows电脑本地部署llamafile并接入Qwen大语言模型远程AI对话实战
人工智能·语言模型·自然语言处理
静静的喝酒1 小时前
深度学习笔记之BERT(二)BERT精简变体:ALBERT
深度学习·bert·albert
麦麦大数据1 小时前
Python棉花病虫害图谱系统CNN识别+AI问答知识neo4j vue+flask深度学习神经网络可视化
人工智能·python·深度学习
段传涛1 小时前
PAL(Program-Aided Language Model)
人工智能·语言模型·自然语言处理
weixin_443290691 小时前
【阅读记录-章节3】Build a Large Language Model (From Scratch)
人工智能·语言模型·自然语言处理
谢眠2 小时前
深度学习day3-自动微分
python·深度学习·机器学习