Full-text search example from the official documentation

Link: https://milvus.io/docs/zh/full_text_search_with_milvus.md

full_text_demo:

python
from typing import List
from __init__ import openai_client
import sys

from pymilvus import (
    MilvusClient,
    DataType,
    Function,
    FunctionType,
    AnnSearchRequest,
    RRFRanker,
)

# Connect to Milvus
uri = "http://ip:19530"  # replace "ip" with your Milvus host
collection_name = "full_text_demo"
client = MilvusClient(uri=uri)
print("Connected to Milvus")

# sys.exit()

# Standard tokenizer plus a lowercase filter for the BM25 text analyzer
analyzer_params = {"tokenizer": "standard", "filter": ["lowercase"]}

schema = MilvusClient.create_schema()
schema.add_field(
    field_name="id",
    datatype=DataType.VARCHAR,
    is_primary=True,
    auto_id=True,
    max_length=100,
)
schema.add_field(
    field_name="content",
    datatype=DataType.VARCHAR,
    max_length=65535,
    analyzer_params=analyzer_params,
    enable_match=True,  # Enable text matching
    enable_analyzer=True,  # Enable text analysis
)
schema.add_field(field_name="sparse_vector", datatype=DataType.SPARSE_FLOAT_VECTOR)
schema.add_field(
    field_name="dense_vector",
    datatype=DataType.FLOAT_VECTOR,
    dim=1536,  # Dimension for text-embedding-3-small
)
schema.add_field(field_name="metadata", datatype=DataType.JSON)

bm25_function = Function(
    name="bm25",
    function_type=FunctionType.BM25,
    input_field_names=["content"],
    output_field_names=["sparse_vector"],
)

schema.add_function(bm25_function)
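# The BM25 function above generates "sparse_vector" from "content" automatically
# at insert time, so the entities inserted below do not need to supply a sparse vector.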

# Create indexes
index_params = MilvusClient.prepare_index_params()
index_params.add_index(
    field_name="sparse_vector",
    index_type="SPARSE_INVERTED_INDEX",
    metric_type="BM25",
)
index_params.add_index(field_name="dense_vector", index_type="FLAT", metric_type="IP")

if client.has_collection(collection_name):
    client.drop_collection(collection_name)
client.create_collection(
    collection_name=collection_name,
    schema=schema,
    index_params=index_params,
)
print(f"Collection '{collection_name}' created successfully")


# openai_client is imported from __init__ above; the original tutorial creates it with:
# openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
model_name = "text-embedding-3-small"


def get_embeddings(texts: List[str]) -> List[List[float]]:
    if not texts:
        return []

    response = openai_client.embeddings.create(input=texts, model=model_name)
    return [embedding.embedding for embedding in response.data]


# Example documents to insert
documents = [
    {
        "content": "Milvus is a vector database built for embedding similarity search and AI applications.",
        "metadata": {"source": "documentation", "topic": "introduction"},
    },
    {
        "content": "Full-text search in Milvus allows you to search using keywords and phrases.",
        "metadata": {"source": "tutorial", "topic": "full-text search"},
    },
    {
        "content": "Hybrid search combines the power of sparse BM25 retrieval with dense vector search.",
        "metadata": {"source": "blog", "topic": "hybrid search"},
    },
]

# Prepare entities for insertion
entities = []
texts = [doc["content"] for doc in documents]
embeddings = get_embeddings(texts)

for i, doc in enumerate(documents):
    entities.append(
        {
            "content": doc["content"],
            "dense_vector": embeddings[i],
            "metadata": doc.get("metadata", {}),
        }
    )

# Insert data
client.insert(collection_name, entities)
print(f"Inserted {len(entities)} documents")


# Example query for semantic search
query = "How does Milvus help with similarity search?"

# Generate embedding for query
query_embedding = get_embeddings([query])[0]

# Semantic search using dense vectors
results = client.search(
    collection_name=collection_name,
    data=[query_embedding],
    anns_field="dense_vector",
    limit=5,
    output_fields=["content", "metadata"],
)
dense_results = results[0]

# Print results
print("\nDense Search (Semantic):")
for i, result in enumerate(dense_results):
    print(
        f"{i+1}. Score: {result['distance']:.4f}, Content: {result['entity']['content']}"
    )
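
The snippet above only runs the dense (semantic) search, even though AnnSearchRequest and RRFRanker are already imported. In the linked tutorial those are used in the follow-up steps: BM25 full-text search over the sparse field and hybrid search that fuses both result lists. A minimal sketch of those steps, reusing the collection, query, and query_embedding defined above (the exact parameters here are assumptions rather than part of the original post), could look like this:

python
# Full-text (sparse/BM25) search: with a BM25 function on the collection,
# raw query text can be passed directly and is analyzed server-side
# (requires Milvus 2.5+ with a matching pymilvus version).
sparse_results = client.search(
    collection_name=collection_name,
    data=[query],
    anns_field="sparse_vector",
    limit=5,
    output_fields=["content", "metadata"],
)[0]

print("\nSparse Search (Full-text / BM25):")
for i, result in enumerate(sparse_results):
    print(
        f"{i+1}. Score: {result['distance']:.4f}, Content: {result['entity']['content']}"
    )

# Hybrid search: run one sparse and one dense request, then fuse the
# rankings with Reciprocal Rank Fusion.
sparse_request = AnnSearchRequest(
    data=[query],
    anns_field="sparse_vector",
    param={},
    limit=5,
)
dense_request = AnnSearchRequest(
    data=[query_embedding],
    anns_field="dense_vector",
    param={},
    limit=5,
)
hybrid_results = client.hybrid_search(
    collection_name,
    [sparse_request, dense_request],
    ranker=RRFRanker(),  # reciprocal rank fusion of the two result lists
    limit=5,
    output_fields=["content", "metadata"],
)[0]

print("\nHybrid Search (RRF of sparse + dense):")
for i, result in enumerate(hybrid_results):
    print(
        f"{i+1}. Score: {result['distance']:.4f}, Content: {result['entity']['content']}"
    )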