Python实战- Milvus 向量库使用相关方法demo

文章目录

前言
- [Python实战- Milvus 向量库使用相关方法demo](#Python实战- Milvus 向量库使用相关方法demo)
  - [1. 建表场景demo](#1. 建表场景demo)
  - [2. 修改表结构场景demo](#2. 修改表结构场景demo)
  - [3. 批量新增、查询场景demo](#3. 批量新增、查询场景demo)
  - [4. 删除、更新、查询场景demo](#4. 删除、更新、查询场景demo)
  - [5. 向量混合查询场景](#5. 向量混合查询场景)

前言

如果您觉得有用的话，记得给博主点个赞，评论，收藏一键三连啊，写作不易啊^ _ ^。

而且听说点赞的人每天的运气都不会太差，实在白嫖的话，那欢迎常来啊!!!

Python实战- Milvus 向量库使用相关方法demo

1. 建表场景demo

下面是我要用的测试表

字段名	类型	性质
id	INT64	普通字段(主键)
user	VARCHAR	普通字段(用户名)
timestamp	INT64	普通字段(时间)
embedding	FLOAT_VECTOR	向量字段

python 复制代码

from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
import random
import time
import numpy as np
import sys


# 定义连接 Milvus 的方法
def connect_to_milvus(host="localhost", port="19530"):
    """Open the ``default`` pymilvus connection to ``host``:``port``.

    A success or failure message is printed; on failure the original
    exception is re-raised so the caller can stop.
    """
    endpoint = {"host": host, "port": port}
    try:
        connections.connect("default", **endpoint)
    except Exception as err:
        print(f"连接 Milvus 失败: {err}")
        raise
    else:
        print("连接 Milvus 成功")


# 定义创建表的函数
def creatTable():
    """Drop (if present) and recreate ``demo_collection`` with the demo schema.

    The schema has four fields: an auto-generated INT64 primary key ``id``,
    a VARCHAR ``user`` (max length 100), an INT64 ``timestamp`` and a
    4-dimensional FLOAT_VECTOR ``embedding``. The resulting field list is
    printed. Any exception is printed instead of raised, and the ``default``
    connection is disconnected before returning in every case.
    """
    target = "demo_collection"
    try:
        # Start from a clean slate: remove a leftover collection of the same name.
        recreated = utility.has_collection(target)
        if recreated:
            print(f"Collection `{target}` 已存在，准备删除")
            Collection(name=target).drop()

        schema = CollectionSchema(
            [
                # Primary key, generated by Milvus.
                FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
                # User name, string of at most 100 characters.
                FieldSchema(name="user", dtype=DataType.VARCHAR, max_length=100),
                # Timestamp stored as an integer.
                FieldSchema(name="timestamp", dtype=DataType.INT64),
                # 4-dimensional float vector.
                FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=4),
            ],
            description="Test collection",
        )
        created = Collection(name=target, schema=schema)

        print("表已重新建立完成" if recreated else "表建立完成")
        for column in created.schema.fields:
            role = "主键" if column.is_primary else ""
            print(f"  - {column.name}: {column.dtype.name}, {role}")
    except Exception as err:
        print(f"Exception : {err}")
    finally:
        # Always close the connection opened by the caller.
        connections.disconnect("default")
        print("✅ 程序执行完毕，连接已断开")

if __name__ == "__main__":
    # Connect first; creatTable() disconnects on its own once it is done.
    connect_to_milvus()
    creatTable()
    sys.exit(0)

执行脚本:

2. 修改表结构场景demo

demo逻辑:

创建一个新的集合：定义新的字段结构。
数据迁移：将现有数据迁移到新的集合。
更新引用：更新应用程序中对集合的引用，指向新的集合。
删除旧集合：在确认数据迁移无误后，删除旧集合。

下面有两个示例demo，一个是新增字段的示例，一个是更新字段的示例:

python 复制代码

from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
import sys


# =========================
# 自动连接 Milvus 的上下文
# =========================
class MilvusConnection:
    """Context manager that keeps the ``default`` pymilvus connection open.

    Entering the block connects to ``host``:``port`` (re-raising on failure);
    leaving it disconnects, whether or not the body raised.
    """

    def __init__(self, host="localhost", port="19530"):
        self.host, self.port = host, port

    def __enter__(self):
        try:
            connections.connect("default", host=self.host, port=self.port)
        except Exception as err:
            print(f"连接 Milvus 失败: {err}")
            raise
        print("连接 Milvus 成功")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Returns None, so an exception raised inside the ``with`` body propagates.
        connections.disconnect("default")
        print("✅ 程序执行完毕，连接已断开")


# =============================
# 自动建立向量字段的索引
# =============================
def create_vector_index_if_needed(collection: Collection):
    """Create an IVF_FLAT / L2 index on the first FLOAT_VECTOR field.

    Only the first matching field of the schema is indexed; when the schema
    has no FLOAT_VECTOR field the function does nothing.
    """
    vector_field = next(
        (f for f in collection.schema.fields if f.dtype == DataType.FLOAT_VECTOR),
        None,
    )
    if vector_field is None:
        return

    print(f">>>>>>为向量字段 `{vector_field.name}` 创建索引...")
    ivf_flat_l2 = {
        "index_type": "IVF_FLAT",
        "metric_type": "L2",
        "params": {"nlist": 128},
    }
    collection.create_index(field_name=vector_field.name, index_params=ivf_flat_l2)
    print(f"✅ 向量字段 `{vector_field.name}` 索引创建完成")


# ==================================
# 创建 Collection + 自动建索引
# ==================================
def create_table(collection_name, fields, description="Test collection"):
    """Recreate ``collection_name`` from ``fields`` and index its vector field.

    An existing collection with the same name is dropped first. After the
    collection is created, ``create_vector_index_if_needed`` is called and the
    field list is printed.

    Returns:
        The new ``Collection``, or ``None`` when any step raised (the
        exception is printed, not propagated).
    """
    try:
        if utility.has_collection(collection_name):
            print(f"Collection `{collection_name}` 已存在，准备删除")
            Collection(name=collection_name).drop()
            print(f"Collection `{collection_name}` 删除成功")

        created = Collection(
            name=collection_name,
            schema=CollectionSchema(fields, description=description),
        )
        print("表建立完成")
        create_vector_index_if_needed(created)
        for column in created.schema.fields:
            role = "主键" if column.is_primary else ""
            print(f"  - {column.name}: {column.dtype.name}, {role}")
    except Exception as err:
        print(f"Exception : {err}")
        return None
    return created


# ========================================
# 数据迁移（自动 load 和 flush）
# ========================================
def migrate_data(source_collection_name, target_collection_name):
    """Copy the rows of the source collection into the target collection.

    The source is loaded, every row matching ``id > 0`` is queried, and the
    rows are inserted column-wise into the target, followed by a flush.
    Target fields flagged ``auto_id`` are skipped. Target fields that do not
    exist in the source are filled with ``None`` (a warning is printed).

    Args:
        source_collection_name: Name of the collection to read from.
        target_collection_name: Name of the collection to write to.

    Returns:
        ``True`` when the copy finished (including the case where the source
        had no matching rows), ``False`` when an exception interrupted it.
        Exceptions are printed, never raised, so a caller that drops the
        source afterwards must check for ``False`` first.
    """
    try:
        source_collection = Collection(name=source_collection_name)
        target_collection = Collection(name=target_collection_name)

        source_collection.load()

        # Every field of the source collection.
        source_fields = [f.name for f in source_collection.schema.fields]
        # Fields that must be supplied on insert (auto_id fields are excluded).
        target_fields = [f.name for f in target_collection.schema.fields if not f.auto_id]

        print(f"源字段: {source_fields}")
        print(f"目标字段: {target_fields}")

        missing_fields = [f for f in target_fields if f not in source_fields]
        if missing_fields:
            # NOTE(review): Milvus presumably rejects None for a non-nullable
            # field; if so the insert below raises and False is returned.
            print(f"⚠ 警告：源集合中缺少字段 {missing_fields}，将为这些字段插入 None")

        # Only ask the source for fields it actually has.
        query_fields = [f for f in target_fields if f in source_fields]
        # The filter assumes an integer primary key named ``id``.
        source_data = source_collection.query(expr="id > 0", output_fields=query_fields)
        if not source_data:
            print(f"⚠ 源集合 `{source_collection_name}` 中没有符合条件的数据，跳过数据迁移")
            print("✅ 数据迁移完成")
            return True

        # One list per target field, in target-schema order.
        columnar_data = [
            [row.get(field) for row in source_data]
            for field in target_fields
        ]

        target_collection.insert(columnar_data)
        target_collection.flush()

        print("✅ 数据迁移完成")
        return True
    except Exception as e:
        print(f"❌ 数据迁移异常: {e}")
        return False


# ====================================
# 模拟字段更新：字段替换
# ====================================
def update_field_in_production(collection_name, new_field_schema):
    """Replace one field definition by rebuilding the collection.

    A collection named ``{collection_name}_new`` is (re)created whose schema
    is the old schema with the field named ``new_field_schema.name`` swapped
    for ``new_field_schema``. Its vector field is indexed, the data is
    migrated, and the old collection is dropped.

    The old collection is dropped only if the migration did not report a
    failure; otherwise it is kept so no data is lost.

    Args:
        collection_name: Name of the existing collection.
        new_field_schema: ``FieldSchema`` that replaces the same-named field.

    Returns:
        The new collection's name, or ``None`` when the migration failed or
        an exception was raised (the exception is printed, not propagated).
    """
    try:
        new_collection_name = f"{collection_name}_new"
        # A leftover target from an earlier run is removed first.
        if utility.has_collection(new_collection_name):
            print(f"新集合 `{new_collection_name}` 已存在，准备删除")
            Collection(name=new_collection_name).drop()

        collection = Collection(name=collection_name)
        # Same fields as before, with the matching one swapped out.
        updated_fields = [
            new_field_schema if field.name == new_field_schema.name else field
            for field in collection.schema.fields
        ]
        new_schema = CollectionSchema(updated_fields, description=collection.schema.description)
        new_collection = Collection(name=new_collection_name, schema=new_schema)
        print(f"新集合 `{new_collection_name}` 创建完成")
        create_vector_index_if_needed(new_collection)

        # migrate_data prints its errors instead of raising. An explicit False
        # means the copy is incomplete, so the old collection must survive.
        # Any other result (including None) is treated as success, as before.
        if migrate_data(collection_name, new_collection_name) is False:
            print(f"❌ 数据迁移失败，保留旧集合 `{collection_name}`，不执行删除")
            return None

        print(f"请更新应用程序中对集合的引用，将 `{collection_name}` 替换为 `{new_collection_name}`")

        print(f"删除旧集合 `{collection_name}`")
        collection.drop()
        print('"最终表结构>>>>> ')
        for field in new_collection.schema.fields:
            print(f"  - {field.name}: {field.dtype.name}, {'主键' if field.is_primary else ''}")
        return new_collection_name
    except Exception as e:
        print(f"异常: {e}")
        return None


# ====================================
# 定义新增字段的函数
# ====================================
def add_field_to_collection(collection_name, new_field_schema):
    """Append one field by rebuilding the collection.

    A collection named ``{collection_name}_new`` is (re)created whose schema
    is the old field list plus ``new_field_schema``. Its vector field is
    indexed, the data is migrated, and the old collection is dropped.

    The old collection is dropped only if the migration did not report a
    failure; otherwise it is kept so no data is lost.

    Args:
        collection_name: Name of the existing collection.
        new_field_schema: ``FieldSchema`` of the field to append.

    Returns:
        The new collection's name, or ``None`` when the migration failed or
        an exception was raised (the exception is printed, not propagated).
    """
    try:
        new_collection_name = f"{collection_name}_new"
        # A leftover target from an earlier run is removed first.
        if utility.has_collection(new_collection_name):
            print(f"新集合 `{new_collection_name}` 已存在，准备删除")
            Collection(name=new_collection_name).drop()

        collection = Collection(name=collection_name)
        # Existing fields followed by the new one.
        updated_fields = [*collection.schema.fields, new_field_schema]
        new_schema = CollectionSchema(updated_fields, description=collection.schema.description)
        new_collection = Collection(name=new_collection_name, schema=new_schema)
        print(f"新集合 `{new_collection_name}` 创建完成")
        create_vector_index_if_needed(new_collection)

        # migrate_data prints its errors instead of raising. An explicit False
        # means the copy is incomplete, so the old collection must survive.
        # Any other result (including None) is treated as success, as before.
        if migrate_data(collection_name, new_collection_name) is False:
            print(f"❌ 数据迁移失败，保留旧集合 `{collection_name}`，不执行删除")
            return None

        print(f"请更新应用程序中对集合的引用，将 `{collection_name}` 替换为 `{new_collection_name}`")

        print(f"删除旧集合 `{collection_name}`")
        print('"最终表结构>>>>> ')
        for field in new_collection.schema.fields:
            print(f"  - {field.name}: {field.dtype.name}, {'主键' if field.is_primary else ''}")
        collection.drop()
        return new_collection_name

    except Exception as e:
        print(f"异常: {e}")
        return None


# ====================================
# 修改表结构场景demo
# ====================================
if __name__ == '__main__':
    # Name of the collection the demo creates and then reshapes.
    collection_name = "demo_collection"
    # Initial schema.
    initial_fields = [
        # Primary key, generated by Milvus.
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        # Declared as INT64 here; the update step below replaces it with
        # VARCHAR(200). NOTE(review): max_length likely has no effect on an
        # INT64 field - kept unchanged from the original demo.
        FieldSchema(name="user", dtype=DataType.INT64, max_length=100),
        # Timestamp stored as an integer.
        FieldSchema(name="timestamp", dtype=DataType.INT64),
        # 4-dimensional float vector.
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=4)
    ]
    # Replacement definition for the existing ``user`` field.
    updated_field = FieldSchema(name="user", dtype=DataType.VARCHAR, max_length=200)
    # Definition of the field to append.
    new_field = FieldSchema(name="tag", dtype=DataType.VARCHAR, max_length=20)

    # The context manager connects on entry and disconnects on exit.
    with MilvusConnection():
        collection = create_table(collection_name, initial_fields)
        try:
            # Each step returns None when it failed; stop there instead of
            # feeding None into the next step as a collection name.
            if collection is None:
                print("❌ 建表失败，跳过后续步骤")
            else:
                print("================更新字段===============")
                new_collection_name = update_field_in_production(collection_name, updated_field)
                if new_collection_name is None:
                    print("❌ 更新字段失败，跳过新增字段")
                else:
                    print("================新增字段===============")
                    new_collection_name = add_field_to_collection(new_collection_name, new_field)
        except Exception as e:
            print(f"异常: {e}")

    sys.exit(0)

需要注意的是：对含有向量字段的表执行查询前必须先 load()，而 load() 要求该向量字段已经建立索引。

更新效果:

新增效果:

3. 批量新增、查询场景demo

代码如下:

python 复制代码

from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
import sys
import random
import time


# =========================
# 自动连接 Milvus 的上下文
# =========================
class MilvusConnection:
    """Context manager that keeps the ``default`` pymilvus connection open.

    Entering the block connects to ``host``:``port`` (re-raising on failure);
    leaving it disconnects, whether or not the body raised.
    """

    def __init__(self, host="localhost", port="19530"):
        self.host, self.port = host, port

    def __enter__(self):
        try:
            connections.connect("default", host=self.host, port=self.port)
        except Exception as err:
            print(f"连接 Milvus 失败: {err}")
            raise
        print("连接 Milvus 成功")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Returns None, so an exception raised inside the ``with`` body propagates.
        connections.disconnect("default")
        print("✅ 程序执行完毕，连接已断开")


# =============================
# 自动建立向量字段的索引
# =============================
def create_vector_index_if_needed(collection: Collection):
    """Create an IVF_FLAT / L2 index on the first FLOAT_VECTOR field.

    Only the first matching field of the schema is indexed; when the schema
    has no FLOAT_VECTOR field the function does nothing.
    """
    vector_field = next(
        (f for f in collection.schema.fields if f.dtype == DataType.FLOAT_VECTOR),
        None,
    )
    if vector_field is None:
        return

    print(f">>>>>>为向量字段 `{vector_field.name}` 创建索引...")
    ivf_flat_l2 = {
        "index_type": "IVF_FLAT",
        "metric_type": "L2",
        "params": {"nlist": 128},
    }
    collection.create_index(field_name=vector_field.name, index_params=ivf_flat_l2)
    print(f"✅ 向量字段 `{vector_field.name}` 索引创建完成")


# ==================================
# 创建 Collection + 自动建索引
# ==================================
def create_table(collection_name, fields, description="Test collection"):
    """Recreate ``collection_name`` from ``fields`` and index its vector field.

    An existing collection with the same name is dropped first. After the
    collection is created, ``create_vector_index_if_needed`` is called and the
    field list is printed.

    Returns:
        The new ``Collection``, or ``None`` when any step raised (the
        exception is printed, not propagated).
    """
    try:
        if utility.has_collection(collection_name):
            print(f"Collection `{collection_name}` 已存在，准备删除")
            Collection(name=collection_name).drop()
            print(f"Collection `{collection_name}` 删除成功")

        created = Collection(
            name=collection_name,
            schema=CollectionSchema(fields, description=description),
        )
        print("表建立完成")
        create_vector_index_if_needed(created)
        for column in created.schema.fields:
            role = "主键" if column.is_primary else ""
            print(f"  - {column.name}: {column.dtype.name}, {role}")
    except Exception as err:
        print(f"Exception : {err}")
        return None
    return created


# =========================
# 批量插入
# =========================
def insert_mock_data(collection: Collection, num_rows=10):
    """Insert ``num_rows`` generated rows into ``collection`` and flush.

    Four columns are built and passed to ``collection.insert`` in the order
    id, user, timestamp, embedding. Ids are ``pk_id_0`` .. ``pk_id_{n-1}``,
    users and 4-dimensional vectors are picked at random from small fixed
    pools, and timestamps count up by one second from the current time.
    Assumes the collection's schema lists exactly these four fields in this
    order with a caller-supplied string primary key - verify against the
    schema used by the caller.

    Args:
        collection: Target collection.
        num_rows: Number of rows to generate; values <= 0 insert nothing.

    Insert/flush errors are printed, not raised.
    """
    print(f"🚀 插入测试数据，共 {num_rows} 条")
    if num_rows <= 0:
        # Nothing to build; avoid sending an empty insert to the server.
        print("⚠️ num_rows <= 0，跳过插入")
        return

    # Pool of 4-dimensional vectors to draw from.
    fixed_vectors = [
        [0.1, 0.2, 0.3, 0.4],
        [0.4, 0.3, 0.2, 0.1],
        [0.9, 0.8, 0.7, 0.6],
        [0.5, 0.5, 0.5, 0.5],
    ]
    # Pool of user names to draw from.
    fixed_user = [
        'yangzhenyu',
        'anran',
        'xiaoyu',
        'wanghui',
        'taoyutian',
    ]

    # Read the clock once so the timestamps are exactly base, base+1, ...
    base_timestamp = int(time.time())

    # Column order: id, user, timestamp, embedding.
    entities = [
        [f'pk_id_{i}' for i in range(num_rows)],
        [random.choice(fixed_user) for _ in range(num_rows)],
        [base_timestamp + i for i in range(num_rows)],
        [random.choice(fixed_vectors) for _ in range(num_rows)],
    ]

    try:
        collection.insert(entities)
        collection.flush()
        print("✅ 插入完成")
    except Exception as e:
        print(f"Exception : {e}")


# =========================
# 查询最新N条数据（按 timestamp 降序）
# =========================
def query_latest_data(collection: Collection, num=10):
    """Print the ``num`` rows with the largest ``timestamp``, newest first.

    The ordering is done on the client: rows matching ``timestamp >= 0`` are
    streamed in batches with ``Collection.query_iterator`` and a running
    top-``num`` is kept with ``heapq.nlargest``. The previous implementation
    passed ``order_by`` together with ``limit`` to ``Collection.query``.
    NOTE(review): ``order_by`` does not appear among the documented
    ``query`` parameters of pymilvus 2.3/2.4, so that call could return an
    arbitrary page of ``num`` rows - confirm against the installed version.

    Args:
        collection: Collection to read; it is loaded before querying.
        num: How many of the newest rows to print.

    Errors are printed, not raised.
    """
    import heapq  # stdlib; imported locally because only this function needs it

    print(f"🔍 查询最近 {num} 条数据...")
    try:
        collection.load()
        iterator = collection.query_iterator(
            batch_size=1000,
            expr="timestamp >= 0",
            output_fields=["user", "timestamp", "embedding"],
        )
        latest = []
        try:
            while True:
                batch = iterator.next()
                if not batch:
                    # An empty batch marks the end of the scan.
                    break
                # Merge the batch into the running top-num so memory stays
                # bounded by num + batch_size rows.
                latest = heapq.nlargest(
                    num, [*latest, *batch], key=lambda row: row["timestamp"]
                )
        finally:
            iterator.close()

        for i, row in enumerate(latest, 1):
            print(f"{i}. {row}")
    except Exception as e:
        print(f"Exception : {e}")


def query_latest_data_check(collection: Collection, num=10):
    """Query up to ``num`` rows and report whether they arrive newest-first.

    The query asks for ``timestamp`` descending via ``order_by``; the rows
    are printed and then checked pairwise. A row whose timestamp is greater
    than its predecessor's counts as a violation; equal neighbours do not.
    Errors are printed, not raised.
    """
    print(f"🔍 查询最近 {num} 条数据（期望 timestamp 降序）...")
    try:
        collection.load()
        rows = collection.query(
            expr="timestamp >= 0",
            output_fields=["id", "user", "timestamp"],
            limit=num,
            order_by=[{"field_name": "timestamp", "order": "desc"}],
        )
        for position, row in enumerate(rows, start=1):
            print(f"{position}. {row}")

        print("⚠️ 检查是否为降序:")
        # Stops at the first pair that is out of order.
        out_of_order = any(
            rows[i]["timestamp"] > rows[i - 1]["timestamp"]
            for i in range(1, len(rows))
        )
        if out_of_order:
            print("❌ 排序异常：不是严格降序")
        else:
            print("✅ 排序正确：为严格降序")
    except Exception as err:
        print(f"Exception : {err}")


# ====================================
# 批量新增、查询场景demo
# ====================================
if __name__ == "__main__":
    demo_collection = "demo_collection"
    schema_fields = [
        # String primary key supplied by the caller (auto_id is off).
        FieldSchema(name="id", dtype=DataType.VARCHAR, max_length=100, is_primary=True, auto_id=False),
        # User name, string of at most 100 characters.
        FieldSchema(name="user", dtype=DataType.VARCHAR, max_length=100),
        # Timestamp stored as an integer.
        FieldSchema(name="timestamp", dtype=DataType.INT64),
        # 4-dimensional float vector.
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=4),
    ]

    # The context manager connects on entry and disconnects on exit.
    with MilvusConnection():
        table = create_table(demo_collection, schema_fields)
        try:
            print("================批量插入===============")
            insert_mock_data(table, 3500)

            # Read back the 10 most recent rows as a sanity check.
            print("================查询最新10条数据验证===============")
            query_latest_data(table, num=10)
        except Exception as err:
            print(f"异常: {err}")

    sys.exit(0)

执行结果:

4. 删除、更新、查询场景demo

demo流程:

1、查询pk_id_0

2、删除pk_id_0

3、查询pk_id_0

4、查询pk_id_1

5、更新pk_id_1

6、查询pk_id_1

注意点:

1、Milvus 的 upsert() 是更新或插入数据，但它更新后的数据并不会立即反映在 query() 中，因为:

upsert() 相当于执行 insert()，如果主键重复则覆盖。
数据是异步写入的（写入 WAL -> 后台持久化 -> 索引构建），即使你调用了 flush()，也不意味着数据立刻可查。
查询操作依赖的是被加载（load）的 segments，更新后新的 segment 没被加载进内存，所以查询不到。
解决方法是在更新后查询的环节前，新加 collection.load() 重新加载最新的 segments，前置要求是segments是最新的。另外，也可以在调用 query() 时传入 consistency_level="Strong"，让查询等到最新写入可见后再返回。

2、释放 Milvus 中当前加载的 Collection 的内存资源，Milvus 的数据不会默认一直加载在内存里。你需要显式调用 collection.load() 才能做查询（query）、搜索（search）等操作，当你不再需要操作这个集合时,调用collection.release()释放内存中的数据资源，避免资源占用太多。

3、Milvus 的 flush() 和 load() 不保证数据立即可查，因为flush()是异步的，即使你调用了 flush()， Milvus 可能还在后台处理 segment，

解决方案是等 2 秒再 load，确保 flush 后的数据已经落盘并 segment ready。

4、若要保持 id 不变并更新数据，建表时不能使用 auto_id=True（需要设置 auto_id=False 并在写入时手动传入 id）

情况	是否能用 upsert	是否能指定 id	备注
auto_id = True（自动 id）	❌ 不能用 upsert	❌ 不能传 id	只能删除后重新插入，id 会变
auto_id = False（手动 id）	✅ 可以用 upsert	✅ 需要传 id	才能根据主键正确更新记录

5、用 auto_id=True的限制

目的	能否实现	原因说明
删除指定 id 的数据	✅ 可以	delete(expr="id == xxx") 支持
更新指定 id 的数据	❌ 不行	无法重新插入相同 id，Milvus 不支持指定 id
插入时控制主键	❌ 不行	auto_id=True 会忽略用户手动设置的 id 值
查询用原始 id 查结果	✅ 可以	但仅限于原始记录未被删除时

6、排序功能 collection.query() 的 order_by 对于大部分全表扫描、简单表达式，排序可能无效， order_by 主要在有过滤条件且索引支持的情况下效果更好，这可以视为当前 Milvus SDK 和引擎的一个已知限制，不是"完全稳定可靠"的功能。

代码:

python 复制代码

import time
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
import sys


# ====================================
# 查询函数
# ====================================
def query_by_id(collection: Collection, id_value: str):
    """Look up one row by its string primary key and print the outcome.

    Prints the first matching row, a not-found notice when the query returns
    nothing, or the error when the query raises.
    """
    print(f"🔍 查询 id = {id_value} ...")
    wanted_fields = ["id", "user", "timestamp", "embedding"]
    try:
        matches = collection.query(expr=f"id == '{id_value}'", output_fields=wanted_fields)
        if not matches:
            print("⚠️ 没有找到该记录")
            return
        print(f"✅ 查询成功: {matches[0]}")
    except Exception as err:
        print(f"❌ 查询异常: {err}")


# ====================================
# 删除函数
# ====================================
def delete_by_id(collection: Collection, id_value: str):
    """Delete the row whose string primary key equals ``id_value``, then flush.

    Success or the raised error is printed; nothing is raised to the caller.
    """
    print(f"🗑️ 删除 id = {id_value} ...")
    pk_filter = f"id == '{id_value}'"
    try:
        collection.delete(expr=pk_filter)
        collection.flush()
    except Exception as err:
        print(f"❌ 删除异常: {err}")
    else:
        print("✅ 删除成功")


# ====================================
# 更新函数
# ====================================
def update_by_id(collection: Collection, id_value: str, new_user: str):
    """Set the ``user`` of the row with primary key ``id_value`` via upsert.

    The current row is read first so its embedding can be written back
    unchanged; the timestamp is replaced with the current time. If the row
    does not exist a notice is printed and nothing is written. Errors are
    printed, not raised.
    """
    print(f"✏️ 更新 id = {id_value} 的 user 字段为 `{new_user}` ...")
    try:
        current = collection.query(
            expr=f"id == '{id_value}'",
            output_fields=["user", "timestamp", "embedding"],
        )
        if not current:
            print("==============找不到需要更新的数据")
            return

        # The primary key stays the same; only user and timestamp change.
        refreshed_ts = int(time.time())
        kept_embedding = current[0]["embedding"]
        # Column order passed to upsert: id, user, timestamp, embedding.
        columns = [[id_value], [new_user], [refreshed_ts], [kept_embedding]]
        collection.upsert(columns)
        collection.flush()
        print("✅ 更新成功")
    except Exception as err:
        print(f"❌ 更新异常: {err}")


# ====================================
# 删除、更新、查询场景demo
# ====================================
if __name__ == '__main__':
    # Collection the demo operates on; this script does not create it.
    collection_name = "demo_collection"

    id1 = 'pk_id_0'
    id2 = 'pk_id_1'
    # Stays None until the collection handle exists, so the cleanup below
    # knows whether there is anything to release.
    collection = None
    try:
        connections.connect("default", host="localhost", port="19530")
        collection = Collection(collection_name)
        collection.load()

        # 1. Query id1.
        print("\n=== Step 1: 查询 id1 ===")
        query_by_id(collection, id1)

        # 2. Delete id1.
        print("\n=== Step 2: 删除 id1 ===")
        delete_by_id(collection, id1)

        # 3. Query id1 again.
        print("\n=== Step 3: 再查 id1 ===")
        query_by_id(collection, id1)

        # 4. Query id2.
        print("\n=== Step 4: 查询 id2 ===")
        query_by_id(collection, id2)

        # 5. Update the user field of id2.
        print("\n=== Step 5: 更新 id2 的 user 字段 ===")
        update_by_id(collection, id2, "yangzhenyu")

        # 6. Query id2 to see the update.
        print("\n=== Step 6: 查询 id2 查看更新结果 ===")
        time.sleep(2)  # wait 2 seconds before reloading, giving the flushed data time to settle
        collection.load()  # reload so the query sees the latest segments
        query_by_id(collection, id2)
    except Exception as e:
        print(f"异常: {e}")
    finally:
        # Release the loaded collection even when a step above raised;
        # otherwise it would stay loaded after the script exits.
        if collection is not None:
            try:
                collection.release()
            except Exception as e:
                print(f"异常: {e}")
        connections.disconnect("default")
        print("✅ 程序执行完毕，连接已断开")

    sys.exit(0)

执行效果:

Python实战- Milvus 向量库 使用相关方法demo

文章目录

前言

Python实战- Milvus 向量库 使用相关方法demo

1. 建表场景demo

2. 修改表结构场景demo

3. 批量新增、查询场景demo

4. 删除、更新、查询场景demo

5. 向量混合查询场景

Python实战- Milvus 向量库使用相关方法demo

Python实战- Milvus 向量库使用相关方法demo