m3e向量化mysql某表

一.在线请求m3e

1.写个m3e-run.python

python 复制代码

# -*- coding: utf-8 -*-
import os
import time
# Point HF_ENDPOINT at the hf-mirror.com mirror. It is assigned before the
# chromadb import below — presumably so the model-download code sees it when
# it is first loaded (TODO confirm).
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

import pymysql
import chromadb
from chromadb.utils import embedding_functions

# ======================================
# MySQL configuration
# ======================================
# Keyword arguments later passed to pymysql.connect(). Host, user, password
# and database can each be overridden through an environment variable so real
# credentials do not have to be written into the script; when a variable is
# unset the original literal is used.
MYSQL_CONFIG = {
    "host": os.environ.get("MYSQL_HOST", "localhost"),
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": os.environ.get("MYSQL_PASSWORD", "root"),
    "database": os.environ.get("MYSQL_DATABASE", "after260518"),
    "charset": "utf8mb4",
}

print("=" * 70)
print(" 🚀 开始使用 m3e 模型进行 AI 向量化导入 ")
print("=" * 70)

# ======================================
# Load the m3e embedding model
# ======================================
# start_time is also the reference point for the total runtime printed in the
# final summary.
start_time = time.time()
print("\n[日志] 开始加载 m3e 中文语义模型...")

try:
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="moka-ai/m3e-base"
    )
except Exception as e:
    print(f"[日志] ❌ 模型加载失败：{e}")
    # Stop with a non-zero status so callers (shell, scheduler) can detect the
    # failure. The bare exit() helper reports success (status 0) and is only
    # available when the site module is loaded.
    raise SystemExit(1)
else:
    print(f"[日志] ✅ m3e 模型加载成功！耗时：{round(time.time() - start_time, 2)}s")

# ======================================
# Connect to Chroma
# ======================================
print("\n[日志] 正在连接 Chroma 向量库...")
client = chromadb.PersistentClient(path="./chroma_db")

# Start from an empty collection on every run. Deleting is best effort: on the
# first run there is nothing to delete, so a failure is reported and ignored.
# Catching Exception (not a bare except) keeps Ctrl-C and SystemExit working.
try:
    client.delete_collection("zm_work_records")
except Exception as e:
    print(f"[日志] 未删除旧集合（可能尚不存在）：{e}")
else:
    print("[日志] 已清空旧数据，重新创建...")

# Use cosine distance. Per the Chroma documentation a collection defaults to
# squared-L2 distance, which is unbounded for non-normalised embeddings, while
# the query scripts turn the distance into a percentage with 100 - dist * 100.
# With cosine distance (1 - cosine similarity) that formula is meaningful.
collection = client.get_or_create_collection(
    name="zm_work_records",
    embedding_function=embedding_func,
    metadata={"hnsw:space": "cosine"},
)
print("[日志] ✅ Chroma 连接就绪")

# ======================================
# Connect to MySQL
# ======================================
print("\n[日志] 正在连接 MySQL...")
try:
    db = pymysql.connect(**MYSQL_CONFIG)
    # DictCursor yields each row as a dict keyed by column name/alias, which
    # the import loop relies on through row.get(...).
    cursor = db.cursor(pymysql.cursors.DictCursor)
except Exception as e:
    print(f"[日志] ❌ MySQL 连接失败：{e}")
    # Non-zero status: the bare exit() helper would report success (0).
    raise SystemExit(1)
else:
    print("[日志] ✅ MySQL 连接成功")

# ======================================
# Read the source rows (joined with the mine table for the mine name)
# ======================================
print("\n[日志] 正在读取数据表 zm_summary_all...")
# LEFT JOIN keeps summary rows that have no matching mine; their minename
# column is then NULL.
select_sql = (
    "SELECT m.mine_name as minename, a.id, a.rem, a.description, a.process, a.content "
    "FROM zm_summary_all a left join zm_mine m on m.mine_id = a.mine_id "
)
cursor.execute(select_sql)
rows = cursor.fetchall()
total = len(rows)
print(f"[日志] ✅ 数据读取完成，共 {total} 条记录")

# ======================================
# Import the rows into Chroma
# ======================================
print("\n[日志] 开始 AI 向量化导入...")
success = 0
fail = 0


def _as_text(value):
    """Return *value* as text, mapping SQL NULL (``None``) to ``""``.

    ``str(None)`` would produce the literal ``"None"``, which would then be
    embedded and stored as if it were real data.
    """
    return "" if value is None else str(value)


for index, row in enumerate(rows, 1):
    try:
        # A NULL column arrives as None under its key (the LEFT JOIN makes
        # this likely for minename), so a .get() default alone is not enough.
        minename = _as_text(row.get("minename"))
        doc_id = _as_text(row.get("id"))
        title = _as_text(row.get("rem"))
        desc = _as_text(row.get("description"))
        process = _as_text(row.get("process"))
        content = _as_text(row.get("content"))

        # Only mine name, title and description are embedded; process and
        # content travel along as metadata for display at query time.
        text = f"矿名：{minename} 标题：{title} 描述：{desc}"

        metadata = {
            "minename": minename,
            "rem": title,
            "description": desc,
            "process": process,
            "content": content,
        }

        # One add() per row keeps failures isolated to the offending record.
        collection.add(
            ids=[doc_id],
            documents=[text],
            metadatas=[metadata],
        )

        print(f"[日志] 第 {index}/{total} 条 | ID:{doc_id} 导入成功")
        success += 1

    except Exception as e:
        # Count the failure and continue with the remaining rows.
        print(f"[日志] 第 {index} 条导入失败：{str(e)}")
        fail += 1

# ======================================
# Final summary
# ======================================
divider = "=" * 70
print("\n" + divider)
print("[日志] 🎉 导入全部完成！")
for label, count in (("总记录", total), ("成功", success), ("失败", fail)):
    print(f"[日志] {label}：{count}")
# Elapsed time is measured from start_time, i.e. it includes model loading.
print(f"[日志] 总耗时：{round(time.time() - start_time, 2)}s")
print(divider)

# Release the MySQL connection now that every row has been processed.
db.close()
print("\n[日志] 数据库连接已关闭")

2.导入完成后，运行下面的查询脚本试试：

m3e-query.python

python 复制代码

# -*- coding: utf8 -*-
import os
import time
# Point HF_ENDPOINT at the hf-mirror.com mirror. It is assigned before the
# chromadb import below — presumably so the model-download code sees it when
# it is first loaded (TODO confirm).
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

import chromadb
from chromadb.utils import embedding_functions

banner_rule = "=" * 70
print(banner_rule)
print(" 🔍 m3e AI 语义检索系统（带日志版）")
print(banner_rule)

# ======================================
# Load the embedding model
# ======================================
# The same model name is used by the import script that filled the collection.
start_time = time.time()
print("\n[日志] 加载 m3e 模型...")

model_options = {"model_name": "moka-ai/m3e-base"}
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(**model_options)
load_seconds = round(time.time() - start_time, 2)
print(f"[日志] ✅ 模型加载完成，耗时：{load_seconds}s")

# ======================================
# Open the vector store
# ======================================
print("\n[日志] 连接 Chroma...")
client = chromadb.PersistentClient(path="./chroma_db")
# The collection is looked up by name; this script never creates it, so it is
# expected to exist already.
collection_args = {
    "name": "zm_work_records",
    "embedding_function": embedding_func,
}
collection = client.get_collection(**collection_args)
print("[日志] ✅ 连接成功，可以开始查询")

# ======================================
# The question to search for
# ======================================
question = "视频"  # edit this value to ask something else

print(f"\n[日志] 用户问题：{question}")
print("[日志] 正在进行 AI 语义匹配...")

# ======================================
# Run the query
# ======================================
query_start = time.time()
# Ask for the three nearest records, returning their metadata and distances.
query_options = {
    "query_texts": [question],
    "n_results": 3,
    "include": ["metadatas", "distances"],
}
results = collection.query(**query_options)
query_seconds = round(time.time() - query_start, 4)
print(f"[日志] 查询完成，耗时：{query_seconds}s")

# ======================================
# Print the matches with a similarity score
# ======================================
section_rule = "=" * 70
print("\n" + section_rule)
print(" 📊 匹配结果（按相似度排序）")
print(section_rule)

# A single question was submitted, so only the first result list is read.
metadatas = results["metadatas"][0]
distances = results["distances"][0]

# Metadata keys shown for each hit, with their display labels.
display_fields = (
    ("问题标题：", "rem"),
    ("排查思路：", "process"),
    ("解决办法：", "content"),
)

for rank, (meta, dist) in enumerate(zip(metadatas, distances), 1):
    # NOTE(review): 100 - dist * 100 only reads as a percentage when the
    # collection uses a distance bounded like cosine distance — confirm the
    # collection's distance setting.
    similarity = round(100 - (dist * 100), 2)
    print(f"\n【结果 {rank}】| 相似度：{similarity}%")
    for label, key in display_fields:
        print(label, meta[key])

print("\n[日志] 程序正常结束\n")

以上代码中查询的问题关键字是：视频（即 question 变量的值）

二.下载m3e并且调用本地离线版：

1.下载

①安装 huggingface-cli 命令行工具

bash 复制代码

# Install or upgrade the huggingface_hub package (-U upgrades an existing install).
pip install -U huggingface_hub

把 C:\Users\Administrator\AppData\Local\Python\pythoncore-3.14-64\Scripts 加入系统环境变量 PATH，

因为后面要在命令行里使用 huggingface_hub 安装的命令行工具。

执行 hf --version 查看版本（本文为 1.16.4）

②下载m3e

bash 复制代码

# Download one of the m3e models into a local directory.
# On the huggingface_hub release used in this article, `huggingface-cli`
# prints a deprecation warning and no longer works; the replacement is
# `hf download <repo> --local-dir <dir>` with the same arguments.

# Smallest model
huggingface-cli download moka-ai/m3e-small --local-dir ./m3e-small

# Standard model
huggingface-cli download moka-ai/m3e-base --local-dir ./m3e-base

# Largest model
huggingface-cli download moka-ai/m3e-large --local-dir ./m3e-large

执行后提示：Warning: `huggingface-cli` is deprecated and no longer works. Use `hf` instead.

于是改用 hf 命令，例如：hf download moka-ai/m3e-base --local-dir ./m3e-base（注意：下文脚本加载的是 ./m3e-large，需要同样用 hf download moka-ai/m3e-large --local-dir ./m3e-large 下载）

2.执行 py_run_local.python 进行表数据向量化导入

python 复制代码

# -*- coding: utf-8 -*-
import os
import time

# Offline mode: the model is loaded from local files, no network needed.
# NOTE(review): HF_ENDPOINT is still set here although both offline flags are
# enabled — it looks unnecessary in this script; confirm before removing.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"

import pymysql
import chromadb
from chromadb.utils import embedding_functions

# ======================================
# MySQL configuration
# ======================================
# Keyword arguments later passed to pymysql.connect(). Host, user, password
# and database can each be overridden through an environment variable so real
# credentials do not have to be written into the script; when a variable is
# unset the original literal is used.
MYSQL_CONFIG = {
    "host": os.environ.get("MYSQL_HOST", "localhost"),
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": os.environ.get("MYSQL_PASSWORD", "root"),
    "database": os.environ.get("MYSQL_DATABASE", "after260518"),
    "charset": "utf8mb4",
}

print("=" * 70)
print(" 🚀 开始使用 本地 m3e-large 模型进行 AI 向量化导入 ")
print("=" * 70)

# ======================================
# Load the local m3e-large model (offline)
# ======================================
# start_time is also the reference point for the total runtime printed in the
# final summary.
start_time = time.time()
print("\n[日志] 开始加载 本地 m3e-large 模型...")

try:
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="./m3e-large",    # model directory relative to the working directory
        local_files_only=True        # never fall back to downloading
    )
except Exception as e:
    print(f"[日志] ❌ 模型加载失败：{e}")
    # Stop with a non-zero status so callers (shell, scheduler) can detect the
    # failure. The bare exit() helper reports success (status 0) and is only
    # available when the site module is loaded.
    raise SystemExit(1)
else:
    print(f"[日志] ✅ m3e-large 模型加载成功！耗时：{round(time.time() - start_time, 2)}s")

# ======================================
# Connect to Chroma
# ======================================
print("\n[日志] 正在连接 Chroma 向量库...")
client = chromadb.PersistentClient(path="./chroma_db")

# Start from an empty collection on every run. Deleting is best effort: on the
# first run there is nothing to delete, so a failure is reported and ignored.
# Catching Exception (not a bare except) keeps Ctrl-C and SystemExit working.
try:
    client.delete_collection("zm_work_records")
except Exception as e:
    print(f"[日志] 未删除旧集合（可能尚不存在）：{e}")
else:
    print("[日志] 已清空旧数据，重新创建...")

# Use cosine distance. Per the Chroma documentation a collection defaults to
# squared-L2 distance, which is unbounded for non-normalised embeddings, while
# the query scripts turn the distance into a percentage with 100 - dist * 100.
# With cosine distance (1 - cosine similarity) that formula is meaningful.
collection = client.get_or_create_collection(
    name="zm_work_records",
    embedding_function=embedding_func,
    metadata={"hnsw:space": "cosine"},
)
print("[日志] ✅ Chroma 连接就绪")

# ======================================
# Connect to MySQL
# ======================================
print("\n[日志] 正在连接 MySQL...")
try:
    db = pymysql.connect(**MYSQL_CONFIG)
    # DictCursor yields each row as a dict keyed by column name/alias, which
    # the import loop relies on through row.get(...).
    cursor = db.cursor(pymysql.cursors.DictCursor)
except Exception as e:
    print(f"[日志] ❌ MySQL 连接失败：{e}")
    # Non-zero status: the bare exit() helper would report success (0).
    raise SystemExit(1)
else:
    print("[日志] ✅ MySQL 连接成功")

# ======================================
# Read the source rows (joined with the mine table for the mine name)
# ======================================
print("\n[日志] 正在读取数据表 zm_summary_all...")
# LEFT JOIN keeps summary rows that have no matching mine; their minename
# column is then NULL.
select_sql = (
    "SELECT m.mine_name as minename, a.id, a.rem, a.description, a.process, a.content "
    "FROM zm_summary_all a left join zm_mine m on m.mine_id = a.mine_id "
)
cursor.execute(select_sql)
rows = cursor.fetchall()
total = len(rows)
print(f"[日志] ✅ 数据读取完成，共 {total} 条记录")

# ======================================
# Import the rows into Chroma
# ======================================
print("\n[日志] 开始 AI 向量化导入...")
success = 0
fail = 0


def _as_text(value):
    """Return *value* as text, mapping SQL NULL (``None``) to ``""``.

    ``str(None)`` would produce the literal ``"None"``, which would then be
    embedded and stored as if it were real data.
    """
    return "" if value is None else str(value)


for index, row in enumerate(rows, 1):
    try:
        # A NULL column arrives as None under its key (the LEFT JOIN makes
        # this likely for minename), so a .get() default alone is not enough.
        minename = _as_text(row.get("minename"))
        doc_id = _as_text(row.get("id"))
        title = _as_text(row.get("rem"))
        desc = _as_text(row.get("description"))
        process = _as_text(row.get("process"))
        content = _as_text(row.get("content"))

        # Only mine name, title and description are embedded; process and
        # content travel along as metadata for display at query time.
        text = f"矿名：{minename} 标题：{title} 描述：{desc}"

        metadata = {
            "minename": minename,
            "rem": title,
            "description": desc,
            "process": process,
            "content": content,
        }

        # One add() per row keeps failures isolated to the offending record.
        collection.add(
            ids=[doc_id],
            documents=[text],
            metadatas=[metadata],
        )

        print(f"[日志] 第 {index}/{total} 条 | ID:{doc_id} 导入成功")
        success += 1

    except Exception as e:
        # Count the failure and continue with the remaining rows.
        print(f"[日志] 第 {index} 条导入失败：{str(e)}")
        fail += 1

# ======================================
# Final summary
# ======================================
divider = "=" * 70
print("\n" + divider)
print("[日志] 🎉 导入全部完成！")
for label, count in (("总记录", total), ("成功", success), ("失败", fail)):
    print(f"[日志] {label}：{count}")
# Elapsed time is measured from start_time, i.e. it includes model loading.
print(f"[日志] 总耗时：{round(time.time() - start_time, 2)}s")
print(divider)

# Release the MySQL connection now that every row has been processed.
db.close()
print("\n[日志] 数据库连接已关闭")

3.执行问题查询

python 复制代码

# -*- coding: utf8 -*-
import os
import time

# Force offline mode: no network access and no downloads.
# Both flags are assigned before the chromadb import below — presumably so the
# model-loading code sees them when it is first loaded (TODO confirm).
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"

import chromadb
from chromadb.utils import embedding_functions

banner_rule = "=" * 70
print(banner_rule)
print(" 🔍 本地离线 m3e-large AI 语义检索系统（带日志）")
print(banner_rule)

# ======================================
# Load the local m3e-large model (offline)
# ======================================
start_time = time.time()
print("\n[日志] 加载本地离线 m3e-large 模型...")

model_options = {
    "model_name": "./m3e-large",   # model directory relative to the working directory
    "local_files_only": True,      # never fall back to downloading
}
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(**model_options)
load_seconds = round(time.time() - start_time, 2)
print(f"[日志] ✅ 模型加载完成，耗时：{load_seconds}s")

# ======================================
# Open the vector store
# ======================================
print("\n[日志] 连接 Chroma...")
client = chromadb.PersistentClient(path="./chroma_db")
# The collection is looked up by name; this script never creates it, so it is
# expected to exist already.
collection_args = {
    "name": "zm_work_records",
    "embedding_function": embedding_func,
}
collection = client.get_collection(**collection_args)
print("[日志] ✅ 连接成功，可以开始查询")

# ======================================
# The question to search for
# ======================================
question = "视频"  # edit this value to ask something else

print(f"\n[日志] 用户问题：{question}")
print("[日志] 正在进行 AI 语义匹配...")

# ======================================
# Run the query
# ======================================
query_start = time.time()
# Ask for the three nearest records, returning their metadata and distances.
query_options = {
    "query_texts": [question],
    "n_results": 3,
    "include": ["metadatas", "distances"],
}
results = collection.query(**query_options)
query_seconds = round(time.time() - query_start, 4)
print(f"[日志] 查询完成，耗时：{query_seconds}s")

# ======================================
# Print the matches with a similarity score
# ======================================
section_rule = "=" * 70
print("\n" + section_rule)
print(" 📊 匹配结果（按相似度排序）")
print(section_rule)

# A single question was submitted, so only the first result list is read.
metadatas = results["metadatas"][0]
distances = results["distances"][0]

# Metadata keys shown for each hit, with their display labels.
display_fields = (
    ("问题标题：", "rem"),
    ("排查思路：", "process"),
    ("解决办法：", "content"),
)

for rank, (meta, dist) in enumerate(zip(metadatas, distances), 1):
    # NOTE(review): 100 - dist * 100 only reads as a percentage when the
    # collection uses a distance bounded like cosine distance — confirm the
    # collection's distance setting.
    similarity = round(100 - (dist * 100), 2)
    print(f"\n【结果 {rank}】| 相似度：{similarity}%")
    for label, key in display_fields:
        print(label, meta[key])

print("\n[日志] 程序正常结束\n")