m3e向量化mysql某表

一.在线请求m3e

1.写个m3e-run.python

python 复制代码
# -*- coding: utf-8 -*-
import os
import time
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

import pymysql
import chromadb
from chromadb.utils import embedding_functions

# ======================================
# MySQL configuration
# ======================================
# Connection settings passed as keyword arguments to pymysql.connect().
# Every credential can be overridden through an environment variable so the
# script does not have to be edited (or committed) with real passwords; when
# a variable is unset the original hard-coded default is used.
MYSQL_CONFIG = {
    "host": os.environ.get("MYSQL_HOST", "localhost"),
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": os.environ.get("MYSQL_PASSWORD", "root"),
    "database": os.environ.get("MYSQL_DATABASE", "after260518"),
    "charset": "utf8mb4",
}

print("=" * 70)
print(" 🚀 开始使用 m3e 模型进行 AI 向量化导入 ")
print("=" * 70)

# ======================================
# Load the m3e model
# ======================================
# start_time is also read by the final summary to report the total run time.
start_time = time.time()
print("\n[日志] 开始加载 m3e 中文语义模型...")

try:
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="moka-ai/m3e-base"
    )
except Exception as e:
    print(f"[日志] ❌ 模型加载失败:{e}")
    # Exit with a non-zero status so a shell or scheduler can tell the run
    # failed. The bare exit() helper reports success (status 0) and is only
    # available when the site module has been loaded.
    raise SystemExit(1) from e
else:
    print(f"[日志] ✅ m3e 模型加载成功!耗时:{round(time.time() - start_time, 2)}s")

# ======================================
# Connect to Chroma
# ======================================
print("\n[日志] 正在连接 Chroma 向量库...")
client = chromadb.PersistentClient(path="./chroma_db")

# Start every run from an empty collection. A failure to delete (typically
# because the collection does not exist yet on the first run) is not fatal,
# but it is logged rather than silently swallowed, and KeyboardInterrupt /
# SystemExit are no longer intercepted as they were by the bare `except:`.
try:
    client.delete_collection("zm_work_records")
except Exception as e:
    print(f"[日志] 未删除旧集合(可能尚不存在):{e}")
else:
    print("[日志] 已清空旧数据,重新创建...")

# The query script turns a distance into a percentage with `100 - dist * 100`.
# That is only meaningful for cosine distance (1 - cosine similarity), so the
# index is created with the cosine space instead of Chroma's default metric.
collection = client.get_or_create_collection(
    name="zm_work_records",
    metadata={"hnsw:space": "cosine"},
    embedding_function=embedding_func
)
print("[日志] ✅ Chroma 连接就绪")

# ======================================
# Connect to MySQL
# ======================================
print("\n[日志] 正在连接 MySQL...")
try:
    db = pymysql.connect(**MYSQL_CONFIG)
    # DictCursor returns each row as a dict keyed by column name / alias.
    cursor = db.cursor(pymysql.cursors.DictCursor)
    print("[日志] ✅ MySQL 连接成功")
except Exception as e:
    print(f"[日志] ❌ MySQL 连接失败:{e}")
    # Non-zero status: the bare exit() helper would report success.
    raise SystemExit(1) from e

# ======================================
# Read the data (joined with zm_mine to obtain the mine name)
# ======================================
print("\n[日志] 正在读取数据表 zm_summary_all...")
try:
    # LEFT JOIN: rows of zm_summary_all without a matching zm_mine row are
    # still returned, with minename set to NULL.
    cursor.execute("SELECT m.mine_name as minename, a.id, a.rem, a.description, a.process, a.content FROM zm_summary_all a left join zm_mine m on m.mine_id = a.mine_id ")
    rows = cursor.fetchall()
except Exception as e:
    print(f"[日志] ❌ 数据读取失败:{e}")
    # The script stops here, so release the connection before exiting; on the
    # success path the connection is closed at the end of the script.
    db.close()
    raise SystemExit(1) from e
total = len(rows)
print(f"[日志] ✅ 数据读取完成,共 {total} 条记录")

# ======================================
# Run the import
# ======================================
def _to_text(value):
    """Return ``value`` as a string, mapping SQL NULL (``None``) to ``""``."""
    return "" if value is None else str(value)


print("\n[日志] 开始 AI 向量化导入...")
success = 0
fail = 0

for index, row in enumerate(rows, 1):
    try:
        # A NULL column arrives as a key whose value is None, not as a
        # missing key, so a `.get(key, "")` default never applies and
        # str(None) would store the literal text "None". Convert explicitly.
        minename = _to_text(row.get("minename"))
        doc_id = str(row.get("id", ""))
        title = _to_text(row.get("rem"))
        desc = _to_text(row.get("description"))
        process = _to_text(row.get("process"))
        content = _to_text(row.get("content"))

        # Only mine name, title and description are embedded; the remaining
        # columns travel along as metadata for display at query time.
        text = f"矿名:{minename} 标题:{title} 描述:{desc}"

        metadata = {
            "minename": minename,
            "rem": title,
            "description": desc,
            "process": process,
            "content": content
        }

        collection.add(
            ids=[doc_id],
            documents=[text],
            metadatas=[metadata]
        )

        print(f"[日志] 第 {index}/{total} 条 | ID:{doc_id} 导入成功")
        success += 1

    except Exception as e:
        # One bad row must not abort the whole import: count it and go on.
        print(f"[日志] 第 {index} 条导入失败:{str(e)}")
        fail += 1

# ======================================
# Final summary
# ======================================
summary_rule = "=" * 70

print("\n" + summary_rule)
print("[日志] 🎉 导入全部完成!")
print(f"[日志] 总记录:{total}")
print(f"[日志] 成功:{success}")
print(f"[日志] 失败:{fail}")
# Total run time is measured from the start of model loading.
elapsed_seconds = round(time.time() - start_time, 2)
print(f"[日志] 总耗时:{elapsed_seconds}s")
print(summary_rule)

# Release the MySQL connection now that every row has been processed.
db.close()
print("\n[日志] 数据库连接已关闭")

2.运行试试:

m3e-query.python

python 复制代码
# -*- coding: utf8 -*-
import os
import time
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

import chromadb
from chromadb.utils import embedding_functions

banner_rule = "=" * 70
print(banner_rule)
print(" 🔍 m3e AI 语义检索系统(带日志版)")
print(banner_rule)

# ======================================
# Load the model
# ======================================
start_time = time.time()
print("\n[日志] 加载 m3e 模型...")

embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="moka-ai/m3e-base",
)
load_seconds = round(time.time() - start_time, 2)
print(f"[日志] ✅ 模型加载完成,耗时:{load_seconds}s")

# ======================================
# Connect to the vector store
# ======================================
print("\n[日志] 连接 Chroma...")
client = chromadb.PersistentClient(path="./chroma_db")
# Open the existing collection with the same embedding function so that the
# query text is embedded by the model loaded above.
collection = client.get_collection(
    name="zm_work_records",
    embedding_function=embedding_func,
)
print("[日志] ✅ 连接成功,可以开始查询")

# ======================================
# The question to ask
# ======================================
question = "视频"  # change the question here

print(f"\n[日志] 用户问题:{question}")
print("[日志] 正在进行 AI 语义匹配...")

# ======================================
# Run the query
# ======================================
# Ask for the three nearest records, returning metadata and distances only.
query_kwargs = {
    "query_texts": [question],
    "n_results": 3,
    "include": ["metadatas", "distances"],
}
query_start = time.time()
results = collection.query(**query_kwargs)
query_seconds = round(time.time() - query_start, 4)
print(f"[日志] 查询完成,耗时:{query_seconds}s")

# ======================================
# Print the results (with a similarity score)
# ======================================
result_rule = "=" * 70
print("\n" + result_rule)
print(" 📊 匹配结果(按相似度排序)")
print(result_rule)

# A single query text was sent, so only the first result list is relevant.
metadatas = results["metadatas"][0]
distances = results["distances"][0]

rank = 0
for meta, dist in zip(metadatas, distances):
    rank += 1
    # Convert the distance into a percentage.
    # NOTE(review): this is only a meaningful percentage if the collection
    # was created with cosine distance — confirm how it was created.
    similarity = round(100 - (dist * 100), 2)
    print(f"\n【结果 {rank}】| 相似度:{similarity}%")
    print("问题标题:", meta["rem"])
    print("排查思路:", meta["process"])
    print("解决办法:", meta["content"])

print("\n[日志] 程序正常结束\n")

以上代码问题关键字:视频

二.下载m3e并且调用本地离线版:

1.下载

①下载huggingface‑cli

bash 复制代码
pip install -U huggingface_hub

配置path C:\Users\Administrator\AppData\Local\Python\pythoncore-3.14-64\Scripts

因为要用 huggingface_hub 提供的 hf 命令

hf --version 查看版本1.16.4

②下载m3e

bash 复制代码
最小版
huggingface-cli download moka-ai/m3e-small --local-dir ./m3e-small

标准版
huggingface-cli download moka-ai/m3e-base --local-dir ./m3e-base

最大版
huggingface-cli download moka-ai/m3e-large --local-dir ./m3e-large

提示 Warning: `huggingface-cli` is deprecated and no longer works. Use `hf` instead.

改成 hf download moka-ai/m3e-base --local-dir ./m3e-base(small、large 同理;下文脚本加载的是 ./m3e-large,需相应执行 hf download moka-ai/m3e-large --local-dir ./m3e-large)

2.执行导入脚本 py_run_local.python,对表数据进行向量化

python 复制代码
# -*- coding: utf-8 -*-
import os
import time

# 不用联网,直接本地加载
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"

import pymysql
import chromadb
from chromadb.utils import embedding_functions

# ======================================
# MySQL configuration
# ======================================
# Connection settings passed as keyword arguments to pymysql.connect().
# Every credential can be overridden through an environment variable so the
# script does not have to be edited (or committed) with real passwords; when
# a variable is unset the original hard-coded default is used.
MYSQL_CONFIG = {
    "host": os.environ.get("MYSQL_HOST", "localhost"),
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": os.environ.get("MYSQL_PASSWORD", "root"),
    "database": os.environ.get("MYSQL_DATABASE", "after260518"),
    "charset": "utf8mb4",
}

print("=" * 70)
print(" 🚀 开始使用 本地 m3e-large 模型进行 AI 向量化导入 ")
print("=" * 70)

# ======================================
# Load the local m3e-large model (offline)
# ======================================
# start_time is also read by the final summary to report the total run time.
start_time = time.time()
print("\n[日志] 开始加载 本地 m3e-large 模型...")

try:
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="./m3e-large",    # local copy of the largest model
        local_files_only=True        # force offline loading
    )
except Exception as e:
    print(f"[日志] ❌ 模型加载失败:{e}")
    # Exit with a non-zero status so a shell or scheduler can tell the run
    # failed. The bare exit() helper reports success (status 0) and is only
    # available when the site module has been loaded.
    raise SystemExit(1) from e
else:
    print(f"[日志] ✅ m3e-large 模型加载成功!耗时:{round(time.time() - start_time, 2)}s")

# ======================================
# Connect to Chroma
# ======================================
print("\n[日志] 正在连接 Chroma 向量库...")
client = chromadb.PersistentClient(path="./chroma_db")

# Start every run from an empty collection. A failure to delete (typically
# because the collection does not exist yet on the first run) is not fatal,
# but it is logged rather than silently swallowed, and KeyboardInterrupt /
# SystemExit are no longer intercepted as they were by the bare `except:`.
try:
    client.delete_collection("zm_work_records")
except Exception as e:
    print(f"[日志] 未删除旧集合(可能尚不存在):{e}")
else:
    print("[日志] 已清空旧数据,重新创建...")

# The query script turns a distance into a percentage with `100 - dist * 100`.
# That is only meaningful for cosine distance (1 - cosine similarity), so the
# index is created with the cosine space instead of Chroma's default metric.
collection = client.get_or_create_collection(
    name="zm_work_records",
    metadata={"hnsw:space": "cosine"},
    embedding_function=embedding_func
)
print("[日志] ✅ Chroma 连接就绪")

# ======================================
# Connect to MySQL
# ======================================
print("\n[日志] 正在连接 MySQL...")
try:
    db = pymysql.connect(**MYSQL_CONFIG)
    # DictCursor returns each row as a dict keyed by column name / alias.
    cursor = db.cursor(pymysql.cursors.DictCursor)
    print("[日志] ✅ MySQL 连接成功")
except Exception as e:
    print(f"[日志] ❌ MySQL 连接失败:{e}")
    # Non-zero status: the bare exit() helper would report success.
    raise SystemExit(1) from e

# ======================================
# Read the data (joined with zm_mine to obtain the mine name)
# ======================================
print("\n[日志] 正在读取数据表 zm_summary_all...")
try:
    # LEFT JOIN: rows of zm_summary_all without a matching zm_mine row are
    # still returned, with minename set to NULL.
    cursor.execute("SELECT m.mine_name as minename, a.id, a.rem, a.description, a.process, a.content FROM zm_summary_all a left join zm_mine m on m.mine_id = a.mine_id ")
    rows = cursor.fetchall()
except Exception as e:
    print(f"[日志] ❌ 数据读取失败:{e}")
    # The script stops here, so release the connection before exiting; on the
    # success path the connection is closed at the end of the script.
    db.close()
    raise SystemExit(1) from e
total = len(rows)
print(f"[日志] ✅ 数据读取完成,共 {total} 条记录")

# ======================================
# Run the import
# ======================================
def _to_text(value):
    """Return ``value`` as a string, mapping SQL NULL (``None``) to ``""``."""
    return "" if value is None else str(value)


print("\n[日志] 开始 AI 向量化导入...")
success = 0
fail = 0

for index, row in enumerate(rows, 1):
    try:
        # A NULL column arrives as a key whose value is None, not as a
        # missing key, so a `.get(key, "")` default never applies and
        # str(None) would store the literal text "None". Convert explicitly.
        minename = _to_text(row.get("minename"))
        doc_id = str(row.get("id", ""))
        title = _to_text(row.get("rem"))
        desc = _to_text(row.get("description"))
        process = _to_text(row.get("process"))
        content = _to_text(row.get("content"))

        # Only mine name, title and description are embedded; the remaining
        # columns travel along as metadata for display at query time.
        text = f"矿名:{minename} 标题:{title} 描述:{desc}"

        metadata = {
            "minename": minename,
            "rem": title,
            "description": desc,
            "process": process,
            "content": content
        }

        collection.add(
            ids=[doc_id],
            documents=[text],
            metadatas=[metadata]
        )

        print(f"[日志] 第 {index}/{total} 条 | ID:{doc_id} 导入成功")
        success += 1

    except Exception as e:
        # One bad row must not abort the whole import: count it and go on.
        print(f"[日志] 第 {index} 条导入失败:{str(e)}")
        fail += 1

# ======================================
# Final summary
# ======================================
summary_rule = "=" * 70

print("\n" + summary_rule)
print("[日志] 🎉 导入全部完成!")
print(f"[日志] 总记录:{total}")
print(f"[日志] 成功:{success}")
print(f"[日志] 失败:{fail}")
# Total run time is measured from the start of model loading.
elapsed_seconds = round(time.time() - start_time, 2)
print(f"[日志] 总耗时:{elapsed_seconds}s")
print(summary_rule)

# Release the MySQL connection now that every row has been processed.
db.close()
print("\n[日志] 数据库连接已关闭")

3.执行问题查询

python 复制代码
# -*- coding: utf8 -*-
import os
import time

# 强制离线模式,不联网、不下载
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"

import chromadb
from chromadb.utils import embedding_functions

banner_rule = "=" * 70
print(banner_rule)
print(" 🔍 本地离线 m3e-large AI 语义检索系统(带日志)")
print(banner_rule)

# ======================================
# Load the local offline m3e-large model
# ======================================
start_time = time.time()
print("\n[日志] 加载本地离线 m3e-large 模型...")

embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="./m3e-large",  # local copy of the largest model
    local_files_only=True,     # force offline loading
)
load_seconds = round(time.time() - start_time, 2)
print(f"[日志] ✅ 模型加载完成,耗时:{load_seconds}s")

# ======================================
# Connect to the vector store
# ======================================
print("\n[日志] 连接 Chroma...")
client = chromadb.PersistentClient(path="./chroma_db")
# Open the existing collection with the same embedding function so that the
# query text is embedded by the model loaded above.
collection = client.get_collection(
    name="zm_work_records",
    embedding_function=embedding_func,
)
print("[日志] ✅ 连接成功,可以开始查询")

# ======================================
# The question to ask
# ======================================
question = "视频"  # change the question here

print(f"\n[日志] 用户问题:{question}")
print("[日志] 正在进行 AI 语义匹配...")

# ======================================
# Run the query
# ======================================
# Ask for the three nearest records, returning metadata and distances only.
query_kwargs = {
    "query_texts": [question],
    "n_results": 3,
    "include": ["metadatas", "distances"],
}
query_start = time.time()
results = collection.query(**query_kwargs)
query_seconds = round(time.time() - query_start, 4)
print(f"[日志] 查询完成,耗时:{query_seconds}s")

# ======================================
# Print the results (with a similarity score)
# ======================================
result_rule = "=" * 70
print("\n" + result_rule)
print(" 📊 匹配结果(按相似度排序)")
print(result_rule)

# A single query text was sent, so only the first result list is relevant.
metadatas = results["metadatas"][0]
distances = results["distances"][0]

rank = 0
for meta, dist in zip(metadatas, distances):
    rank += 1
    # Convert the distance into a percentage.
    # NOTE(review): this is only a meaningful percentage if the collection
    # was created with cosine distance — confirm how it was created.
    similarity = round(100 - (dist * 100), 2)
    print(f"\n【结果 {rank}】| 相似度:{similarity}%")
    print("问题标题:", meta["rem"])
    print("排查思路:", meta["process"])
    print("解决办法:", meta["content"])

print("\n[日志] 程序正常结束\n")
相关推荐
我是一颗柠檬14 小时前
【MySQL全面教学】MySQL视图与触发器Day12(2026年)
数据库·后端·mysql
索西引擎14 小时前
【LangChain 1.0】接入 DeepSeek API:从 API Key 申请到流式响应的完整实践
android·java·langchain
山峰哥15 小时前
索引策略与SQL优化:从Explain对比到生产调优的完整方法论
android·java·数据库·sql·性能优化·深度优先
qq_4523962315 小时前
第八篇:《Dockerfile 指令精讲(一):FROM、RUN、COPY、ADD》
数据库·docker·postgresql
woniu_buhui_fei15 小时前
Redis实现分布式限流
数据库·redis·分布式
二蛋和他的大花15 小时前
高德地图 Flutter 插件:跨 Android / iOS / HarmonyOS 的完整实现
android·flutter·ios
2601_9574188015 小时前
相机如何连接手机?通俗易懂的PTP/MTP连接原理解析
android·数码相机·架构
杜子不疼.15 小时前
从“能用“到“敢用“:DolphinDB 通过国家安全可靠测评,时序数据库国产替代迈入新阶段
数据库·oracle·时序数据库
量子-Alex15 小时前
【大模型智能体】A practical guide to building agents
大数据·数据库·人工智能