一.在线请求m3e
1.写个m3e-run.python
python
# -*- coding: utf-8 -*-
import os
import time
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
import pymysql
import chromadb
from chromadb.utils import embedding_functions
# ======================================
# MySQL configuration
# ======================================
# Each connection setting can be overridden with an environment variable so
# that credentials do not have to be edited into this script.  The fallback
# values are the original hard-coded ones, so running it unchanged still works.
MYSQL_CONFIG = {
    "host": os.environ.get("MYSQL_HOST", "localhost"),
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": os.environ.get("MYSQL_PASSWORD", "root"),
    "database": os.environ.get("MYSQL_DATABASE", "after260518"),
    "charset": "utf8mb4",
}

print("=" * 70)
print(" 🚀 开始使用 m3e 模型进行 AI 向量化导入 ")
print("=" * 70)
# ======================================
# Load the m3e model
# ======================================
# start_time is also the reference point for the total elapsed time that is
# printed at the end of the script.
start_time = time.time()
print(f"\n[日志] 开始加载 m3e 中文语义模型...")
try:
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="moka-ai/m3e-base"
    )
except Exception as e:
    print(f"[日志] ❌ 模型加载失败:{e}")
    # Leave with a non-zero status so a caller can tell the run failed;
    # the plain exit() used previously reported success (status 0).
    raise SystemExit(1)
else:
    print(f"[日志] ✅ m3e 模型加载成功!耗时:{round(time.time() - start_time, 2)}s")
# ======================================
# Connect to Chroma
# ======================================
print("\n[日志] 正在连接 Chroma 向量库...")
client = chromadb.PersistentClient(path="./chroma_db")
# Every run starts from an empty collection.  Deleting is best-effort: it is
# expected to fail when the collection does not exist yet (first run), but
# the reason is logged instead of being silently discarded by a bare except.
try:
    client.delete_collection("zm_work_records")
except Exception as e:
    print(f"[日志] 未删除旧集合(首次运行可忽略):{e}")
else:
    print("[日志] 已清空旧数据,重新创建...")
# The query script reports `100 - distance * 100` as a similarity percentage.
# That is only meaningful for cosine distance, whereas Chroma's default space
# is squared L2, so the collection is created with the cosine space.
# NOTE(review): newer Chroma releases also offer a `configuration=` argument
# for this setting — confirm which form the installed version prefers.
collection = client.get_or_create_collection(
    name="zm_work_records",
    embedding_function=embedding_func,
    metadata={"hnsw:space": "cosine"},
)
print("[日志] ✅ Chroma 连接就绪")
# ======================================
# Connect to MySQL
# ======================================
print("\n[日志] 正在连接 MySQL...")
try:
    db = pymysql.connect(**MYSQL_CONFIG)
    # DictCursor yields each row as a dict keyed by column name / alias,
    # which is what the import loop relies on when it calls row.get(...).
    cursor = db.cursor(pymysql.cursors.DictCursor)
except Exception as e:
    print(f"[日志] ❌ MySQL 连接失败:{e}")
    # Non-zero status: the plain exit() used previously reported success.
    raise SystemExit(1)
else:
    print("[日志] ✅ MySQL 连接成功")
# ======================================
# Read the source rows (joined with the mine table to get the mine name)
# ======================================
print("\n[日志] 正在读取数据表 zm_summary_all...")
# Adjacent string literals are joined at compile time, so the statement sent
# to MySQL is character-for-character the original single-line query.
query_sql = (
    "SELECT m.mine_name as minename, a.id, a.rem, a.description, a.process, a.content "
    "FROM zm_summary_all a left join zm_mine m on m.mine_id = a.mine_id "
)
cursor.execute(query_sql)
rows = cursor.fetchall()
total = len(rows)
print(f"[日志] ✅ 数据读取完成,共 {total} 条记录")
# ======================================
# Run the import
# ======================================
def _as_text(value):
    """Return *value* as a string, mapping SQL NULL (None) to an empty string."""
    return "" if value is None else str(value)


print("\n[日志] 开始 AI 向量化导入...")
success = 0
fail = 0
for index, row in enumerate(rows, 1):
    try:
        # A NULL column arrives as None rather than as a missing key, so a
        # .get() default never applies and str(None) would store the literal
        # text "None".  minename in particular is NULL whenever the LEFT JOIN
        # finds no matching mine.
        minename = _as_text(row.get("minename"))
        doc_id = str(row.get("id", ""))
        title = _as_text(row.get("rem"))
        desc = _as_text(row.get("description"))
        process = _as_text(row.get("process"))
        content = _as_text(row.get("content"))
        # Only mine name, title and description are embedded; process and
        # content travel along as metadata so a query can display them.
        text = f"矿名:{minename} 标题:{title} 描述:{desc}"
        metadata = {
            "minename": minename,
            "rem": title,
            "description": desc,
            "process": process,
            "content": content,
        }
        collection.add(
            ids=[doc_id],
            documents=[text],
            metadatas=[metadata],
        )
        print(f"[日志] 第 {index}/{total} 条 | ID:{doc_id} 导入成功")
        success += 1
    except Exception as e:
        # One bad row must not abort the whole import: count it and go on.
        print(f"[日志] 第 {index} 条导入失败:{str(e)}")
        fail += 1
# ======================================
# Final summary
# ======================================
divider = "=" * 70
print("\n" + divider)
for summary_line in (
    "[日志] 🎉 导入全部完成!",
    f"[日志] 总记录:{total}",
    f"[日志] 成功:{success}",
    f"[日志] 失败:{fail}",
):
    print(summary_line)
# Elapsed time is measured from start_time, i.e. it includes model loading.
elapsed = round(time.time() - start_time, 2)
print(f"[日志] 总耗时:{elapsed}s")
print(divider)
db.close()
print("\n[日志] 数据库连接已关闭")
2.运行试试:
m3e-query.python
python
# -*- coding: utf8 -*-
import os
import time
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
import chromadb
from chromadb.utils import embedding_functions
banner_rule = "=" * 70
print(banner_rule)
print(" 🔍 m3e AI 语义检索系统(带日志版)")
print(banner_rule)
# ======================================
# Load the model
# ======================================
start_time = time.time()
print("\n[日志] 加载 m3e 模型...")
model_id = "moka-ai/m3e-base"
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=model_id
)
load_seconds = round(time.time() - start_time, 2)
print(f"[日志] ✅ 模型加载完成,耗时:{load_seconds}s")
# ======================================
# Connect to the vector store
# ======================================
print("\n[日志] 连接 Chroma...")
client = chromadb.PersistentClient(path="./chroma_db")
# NOTE(review): presumably this fails if the import script has not created
# the collection yet — confirm against the installed Chroma version.
collection_args = {
    "name": "zm_work_records",
    "embedding_function": embedding_func,
}
collection = client.get_collection(**collection_args)
print("[日志] ✅ 连接成功,可以开始查询")
# ======================================
# The question to ask
# ======================================
question = "视频"  # change the question here
print(f"\n[日志] 用户问题:{question}")
print("[日志] 正在进行 AI 语义匹配...")
# ======================================
# Run the query
# ======================================
query_start = time.time()
# Ask for the three best matches, returning only metadata and distances.
query_kwargs = dict(
    query_texts=[question],
    n_results=3,
    include=["metadatas", "distances"],
)
results = collection.query(**query_kwargs)
query_seconds = round(time.time() - query_start, 4)
print(f"[日志] 查询完成,耗时:{query_seconds}s")
# ======================================
# Print the results (with a similarity score)
# ======================================
print("\n" + "=" * 70)
print(" 📊 匹配结果(按相似度排序)")
print("=" * 70)
# Only one query text was sent, so the hits are in the first (only) entry.
metadatas = results["metadatas"][0]
distances = results["distances"][0]
for idx, (meta, dist) in enumerate(zip(metadatas, distances), 1):
    # Convert the distance into a percentage.
    # NOTE(review): this only reads as a similarity if the collection uses
    # cosine distance; with another distance space the value can fall far
    # outside 0-100 — confirm how the collection was created.
    similarity = round(100 - (dist * 100), 2)
    print(f"\n【结果 {idx}】| 相似度:{similarity}%")
    print("问题标题:", meta["rem"])
    print("排查思路:", meta["process"])
    print("解决办法:", meta["content"])
print("\n[日志] 程序正常结束\n")
以上代码问题关键字:视频
二.下载m3e并且调用本地离线版:
1.下载
①下载huggingface-cli
bash
pip install -U huggingface_hub
把 C:\Users\Administrator\AppData\Local\Python\pythoncore-3.14-64\Scripts 加入系统环境变量 PATH,
因为后面要在命令行里直接使用 huggingface_hub 提供的 hf 命令。
配置好后执行 hf --version 查看版本(本机为 1.16.4)。
②下载m3e
bash
最小版
huggingface-cli download moka-ai/m3e-small --local-dir ./m3e-small
标准版
huggingface-cli download moka-ai/m3e-base --local-dir ./m3e-base
最大版
huggingface-cli download moka-ai/m3e-large --local-dir ./m3e-large
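提示 Warning: `huggingface-cli` is deprecated and no longer works. Use `hf` instead.
改成 hf download moka-ai/m3e-base --local-dir ./m3e-base(small、large 同理,把模型名和目录名一起换掉;后面的离线脚本加载的是 ./m3e-large,所以还需要执行 hf download moka-ai/m3e-large --local-dir ./m3e-large)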
2.执行 py_run_local.python,用本地模型对表数据进行向量化导入
python
# -*- coding: utf-8 -*-
import os
import time
# 不用联网,直接本地加载
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"
import pymysql
import chromadb
from chromadb.utils import embedding_functions
# ======================================
# MySQL configuration
# ======================================
# Each connection setting can be overridden with an environment variable so
# that credentials do not have to be edited into this script.  The fallback
# values are the original hard-coded ones, so running it unchanged still works.
MYSQL_CONFIG = {
    "host": os.environ.get("MYSQL_HOST", "localhost"),
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": os.environ.get("MYSQL_PASSWORD", "root"),
    "database": os.environ.get("MYSQL_DATABASE", "after260518"),
    "charset": "utf8mb4",
}

print("=" * 70)
print(" 🚀 开始使用 本地 m3e-large 模型进行 AI 向量化导入 ")
print("=" * 70)
# ======================================
# Load the local m3e-large model (offline)
# ======================================
# start_time is also the reference point for the total elapsed time that is
# printed at the end of the script.
start_time = time.time()
print(f"\n[日志] 开始加载 本地 m3e-large 模型...")
try:
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="./m3e-large",  # local copy of the largest model variant
        local_files_only=True,  # force offline loading
    )
except Exception as e:
    print(f"[日志] ❌ 模型加载失败:{e}")
    # Leave with a non-zero status so a caller can tell the run failed;
    # the plain exit() used previously reported success (status 0).
    raise SystemExit(1)
else:
    print(f"[日志] ✅ m3e-large 模型加载成功!耗时:{round(time.time() - start_time, 2)}s")
# ======================================
# Connect to Chroma
# ======================================
print("\n[日志] 正在连接 Chroma 向量库...")
client = chromadb.PersistentClient(path="./chroma_db")
# Every run starts from an empty collection.  Deleting is best-effort: it is
# expected to fail when the collection does not exist yet (first run), but
# the reason is logged instead of being silently discarded by a bare except.
try:
    client.delete_collection("zm_work_records")
except Exception as e:
    print(f"[日志] 未删除旧集合(首次运行可忽略):{e}")
else:
    print("[日志] 已清空旧数据,重新创建...")
# The query script reports `100 - distance * 100` as a similarity percentage.
# That is only meaningful for cosine distance, whereas Chroma's default space
# is squared L2, so the collection is created with the cosine space.
# NOTE(review): newer Chroma releases also offer a `configuration=` argument
# for this setting — confirm which form the installed version prefers.
collection = client.get_or_create_collection(
    name="zm_work_records",
    embedding_function=embedding_func,
    metadata={"hnsw:space": "cosine"},
)
print("[日志] ✅ Chroma 连接就绪")
# ======================================
# Connect to MySQL
# ======================================
print("\n[日志] 正在连接 MySQL...")
try:
    db = pymysql.connect(**MYSQL_CONFIG)
    # DictCursor yields each row as a dict keyed by column name / alias,
    # which is what the import loop relies on when it calls row.get(...).
    cursor = db.cursor(pymysql.cursors.DictCursor)
except Exception as e:
    print(f"[日志] ❌ MySQL 连接失败:{e}")
    # Non-zero status: the plain exit() used previously reported success.
    raise SystemExit(1)
else:
    print("[日志] ✅ MySQL 连接成功")
# ======================================
# Read the source rows (joined with the mine table to get the mine name)
# ======================================
print("\n[日志] 正在读取数据表 zm_summary_all...")
# Adjacent string literals are joined at compile time, so the statement sent
# to MySQL is character-for-character the original single-line query.
query_sql = (
    "SELECT m.mine_name as minename, a.id, a.rem, a.description, a.process, a.content "
    "FROM zm_summary_all a left join zm_mine m on m.mine_id = a.mine_id "
)
cursor.execute(query_sql)
rows = cursor.fetchall()
total = len(rows)
print(f"[日志] ✅ 数据读取完成,共 {total} 条记录")
# ======================================
# Run the import
# ======================================
def _as_text(value):
    """Return *value* as a string, mapping SQL NULL (None) to an empty string."""
    return "" if value is None else str(value)


print("\n[日志] 开始 AI 向量化导入...")
success = 0
fail = 0
for index, row in enumerate(rows, 1):
    try:
        # A NULL column arrives as None rather than as a missing key, so a
        # .get() default never applies and str(None) would store the literal
        # text "None".  minename in particular is NULL whenever the LEFT JOIN
        # finds no matching mine.
        minename = _as_text(row.get("minename"))
        doc_id = str(row.get("id", ""))
        title = _as_text(row.get("rem"))
        desc = _as_text(row.get("description"))
        process = _as_text(row.get("process"))
        content = _as_text(row.get("content"))
        # Only mine name, title and description are embedded; process and
        # content travel along as metadata so a query can display them.
        text = f"矿名:{minename} 标题:{title} 描述:{desc}"
        metadata = {
            "minename": minename,
            "rem": title,
            "description": desc,
            "process": process,
            "content": content,
        }
        collection.add(
            ids=[doc_id],
            documents=[text],
            metadatas=[metadata],
        )
        print(f"[日志] 第 {index}/{total} 条 | ID:{doc_id} 导入成功")
        success += 1
    except Exception as e:
        # One bad row must not abort the whole import: count it and go on.
        print(f"[日志] 第 {index} 条导入失败:{str(e)}")
        fail += 1
# ======================================
# Final summary
# ======================================
divider = "=" * 70
print("\n" + divider)
for summary_line in (
    "[日志] 🎉 导入全部完成!",
    f"[日志] 总记录:{total}",
    f"[日志] 成功:{success}",
    f"[日志] 失败:{fail}",
):
    print(summary_line)
# Elapsed time is measured from start_time, i.e. it includes model loading.
elapsed = round(time.time() - start_time, 2)
print(f"[日志] 总耗时:{elapsed}s")
print(divider)
db.close()
print("\n[日志] 数据库连接已关闭")
3.执行问题查询
python
# -*- coding: utf8 -*-
import os
import time
# 强制离线模式,不联网、不下载
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"
import chromadb
from chromadb.utils import embedding_functions
banner_rule = "=" * 70
print(banner_rule)
print(" 🔍 本地离线 m3e-large AI 语义检索系统(带日志)")
print(banner_rule)
# ======================================
# Load the local, offline m3e-large model
# ======================================
start_time = time.time()
print("\n[日志] 加载本地离线 m3e-large 模型...")
model_options = {
    "model_name": "./m3e-large",  # local copy of the largest model variant
    "local_files_only": True,  # force offline loading
}
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(**model_options)
load_seconds = round(time.time() - start_time, 2)
print(f"[日志] ✅ 模型加载完成,耗时:{load_seconds}s")
# ======================================
# Connect to the vector store
# ======================================
print("\n[日志] 连接 Chroma...")
client = chromadb.PersistentClient(path="./chroma_db")
# NOTE(review): presumably this fails if the import script has not created
# the collection yet — confirm against the installed Chroma version.
collection_args = {
    "name": "zm_work_records",
    "embedding_function": embedding_func,
}
collection = client.get_collection(**collection_args)
print("[日志] ✅ 连接成功,可以开始查询")
# ======================================
# The question to ask
# ======================================
question = "视频"  # change the question here
print(f"\n[日志] 用户问题:{question}")
print("[日志] 正在进行 AI 语义匹配...")
# ======================================
# Run the query
# ======================================
query_start = time.time()
# Ask for the three best matches, returning only metadata and distances.
query_kwargs = dict(
    query_texts=[question],
    n_results=3,
    include=["metadatas", "distances"],
)
results = collection.query(**query_kwargs)
query_seconds = round(time.time() - query_start, 4)
print(f"[日志] 查询完成,耗时:{query_seconds}s")
# ======================================
# Print the results (with a similarity score)
# ======================================
print("\n" + "=" * 70)
print(" 📊 匹配结果(按相似度排序)")
print("=" * 70)
# Only one query text was sent, so the hits are in the first (only) entry.
metadatas = results["metadatas"][0]
distances = results["distances"][0]
for idx, (meta, dist) in enumerate(zip(metadatas, distances), 1):
    # Convert the distance into a percentage.
    # NOTE(review): this only reads as a similarity if the collection uses
    # cosine distance; with another distance space the value can fall far
    # outside 0-100 — confirm how the collection was created.
    similarity = round(100 - (dist * 100), 2)
    print(f"\n【结果 {idx}】| 相似度:{similarity}%")
    print("问题标题:", meta["rem"])
    print("排查思路:", meta["process"])
    print("解决办法:", meta["content"])
print("\n[日志] 程序正常结束\n")