Building a search engine over your own local data with jieba and Whoosh

Example

python
import os

from jieba.analyse import ChineseAnalyzer
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in
from whoosh.qparser import QueryParser

# jieba's Whoosh-compatible analyzer handles Chinese word segmentation
analyzer = ChineseAnalyzer()

# Index title and content with the Chinese analyzer; store all fields so they
# can be returned alongside search hits
schema = Schema(title=TEXT(stored=True, analyzer=analyzer),
                content=TEXT(stored=True, analyzer=analyzer),
                id=ID(stored=True))

# Create the index directory (if needed) and build a new index inside it
if not os.path.exists("index"):
    os.mkdir("index")
ix = create_in("index", schema)


documents = [
    {
        "title": "下文",
        "content": "首先安装jieba和whoosh库,",
        "id": "1"
    },
    {
        "title": "中文自然语言处理",
        "content": "中文自然语言处理涉及分词、词性标注、命名实体识别等...",
        "id": "2"
    }
]

# Add the sample documents to the index
writer = ix.writer()
for doc in documents:
    writer.add_document(title=doc["title"], content=doc["content"], id=doc["id"])
writer.commit()

# Parse the query against the "content" field and run the search
searcher = ix.searcher()
query_parser = QueryParser("content", schema=ix.schema)
search_input = "jieba和whoosh"
query = query_parser.parse(search_input)
results = searcher.search(query, limit=None)

print(f"Found {len(results)} matching documents:")
for result in results:
    print(f"{result['id']} - {result['title']}")

Practical example

python
import os
from glob import glob
from multiprocessing import Process, freeze_support

import jieba
import pandas as pd
from jieba.analyse import ChineseAnalyzer
from tqdm import tqdm
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in, open_dir
from whoosh.qparser import QueryParser


class GenVocTensorForDataSet:
    def __init__(self):
        pass

    @staticmethod
    def gen_data_tensor(data_v, out_dir, process_count):
        """

        :param data_v:
        :param out_dir:
        :param process_count:
        :return:
        """
        total_l = []
        one_p_count = 0
        for one_v in tqdm(data_v):
            one_p_count += 1

            # Read the file and drop all whitespace so jieba sees one continuous string
            with open(one_v, "r", encoding="utf-8") as f:
                total_str = f.read()
                total_str = "".join(total_str.split())
            # Word-level segmentation with jieba
            one_data = list(jieba.cut(total_str))
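            # Greedily grow a segment token by token: as soon as the accumulated
            # segment no longer occurs again in the not-yet-consumed tail of the
            # text, flush it as a stand-alone "document" (the unique-length split).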
            documents = []
            text = ""
            for one in one_data:
                text += one
                if text not in total_str[len("".join(documents)) + len(text):]:
                    documents.append(text)
                    text = ""
            total_l.append(documents)
        pd.to_pickle({"voc": total_l},
                     out_dir + "/{}{}.pandas_pickle_data_set".format(process_count, one_p_count))

    def gen_voc_data_to_tensor_set(self, paths_list_dir, out_dir, works_num=8):
        """
        唯一长度拆分
        :param paths_list_dir: 多个txt 的文件夹
        :param works_num:
        :return:
        """
        paths_list_pr = glob(pathname=paths_list_dir + "*")

        p_list = []
        # Fan the file list out to worker processes, one chunk of files per process
        chunk_size = max(1, len(paths_list_pr) // works_num)
        for i in range(0, len(paths_list_pr), chunk_size):
            j = i + chunk_size

            p = Process(target=self.gen_data_tensor, args=(
                paths_list_pr[i:j], out_dir, i))
            p.start()
            p_list.append(p)

        for p in p_list:
            p.join()

    @staticmethod
    def init_data_set(paths_list_dir):
        """Build a fresh index from the pickled segment files."""
        paths_list_pr = glob(pathname=paths_list_dir + "*")
        analyzer = ChineseAnalyzer()
        schema = Schema(title=TEXT(stored=True, analyzer=analyzer),
                        content=TEXT(stored=True, analyzer=analyzer),
                        id=ID(stored=True))
        if not os.path.exists("index"):
            os.mkdir("index")
        ix = create_in("index", schema, indexname='article_index')

        # Each indexed "document" pairs a segment (content) with the segment that
        # follows it in the original text (title)
        writer = ix.writer()
        total_count_id = 0
        for one_p in paths_list_pr:
            documents = pd.read_pickle(one_p)
            for doc in tqdm(documents["voc"]):
                for doc_i, doc_j in zip(doc[1:], doc[:-1]):
                    writer.add_document(title=doc_i, content=doc_j, id=str(total_count_id))
                    total_count_id += 1
        writer.commit()
        ix.close()

    @staticmethod
    def add_data_set(paths_list_dir):
        """Append more pickled segment files to the existing index."""
        paths_list_pr = glob(pathname=paths_list_dir + "*")
        ix = open_dir("index", indexname='article_index')
        writer = ix.writer()
        # Continue numbering after the documents that are already indexed
        total_count_id = ix.doc_count()
        for one_p in paths_list_pr:
            documents = pd.read_pickle(one_p)
            for doc in tqdm(documents["voc"]):
                for doc_i, doc_j in zip(doc[1:], doc[:-1]):
                    writer.add_document(title=doc_i, content=doc_j, id=str(total_count_id))
                    total_count_id += 1
        writer.commit()
        ix.close()


    @staticmethod
    def search_by_jieba_world(search_text):
        ix = open_dir("index", indexname='article_index')
        with ix.searcher() as searcher:
            query_parser = QueryParser("content", schema=ix.schema)
            query = query_parser.parse(search_text)
            results = searcher.search(query, limit=None)

            print(f"Found {len(results)} matching documents:")
            # Materialize the hits before the searcher closes, since Whoosh
            # results are lazy and become unusable afterwards
            hits = [(result["id"], result["title"]) for result in results]
            for doc_id, title in hits:
                print(f"{doc_id} - {title}")
        return hits


if __name__ == '__main__':
    freeze_support()
    txt_p = "E:/just_and_sum/data_sets/"
    gvt_fds = GenVocTensorForDataSet()
    # Step 1: tokenize the raw txt files into pickled segment files
    # gvt_fds.gen_voc_data_to_tensor_set(txt_p, "E:/just_and_sum/data_set_d", works_num=8)
    # Step 2: build the index from the pickled segment files
    # data_base = gvt_fds.init_data_set("E:/just_and_sum/data_set_d/")
    # Step 3: search the index
    search_res = gvt_fds.search_by_jieba_world("头孢克洛头孢泊肟酯是同")
    print(search_res)
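
Whoosh can also return a highlighted excerpt of the matched field, which makes it easier to see why a hit was returned. A minimal sketch, assuming the index built by init_data_set exists under ./index (highlights() works here because content is a stored field):

python
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir("index", indexname='article_index')
with ix.searcher() as searcher:
    query = QueryParser("content", schema=ix.schema).parse("头孢克洛")
    for hit in searcher.search(query, limit=5):
        # highlights() returns the matched fragment of the stored content field
        print(hit["id"], hit["title"], hit.highlights("content"))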