jieba 加 whoosh 构建自己本地数据库的搜索引擎

例子

python 复制代码
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
from jieba.analyse import ChineseAnalyzer
from whoosh.qparser import QueryParser

import os



# Build a small on-disk Whoosh index whose TEXT fields are tokenized with
# jieba's ChineseAnalyzer, then run one sample query against it.
analyzer = ChineseAnalyzer()
schema = Schema(
    title=TEXT(stored=True, analyzer=analyzer),
    content=TEXT(stored=True, analyzer=analyzer),
    id=ID(stored=True),
)

# create_in() requires the target directory to already exist.
if not os.path.exists("index"):
    os.mkdir("index")
ix = create_in("index", schema)


# Sample documents to index; all three fields are stored so they can be
# read back from search hits.
documents = [
	{
		"title": "下文",
		"content": "首先安装jieba和whoosh库,",
		"id": "1"
	},
	{
		"title": "中文自然语言处理",
		"content": "中文自然语言处理涉及分词、词性标注、命名实体识别等...",
		"id": "2"
	}
]

# Documents only become searchable after the writer commits.
writer = ix.writer()
for doc in documents:
    writer.add_document(title=doc["title"], content=doc["content"], id=doc["id"])
writer.commit()

# Use the searcher as a context manager so its underlying files are closed
# when done (the original created the searcher and never closed it).
with ix.searcher() as searcher:
    query_parser = QueryParser("content", schema=ix.schema)
    search_input = "jieba和whoosh"
    query = query_parser.parse(search_input)
    results = searcher.search(query, limit=None)

    print(f"找到 {len(results)} 篇相关文档:")
    for result in results:
        print(f"{result['id']} - {result['title']}")

实战

python 复制代码
from whoosh.index import create_in,open_dir
from whoosh.fields import Schema, TEXT, ID
from jieba.analyse import ChineseAnalyzer
from whoosh.qparser import QueryParser
from whoosh.index import open_dir
import os

import jieba
import pandas as pd

from glob import glob
from multiprocessing import Process, freeze_support

from tqdm import tqdm


class GenVocTensorForDataSet:
    """Build jieba-tokenized segment pickles from raw txt files and index
    them into a local Whoosh database for searching.

    Pipeline:
      1. gen_voc_data_to_tensor_set() — tokenize txt files in parallel
         worker processes and dump the segment lists as pandas pickles.
      2. init_data_set() / add_data_set() — load the pickles and write
         adjacent-segment (title, content) pairs into a Whoosh index.
      3. search_by_jieba_world() — query the index by content.
    """

    def __init__(self):
        pass

    @staticmethod
    def gen_data_tensor(data_v, out_dir, process_count):
        """Tokenize every txt file in *data_v* and pickle the segments.

        :param data_v: list of txt file paths handled by this worker
        :param out_dir: directory that receives the pickle file
        :param process_count: worker id, used to make the output file name unique
        """
        total_l = []
        one_p_count = 0
        for one_v in tqdm(data_v):
            one_p_count += 1

            with open(one_v, "r", encoding="utf-8") as f:
                total_str = f.read()
                # Drop all whitespace so jieba sees one continuous string.
                total_str = "".join(total_str.split())
            one_data = list(jieba.cut(total_str))
            documents = []
            text = ""
            # "Unique length" split: grow `text` token by token and cut a
            # segment as soon as the accumulated string no longer re-occurs
            # in the remainder of the source text.
            for one in one_data:
                text += one
                if text not in total_str[len("".join(documents)) + len(text):]:
                    documents.append(text)
                    text = ""
            total_l.append(documents)
        pd.to_pickle({"voc": total_l},
                     out_dir + "/{}{}.pandas_pickle_data_set".format(process_count, one_p_count))

    def gen_voc_data_to_tensor_set(self, paths_list_dir, out_dir, works_num=8):
        """Fan the txt files out to worker processes for tokenization.

        唯一长度拆分 (unique-length split).
        :param paths_list_dir: directory containing the txt files
        :param out_dir: directory that receives the worker pickle files
        :param works_num: number of worker processes
        """
        paths_list_pr = glob(pathname=paths_list_dir + "*")

        # Guard against a zero step when there are fewer files than workers
        # (the original could raise ValueError: range() arg 3 must not be zero).
        step = max(1, len(paths_list_pr) // works_num)

        p_list = []
        # Dispatch one slice of files to each asynchronous worker process.
        for i in range(0, len(paths_list_pr), step):
            p = Process(target=self.gen_data_tensor,
                        args=(paths_list_pr[i:i + step], out_dir, i))
            p.start()
            p_list.append(p)

        for p in p_list:
            p.join()

    @staticmethod
    def init_data_set(paths_list_dir):
        """Create the Whoosh index and fill it from the worker pickles.

        :param paths_list_dir: directory containing the *.pandas_pickle_data_set files
        """
        paths_list_pr = glob(pathname=paths_list_dir + "*")
        analyzer = ChineseAnalyzer()
        schema = Schema(title=TEXT(stored=True, analyzer=analyzer), content=TEXT(stored=True, analyzer=analyzer),
                        id=ID(stored=True))
        if not os.path.exists("index"):
            os.mkdir("index")
        # Whoosh Index objects are not context managers: the original
        # `with create_in(...) as ix:` raised AttributeError (__enter__).
        ix = create_in("index", schema, indexname='article_index')

        writer = ix.writer()
        total_count_id = 0
        for one_p in paths_list_pr:
            documents = pd.read_pickle(one_p)
            for doc in tqdm(documents["voc"]):
                # Adjacent segment pairs: the later segment becomes the
                # title, the earlier one the searchable content.
                for doc_i, doc_j in zip(doc[1:], doc[:-1]):
                    writer.add_document(title=doc_i, content=doc_j, id=str(total_count_id))
                    total_count_id += 1
        writer.commit()

    @staticmethod
    def add_data_set(paths_list_dir):
        """Append more pickled segment data to the existing index.

        :param paths_list_dir: directory containing the *.pandas_pickle_data_set files
        """
        paths_list_pr = glob(pathname=paths_list_dir + "*")
        # Open the same directory/indexname that init_data_set() created
        # (the original opened "indexdir", which is never created, and also
        # wrongly used the index as a context manager).
        ix = open_dir("index", indexname='article_index')
        writer = ix.writer()
        # Continue numbering after the existing documents instead of
        # restarting at 0 and duplicating ids.
        total_count_id = ix.doc_count()
        for one_p in paths_list_pr:
            documents = pd.read_pickle(one_p)
            for doc in tqdm(documents["voc"]):
                for doc_i, doc_j in zip(doc[1:], doc[:-1]):
                    writer.add_document(title=doc_i, content=doc_j, id=str(total_count_id))
                    total_count_id += 1
        writer.commit()

    @staticmethod
    def search_by_jieba_world(search_text):
        """Search the index's 'content' field for *search_text*.

        :param search_text: raw query string (parsed by QueryParser)
        :return: list of stored-field dicts, one per hit
        """
        ix = open_dir("index", indexname='article_index')
        with ix.searcher() as searcher:
            query_parser = QueryParser("content", schema=ix.schema)
            query = query_parser.parse(search_text)
            results = searcher.search(query, limit=None)

            print(f"找到 {len(results)} 篇相关文档:")
            # Materialize the hits while the searcher is still open: the
            # original returned the live Results object after the `with`
            # block closed its searcher, making the return value unreadable.
            hits = []
            for result in results:
                print(f"{result['id']} - {result['title']}")
                hits.append(dict(result))
        return hits


if __name__ == '__main__':
    # Required on Windows so child processes work in frozen executables.
    freeze_support()
    txt_p = "E:/just_and_sum/data_sets/"
    gvt_fds = GenVocTensorForDataSet()
    # Step 1: generate the tokenized segment pickles (run once).
    # gvt_fds.gen_voc_data_to_tensor_set(txt_p, "E:/just_and_sum/data_set_d",works_num=8)
    # Step 2: initialize the Whoosh index from the pickles (run once).
    # data_base = gvt_fds.init_data_set("E:/just_and_sum/data_set_d/")
    # Step 3: search the index.
    # NOTE(review): assumes the "index" directory was already built by the
    # commented-out steps above — otherwise open_dir() fails.
    search_res = gvt_fds.search_by_jieba_world("头孢克洛头孢泊肟酯是同")
    print(search_res)
相关推荐
铁蛋AI编程实战1 分钟前
Falcon-H1-Tiny 微型 LLM 部署指南:100M 参数也能做复杂推理,树莓派 / 手机都能跑
java·人工智能·python·智能手机
写代码的【黑咖啡】14 分钟前
Python 中的自然语言处理工具:spaCy
开发语言·python·自然语言处理
高洁0115 分钟前
多模态融合驱动下的具身学习机制研究
python·算法·机器学习·数据挖掘·知识图谱
狗都不学爬虫_18 分钟前
JS逆向 -最新版 盼之(decode__1174、ssxmod_itna、ssxmod_itna2)纯算
javascript·爬虫·python·网络爬虫·wasm
七夜zippoe44 分钟前
Dask:超越内存限制的并行计算——从任务图到分布式调度的实战指南
python·集群·task·array·dataframe·dask
serve the people1 小时前
python环境搭建 (五) Dockerfile 和 docker-compose.yml 核心作用
java·python·docker
维构lbs智能定位1 小时前
工厂人员定位(一)融合定位技术如何重构安全生产与效率管理?(含系统架构、技术选型对比、实际应用)
python·物联网·智慧工厂·厂区人员定位系统·工厂人员定位·工厂定位系统
yufuu981 小时前
进阶技巧与底层原理
jvm·数据库·python
2301_817497331 小时前
使用Flask快速搭建轻量级Web应用
jvm·数据库·python
Warren981 小时前
Allure 常用装饰器:实战用法 + 最佳实践(接口自动化)
运维·服务器·git·python·单元测试·自动化·pytest