Boost 搜索引擎

boost搜索引擎

一、背景
二、技术栈和项目环境
三、正排索引 vs 倒排索引-搜索引擎具体原理
- 正排索引
四、编写去除标签与数据清洗的模块 Parser
五、编写建立索引模块index
六、编写搜索引擎模块Searcher.hpp
七、编写http_server模块
日志信息
问题总结

一、背景

市面上的搜索引擎，比如百度、搜狗、360搜索等，都是大型项目，我们不可能自己实现全网搜索。这里要做的是站内搜索（搜索的数据更垂直，数据量更小），例如 cplusplus.com 提供的就是面向 C++ 文档的站内搜索。

二、技术栈和项目环境

技术栈：C/C++，C++11，STL,准标准库Boost，jsoncpp,cppjieba,cpp-httplib，选学：html5,css,js,jquery,Ajax

三、正排索引 vs 倒排索引-搜索引擎具体原理

正排索引

就是从文档ID找到文档内容（文档里的关键字）

目标文档进行分词（目的：方便建立倒排索引和查找）：

雷军买了四斤小米：雷军/买/四斤/小米
文档1：雷军买了四斤小米

文档2：雷军发布了小米手机

文档id	文档内容
1	雷军买了四斤小米
2	雷军发布了小米手机

停止词：了，的、吗，a, the
倒排索引：根据关键字，找到文档ID的方案

关键字(具有唯一性)	文档id
雷军	1、2
买	1
四斤	1
小米	1、2
发布	2

模拟一次查找过程

用户输入：小米->倒排索引中查找->提取出文档ID(1,2)->根据正排索引->找到文档内容-> title+content(desc)+url文档结果进行摘要->构建响应结果

将boost下doc文档里html文档内容全部拷贝到data下的input文件里，当作数据源

复制代码

cp -rf boost_1_89_0/doc/html/* data/input

四、编写去除标签与数据清洗的模块 Parser

复制代码

boost官网：boost.org
//目前只需要boost_1_89_0/doc/html下文件，用它来做索引

去标签之后的数据

复制代码

syl@syl-virtual-machine:~/桌面/boost_search$ touch Parser.cc

syl@syl-virtual-machine:~/桌面/boost_search/data$ mkdir raw_html
syl@syl-virtual-machine:~/桌面/boost_search/data$ ll
总用量 24
drwxrwxr-x  4 syl syl  4096 8月  30 20:34 ./
drwxrwxr-x  4 syl syl  4096 8月  30 20:26 ../
drwxrwxr-x 56 syl syl 12288 8月  30 20:33 input/  //这里放的是原始文档
drwxrwxr-x  2 syl syl  4096 8月  30 20:34 raw_html/    //这里放的是去标签之后的文档

目标:把每个文档去标签，然后写入到同一个文件中！每个文档只占"一行"：文档内部的 title、content、url 之间用 \3 分隔，文档和文档之间用 \n 分隔

编写parser.cc

复制代码

#include<fstream>
#include<iostream>
#include<string>
#include<utility>
#include<vector>
#include<boost/filesystem.hpp>
#include"util.hpp"
using namespace std;

// Directory that holds every raw HTML page to be parsed. No trailing '/':
// ParseUrl removes this prefix from a file path and appends the remainder
// directly to the URL head, so the remainder has to start with '/'.
const std::string src_path="data/input";

// File that receives the cleaned documents. Relative to the working
// directory, like src_path; a leading '/' would make it an absolute path.
const std::string raw="data/raw_html/raw.txt";

// Result of parsing one HTML page.
struct DocInfo
{
    std::string title;    // title of the document
    std::string content;  // content of the document
    std::string url;      // URL of the document
};
using DocInfo_t = DocInfo;

// Collects the path of every .html file found under src_path into *file_list.
bool Enumfile(const string& src_path,vector<string>* file_list);

// Reads each file named in file_list and appends its parsed record to *result.
bool ParseHtml(const vector<string>& file_list,vector<DocInfo_t>* result );

// Meant to store the parsed records in the file named by output.
bool SaveHtml(const vector<DocInfo_t>& result,const string& output);

int main()
{

    vector<string>file_list;

    //递归式的把每个html文件名带路径保存到file_list种

    if(!Enumfile(src_path,&file_list))

        cerr<<"enum file name error!"<<endl;

        return 1;

    //第二步按照file_list读取每个文件内容 并进行解析

    vector<DocInfo_t>result;

    if(!ParseHtml(file_list,&result))

    {

        cerr<<"parse html error!"<<endl;

        return 2;

    }

    //第三步：把解析完毕的各个文件内容，写入output

    string output;

    if(!SaveHtml(result,output))
    {
        cerr<<"save html error!"<<endl;

        return 3;
    }
}

bool Enumfile(const string& src_path,vector<string>* file_list)
{

    namespace fs=boost::filesystem;
    fs::path root_path(src_path); //定义个变量初始化

    //判断路径是否存在

    if(!fs::exists(root_path))
    {
        cerr<<src_path<<"not exist"<<endl;
        return false;

    }

    //定义一个空的迭代器 用来进行判断递归结束

    fs::recursive_directory_iterator end;

    for(fs::recursive_directory_iterator iter(root_path);iter!=end;iter++)
    {
        //是否是普通文件 html都是普通文件

        if(!fs::is_regular_file(*iter))
        {
            continue;    
        }

        if(iter->path().extension()!=".html")
            continue;
        cout<<"debug: "<<iter->path().string();
        //当前路径一定是一个合法的 以.html结束的普通网页文件
        file_list->push_back(iter->path().string());//将所有带路径的html文件保存到file_list 方便后面分析

    }
    //定义一个空的迭代器，用来进行判断递归结束
    return true;

}

// Convention in this file: pointer parameters are outputs, reference
// parameters are inputs.
//
// Extracts the text between the first "<title>" and the "</title>" that
// follows it.
//   file  - full contents of one HTML page (input)
//   title - receives the title text (output); left untouched on failure
// Returns false when title is null or no such tag pair exists.
static bool ParseTitle(const std::string& file, std::string* title )
{
    if(title==nullptr)
    {
        return false;
    }

    const std::string open_tag="<title>";
    const std::string close_tag="</title>";

    const std::string::size_type open_pos=file.find(open_tag);
    if(open_pos==std::string::npos)
    {
        return false;
    }

    // First character of the title: <title>chapter</title>
    //                                       ^
    const std::string::size_type begin=open_pos+open_tag.size();

    // Look for the closing tag only behind the opening tag, so that a stray
    // "</title>" earlier in the page cannot be picked up by mistake.
    const std::string::size_type end=file.find(close_tag,begin);
    if(end==std::string::npos)
    {
        return false;
    }

    *title=file.substr(begin,end-begin);
    return true;
}

// Removes the markup from an HTML page: everything between '<' and '>' is
// dropped, everything else is kept.
//   file    - full contents of one HTML page (input)
//   content - the remaining text is appended here (output)
// Returns false only when content is null.
static bool ParseContent(const std::string& file, std::string* content)
{
    if(content==nullptr)
    {
        return false;
    }

    // Two-state machine over the characters of the page.
    enum class Status
    {
        kLabel,   // inside a tag
        kContent  // inside text
    };

    // Start in kLabel: nothing is emitted before the first '>' has been seen.
    Status state=Status::kLabel;

    for(char c:file)
    {
        // Dispatch on the current state. Dispatching on the character itself
        // would never reach either branch for ordinary text.
        switch(state)
        {
            case Status::kLabel:
                // '>' closes the current tag; text may follow.
                if(c=='>')
                {
                    state=Status::kContent;
                }
                break;

            case Status::kContent:
                if(c=='<')
                {
                    // A new tag starts, the text run is over.
                    state=Status::kLabel;
                }
                else
                {
                    // '\n' separates documents in the output file, so line
                    // breaks inside a page are stored as spaces.
                    content->push_back(c=='\n'?' ':c);
                }
                break;
        }
    }
    return true;
}

static bool ParseUrl(const string& file_path,string* url)
{

    string url_head="https://www.boost.org/doc/libs/1_89_0/doc/html";

    string url_tail=file_path.substr(src_path.size());

    *url =url_head+url_tail;
    return true;
}

bool ParseHtml(const vector<string>& file_list,vector<DocInfo_t>* result )
{

    for(const string& file:file_list)
    {
        //读取文件Read() 到results
        string results;

        if(!ns_utill::FileUtil::ReadFile(file,&results))
        {
            continue;
        }

        DocInfo_t doc;
        //解析指定文件 提取title 从results解析到title种
        if(!ParseTitle(results,&doc.title))
        {
            continue;
        }
        //解析指定文件，提取content
        if(!ParseContent(results,&doc.content))
        {
            continue;
        }

        //解析指定文件路径

        if(!ParseUrl(results,&doc.url))
        {
            continue;
        }

        result->push_back(doc);

        //for Debug

        //ShowDebug(doc);

    }

    return true;

}
//网页保存至目标文件中

bool SaveHtml(const vector<DocInfo_t>& result,const string& output)
{
#define SEP '\3'

    ofstream ou(output,ios::out||ios::binary);

    if(!ou.is_open())
    {
        cerr<<"open"<<output<<"error"<<endl;
        return false;
    }
    for(auto& con:result)
    {
        string out_str;
        out_str+=con.title;
        out_str+=SEP;
        out_str+=con.content;
        out_str+=SEP;
        out_str+=con.url;
        out_str+='\n';
    }
    ou.close();
    return true;

}

构建url

复制代码

官网url：https://www.boost.org/doc/libs/1_89_0/doc/html/accumulators.html

data/input/accumulators.html //我们把下载的库copy到我们自己的根目录下面
url_head ="https://www.boost.org/doc/libs/1_89_0/doc/html";
url_tail=[data/input] /accumulators.html-> url_tail=/accumulators.html
url=url_head+url_tail

将解析内容保存到目标文件中

复制代码

采用下面的方案：
version2: 写入文件中，一定要考虑下一次在读取的时候，也要方便操作!
类似：title\3content\3url \n title\3content\3url \n title\3content\3url \n ...
方便我们getline(ifstream, line)，直接获取文档的全部内容：title\3content\3url

bool SaveHtml(const vector<DocInfo_t>& result,const string& output)
{
#define SEP '\3'

    ofstream ou(output,ios::out||ios::binary);

    if(!ou.is_open())
    {
        cerr<<"open"<<output<<"error"<<endl;

        `return false;`
    `}`

    `for(auto& con:result)`
    `{`
        `string out_str;`
        `out_str+=con.title;`
        `out_str+=SEP;`
        `out_str+=con.content;`
        `out_str+=SEP;`
        `out_str+=con.url;`
        `out_str+='\n';`
    `}`

    `ou.close();`
    `return true;`
`}`

五、编写建立索引模块index

id	文档内容
1	雷军买了四斤小米
2	雷军发布了小米手机

建立正排的代码

复制代码

// Record of the forward index: one document plus its id.
struct DocInfo
{
    std::string title;    // title of the document
    std::string content;  // content of the document
    std::string url;      // URL of the document
    uint64_t doc_id;      // id of the document
};
 // Parses one line of the cleaned data file ("title\3content\3url") and
 // appends it to forward_index.
 // Returns the stored record, or nullptr when the line does not split into
 // exactly three fields.
 DocInfo* BuildForwardIndex(const std::string& line)
 {
     // Split the line at '\3' into title, content and url.
     std::vector<std::string> results;
     const std::string sep= "\3";
     // Same call the complete index.hpp listing uses for splitting; CutString
     // is the name of the Jieba word-segmentation helper there.
     ns_utill::StringUtil::Split(line,&results,sep);
     if(results.size()!=3)
     {
         return nullptr;
     }

     // Fill the record; the fields are moved because results is discarded.
     DocInfo doc;
     doc.title=std::move(results[0]);
     doc.content=std::move(results[1]);
     doc.url=std::move(results[2]);
     // Take the id before inserting: it equals the subscript the record is
     // about to occupy in the vector.
     doc.doc_id=forward_index.size();

     forward_index.push_back(std::move(doc));
     // The pointer stays valid only until forward_index reallocates.
     return &forward_index.back();
 }

建立倒排

复制代码

  // One entry of the inverted index: a keyword found in a document.
  struct InteredElm
  {
      uint64_t doc_id;   // document the keyword was found in
      std::string word;  // keyword
      int weight;        // weight of the document for this keyword
      InteredElm():weight(0){}  // never leave the weight indeterminate
  };
  // Inverted list: the entries of one keyword. Entries are stored by value,
  // so the list owns them and nothing has to be freed by hand.
  typedef std::vector<InteredElm> InteredList;
  // Inverted index: maps each keyword to its inverted list.
  std::unordered_map<std::string,InteredList> Intered_index;

  // word-frequency statistics

结构

复制代码

namespace ns_Index
{
    // Record of the forward index.
    struct DocInfo
    {
        std::string title;
        std::string content;
        std::string url;
        uint64_t doc_id;
    };

    // One entry of the inverted index.
    struct InteredElm
    {
        uint64_t doc_id;
        std::string word;  // keyword
        int weight;
        InteredElm():weight(0){}
    };

    // Inverted list: the entries of one keyword. The element type has to be
    // the struct declared above.
    typedef std::vector<InteredElm> InvertedList;

    // Skeleton of the index class; the member functions are stubs.
    class Index
    {
    private:
        // Forward index: a vector, whose subscript serves as the document id.
        std::vector<DocInfo> forward_index;
        // Inverted index: maps a keyword to its inverted list.
        std::unordered_map<std::string, InvertedList> inverted_index;
    public:
        Index(){}
        ~Index(){}
    public:
        // Looks up a document by doc_id (stub: always returns nullptr).
        DocInfo *GetForwardIndex(uint64_t doc_id)
        {
            return nullptr;
        }
        // Looks up the inverted list of a keyword (stub: always returns nullptr).
        InvertedList *GetInvertedList(const std::string &word)
        {
            return nullptr;
        }
        // Builds forward and inverted index from the cleaned data file,
        // e.g. data/raw_html/raw.txt (stub: always returns true).
        bool BuildIndex(const std::string &input)
        {
            return true;
        }
    };
}

index.hpp 代码

复制代码

#pragma once

namespace ns_Index
{
    struct DocInfo
    {
        string title;
        string content;
        string url;
        uint64_t doc_id;
    };
    struct InteredElm
    {
        uint64_t doc_id;
        string word;  //关键字
        int weight;  
        InteredElm():weight(0){}
    };
    //倒排拉链
    typedef vector<InteredElm> InteredList;
    class Index
    {
        public:
            Index(){};
            Index(const Index&)=delete;
            Index& operator=(const Index&)=delete;
            static mutex loc;
            static Index* instance;
        public:
            ~Index(){}
            //单例
            static Index* GetInstance()
            {
                if(nullptr==instance){
                    loc.lock();
                    if(nullptr==instance)
                    {
                        instance=new Index();
                    }
                    loc.unlock();
                }
                return instance;
            }
            //根据id找到文档内容
            //正排索引对应的元素
            DocInfo* GetForwardIndex(uint64_t doc_id)
            {
                if(doc_id>=forward_index.size())
                {
                    cerr<<doc_id<<"unfind error!"<<endl;
                }
                return &(forward_index[doc_id]);
            }
            //根据关键字string获得倒排拉链 根据关键字查找
            //获取倒排索引元素
            InteredList* GetInteredList(const string& word) // 有误
            {
                auto iter=Intered_index.find(word);
                if(iter==Intered_index.end())
                {
                    cerr<<word<<" have no InteredList"<<endl;
                    return nullptr;
                }
                return &(iter->second);
            }
            //根据去标签，格式化之后的⽂档，构建正排和倒排索引
            //data/raw_html/raw.txt
            bool buildIndex(const string& input)
            {
                std::ifstream in(input,std::ios::in | ios::binary);
                if(!in.is_open())
                {
                    cerr<<"sorry"<<input<<"open error"<<endl;
                    return false;
                }//到这里就把文件打开了
                int cnt=0;
                string line;
                while(std::getline(in,line))
                {
                    DocInfo* doc=BuildForwardIndex(line);//正排
                    if(nullptr==doc)
                    {
                        cerr<<"build error"<<endl;
                        return false;
                    }
                    BuildInveredIndex(*doc);//倒排
                    cnt++;
                    if(cnt%50==0)
                    {
                        //cout<<"当前建立索引文档："<<cnt<<endl;
                        LOG(NORMAL,"当前已经建立的索引文档: "+std::to_string(cnt));
                    }
                }
                return true;
            }
        private:
            //正排索引
            DocInfo* BuildForwardIndex(const string& line)
            {
                //解析line 字符串切分 line ->3 string, title,content,url
                vector<string> results;
                const string sep= "\3";
                ns_utill::StringUtil::Split(line,&results,sep); //字符串切割
                if(results.size()!=3)
                {
                    std::cout << results.size() << std::endl;
                    std::cout << "line" << line << std::endl;
                    return nullptr;
                }
                //字符串进行填充到DocInfo
                DocInfo doc;
                doc.title=results[0];
                doc.content=results[1];//
                doc.url=results[2];
                doc.doc_id=forward_index.size();//先保存id，再插入，对应的id就是当前doc在vector中的下标！
                //插入到正排索引
                forward_index.push_back(doc);
                return &forward_index.back();
            }
            bool BuildInveredIndex(const DocInfo& doc)
            {
                //DocInfo{tile,content,url,doc_id}
                //word->倒排
                struct word_cnt
                {
                    int title_cnt;
                    int content_cnt;
                    word_cnt():title_cnt(0),content_cnt(0){}
                };
                vector<string>title_words;
                unordered_map<string,word_cnt>word_map; //存储容器 暂存词频
                ns_utill::JiebaUtil::CutString(doc.title,&title_words);//存到该容器里
               //对标题进行解析
                for(auto& c:title_words)
                {
                    boost::to_lower(c);
                    word_map[c].title_cnt++;
                }
                vector<string>content_words;
                ns_utill::JiebaUtil::CutString(doc.content,&content_words);

                //对内容进行解析
                for(auto& s:content_words)
                {
                    boost::to_lower(s);
                    word_map[s].content_cnt++;
                }
#define X 10

#define Y 1
                //hello HELLO 这一步好比搜索 找相应关联 得出搜索结果
                for(auto& word_pair:word_map)
                {
                    InteredElm item;
                    item.doc_id=doc.doc_id;
	                item.word=word_pair.first;            item.weight=X*word_pair.second.title_cnt+Y*word_pair.second.content_cnt;

                   //d
                    InteredList& Intered_list=Intered_index[word_pair.first];//k值
                    Intered_list.push_back(move(item));
                }
                return true;

            }

        private:
             //正排索引 的数据结构用数组 数组的下标是文档的id
            vector<DocInfo>forward_index;
        //  倒排索引 一定是一个一一映射关系【关键字和倒排拉链之间的关系】
            unordered_map<string,InteredList>Intered_index;

    };
    Index* Index::instance=nullptr;
    std::mutex Index::loc;
};

六、编写搜索引擎模块Searcher.hpp

基本结构

复制代码

namespace ns_searcher{
  class Searcher{
	private:
		 ns_index::Index *index; //供系统进⾏查找的索引
	public:
	Searcher(){}
	~Searcher(){}
	public:
		void InitSearcher(const std::string &input)12 {
		 //1. 获取或者创建index对象
		//2. 根据index对象建⽴索引
		 }
		//query: 搜索关键字
		//json_string: 返回给⽤⼾浏览器的搜索结果
		 void Search(const std::string &query, std::string *json_string)
		 {
			 //1.[分词]:对我们的query进⾏按照searcher的要求进⾏分词
		
			 //2.[触发]:就是根据分词的各个"词"，进⾏index查找
		
			 //3.[合并排序]：汇总查找结果，按照相关性(weight)降序排序
		
			 //4.[构建]:根据查找出来的结果，构建json串 -- jsoncpp
		
		 }

	 };
 }

完整代码(Searcher.hpp)

复制代码

namespace ns_search{
    //解决搜索结果出现重复文档
    struct InvertedElmPrint
    {
        uint64_t doc_id;
        int weight;
        vector<string>words;
        InvertedElmPrint():doc_id(0),weight(0){}
    };
    class Searcher{
        private:
            ns_Index::Index* index;
        public:
            Searcher(){}
            void InitSearcher(const string& input)
            {
                //获取或者创建index对象
                index=ns_Index::Index::GetInstance();
                LOG(NORMAL,"获取单例成功");
                //根据index对象建立索引
                index->buildIndex(input);
                // cout<<"建立正排倒排索引成功..."<<endl;
                LOG(NORMAL,"建立正排倒排索引成功");
            }
            
            void search(const std::string& query,string* json_search)
            {
                vector<string>words;
                ns_utill::JiebaUtil::CutString(query,&words);
                unordered_map<uint64_t,InvertedElmPrint>tokens_map;
                //2、触发：根据分词的各个词 进行index查找
                vector<InvertedElmPrint>intered_list_all;
                for(string word:words)
                {
                    boost::to_lower(word);
                    ns_Index::InteredList *intered_list=index->GetInteredList(word);//
                    if(nullptr==intered_list)
                    {
                        continue;
                    }
                    //intered_list_all.insert(intered_list_all.end(),intered_list->begin(),intered_list->end());
                    for(const auto& elm:*intered_list)
                    {
                        auto& item=tokens_map[elm.doc_id];//[];如果存在直接获取 如果不存在新建
                        //item 一定是doc_id 相同的print 节点
                        item.doc_id=elm.doc_id;
                        item.weight+=elm.doc_id;
                        item.words.push_back(elm.word);
                    }
                }
                for(const auto& its:tokens_map)
                {
                    intered_list_all.push_back(its.second);
                }
                //排序 按weight降序排序并去重
                sort(intered_list_all.begin(),intered_list_all.end(),[](const InvertedElmPrint& e1,const InvertedElmPrint& e2){
                    return e1.weight>e2.weight;

                });
                //将所有结果按json串格式返回
                Json::Value root;
                for(auto& it:intered_list_all)
                {
                    ns_Index::DocInfo* doc=index->GetForwardIndex(it.doc_id);
                    if(nullptr==doc)
                    {
                        continue;
                    }
                    Json::Value elm;
                    elm["title"]=doc->title;
                    elm["content"]=GetDesc(doc->content,it.words[0]);
                    elm["url"]=doc->url;
                    //for dug
                    //elm["weight"]=it.weight;
                    root.append(elm);
                }
                Json::StyledWriter writer;
                *json_search=writer.write(root);
            }
            //获取摘要
            string GetDesc(const string& html_content,const string& word)
            {
                //找到word在html_content中首次出现 然后往前找50字节（如果没有 从begin找）往后100找
                const int prev_step=50;
                const int next_step=100;
                //找到首次出现位置
                auto iter=std::search(html_content.begin(),html_content.end(),word.begin(),word.end(),[](int x,int y){
                    return (tolower(x)==tolower(y));
                });
                if(iter==html_content.end())
                {
                    return "None1";
                }
                int pos=std::distance(html_content.begin(),iter);
                //获取start end
                int start=0;
                int end=html_content.size()-1;
                //如果有50位置 就更新开始位置
                if(pos>start+prev_step) start=pos-prev_step;
                if(pos<end-next_step)   end=pos+next_step;
                //3、截取子串 return
                if(start>=end)  return "None2";    
                string desc=html_content.substr(start,end-start);
                return desc;
            }
    };

};

安装jsoncpp

复制代码

sudo apt install -y libjsoncpp-dev

七、编写http_server模块

复制代码

cpp-httplib库安装路径：https://gitee.com/zhangkt1995/cpp-httplib?_from=gitee_search

// Passed to set_base_dir() of the HTTP server.
const std::string root_path("./wwwroot");
// Cleaned data file handed to the searcher.
const std::string input("data/raw_html/raw.txt");

int main()
{
    ns_search::Searcher sea;
    sea.InitSearcher(input);
    httplib::Server sur;
    sur.set_base_dir(root_path.c_str());
    sur.Get("/s",[&sea](const httplib::Request& req,httplib::Response& res){
        if(!req.has_param("word")){
            res.set_content("必须要搜索的关键词","text/plain: charset=utf-8");
            return;
        }
        std::string word=req.get_param_value("word");
        LOG(NORMAL,"用户在搜索："+word);
        //std::cout<<"用户在搜索"<<word<<std::endl;
        std::string json_string;
        sea.search(word,&json_string);
        res.set_content(json_string,"application/json");
    });
    // sur.Get("/hi",[](const httplib::Request& req,httplib::Response& res){
    //     res.set_content("年后再说","text/plain");
    // });

    LOG(NORMAL,"服务器启动成功....");
    sur.listen("0.0.0.0",8081);
    return 0;

}

日志信息

复制代码

#pragma once
#include<iostream>
#include<ctime>
#include<string>

// Log levels. LOG() stringizes the level name (#LEVEL), so it is the name
// that reaches log(), not the number defined here.
#define NORMAL 1
#define WARNNING 2
#define DEBUG 3
#define FATAL 4

// LOG(LEVEL, MESSAGE): calls log() with the level name, the message and the
// file and line of the call site.
#define LOG(LEVEL,MESSAGE) log(#LEVEL,MESSAGE,__FILE__,__LINE__)
// Prints one log line to standard output:
//     [level][seconds since the epoch][message][file:line]
//   level   - level name, e.g. "NORMAL"
//   message - text to log
//   file    - source file of the call site
//   line    - source line of the call site
// inline: the function is defined in a header, so every translation unit
// that includes the header carries a definition of it.
inline void log(const std::string& level,const std::string& message,const std::string& file,int line)
{
    std::cout<<"["<<level<<"]"<<"["<<time(nullptr)<<"]"<<"["<<message<<"]"<<"["<<file<<":"<<line<<"]"<<std::endl;
}

问题总结

遇到的问题：

在进行综合调试的时候（debug.cc），发现什么也搜不到，不管搜索什么都是空的。

在逐一检查之后，最终发现是 Parser.cc 文件出错了：content 是空的，url 里面混杂着 content 的内容。原因是 ParseHtml 函数中解析文件路径的那一处调用有一个参数传错了，本应传 file，却错传成了 results。一开始以为是索引建立有问题，于是顺着 searcher.cc 文件里的 search 函数一路摸索过去；后面调试的时候又发现，search 里面把所有结果按照 json 格式返回的那一段中，GetForwardIndex 的参数也传错了。