搜索引擎的模块设计及结果展示

目录

1、清洗数据的模块(Parser)

[1.1 思路](#1.1 思路)

[1.2 代码实现](#1.2 代码实现)

[1.2.1 parser.cc](#1.2.1 paeser.cc)

[1.2.2 util.hpp](#1.2.2 util.hpp)

[1.3 简单测试](#1.3 简单测试)

2、建立索引的模块(Index)

[2.1 思路](#2.1 思路)

[2.2 代码实现](#2.2 代码实现)

[2.2.1 index.hpp](#2.2.1 index.hpp)

[2.2.2 util.hpp](#2.2.2 util.hpp)

3、编写搜索引擎的模块(Searcher)

[3.1 思路](#3.1 思路)

[3.2 代码实现](#3.2 代码实现)

[3.2.1 searcher.hpp](#3.2.1 searcher.hpp)

[3.3 简单测试](#3.3 简单测试)

4、编写http_server模块

[4.1 调用cpp-httplib库](#4.1 调用cpp-httplib库)

[4.1.1 http_server_lib.cc](#4.1.1 http_server_lib.cc)

[4.2 自己实现http](#4.2 自己实现http)

[4.2.1 http_server_self.cc](#4.2.1 http_server_self.cc)

[4.2.2 http.hpp](#4.2.2 http.hpp)

[4.2.3 tcp_server.hpp](#4.2.3 tcp_server.hpp)

[4.2.4 socket.hpp](#4.2.4 socket.hpp)

[4.2.5 inet_addr.hpp](#4.2.5 inet_addr.hpp)

[4.2.6 服务器日志](#4.2.6 服务器日志编辑)

5、结果展示及完整代码

6、项目总结


1、清洗数据的模块(Parser)

1.1 思路

  • 获取数据

    • 从Boost官网下载并解压boost_1_89_0.tar.gz,将boost_1_89_0/doc/html/下的所有文件拷贝到原始数据文件夹中。
    复制代码
      tar xzf boost_1_89_0.tar.gz
      cp -rf boost_1_89_0/doc/html/* ./raw_html/
  • parser.cc

    • 先递归遍历原始数据目录,把每个html文件的文件名(带路径)存放到files_list中,方便后续的遍历;

    • 再读取files_list中所有的html文件 ,进行解析(title+content+url)
      *

      cpp 复制代码
      // 包含的头文件
      #include <boost/filesystem.hpp>
      // 编译时,带-lboost_system -lboost_filesystem选项
    • 最后将解析完的各个html文件的内容,保存到一个目标文件(以 title\3content\3url\n 的形式存放,每个文档占一行,使getline可以一次读取一个文档;\3是控制字符,本身不可显示,几乎不会出现在文本中,适合作为行内分隔符)。

1.2 代码实现

1.2.1 parser.cc
cpp 复制代码
#include <iostream>
#include <string>
#include <vector>
#include <boost/filesystem.hpp>
#include "util.hpp"

// Directory that holds the raw HTML pages to be cleaned.
const static std::string src_path = "./data/raw_html";
// File that receives the cleaned records.
const static std::string dst_file = "./data/cleaned_html/cleaned.txt";
// One parsed HTML document.
typedef struct DocInfo{
    std::string title; // document title
    std::string content; // document text
    std::string url; // URL of this document on the official site
}DocInfo_t;

// Parameter conventions used by the functions below:
// const & : input
// * : output
// & : input/output
bool EnumFile(const std::string& src_path, std::vector<std::string>* files_list);
bool ParseHtml(const std::vector<std::string>& files_list, std::vector<DocInfo_t>* results);
bool SaveHtml(const std::vector<DocInfo_t>& results, const std::string& dst_file);

int main() {
    std::vector<std::string> files_list;
    // 1. 存放每个html的文件名带路径,方便后续的遍历。
    if(!EnumFile(src_path, &files_list)) {
        std::cerr << "enum file name error!" << std::endl;
        return 1;
    }
    // 2. 读取files_list中所有的html文件,进行解析
    std::vector<DocInfo_t> results(files_list.size()); // 避免频繁扩容
    if(!ParseHtml(files_list, &results)) {
        std::cerr << "enum parse html error!" << std::endl;
        return 2;
    }
    // 3. 将解析完的各个html文件的内容,保存到一个目标文件,以\3为分隔符
    if(!SaveHtml(results, dst_file)) {
        std::cerr << "save html error!" << std::endl;
        return 3;
    }
    
    return 0;
}

bool EnumFile(const std::string& src_path, std::vector<std::string>* files_list) {
    namespace fs = boost::filesystem;
    fs::path root_path(src_path);
    
    // 判断路径是否存在
    if(!fs::exists(src_path)) {
        std::cerr << src_path << " does not exist!" <<std::endl;
        return false;
    }
    
    // 定义一个空的迭代器,作为递归终止的标志
    fs::recursive_directory_iterator end;
    for(fs::recursive_directory_iterator iter(root_path); iter != end; ++iter) {
        if(!fs::is_regular_file(*iter)) continue;
        if(iter->path().extension() != ".html") continue;
        
        // std::cout << "debug: " << iter->path().string() << std::endl;
        // 当前的路径是一个html文件
        files_list->push_back(iter->path().string());
    }

    return true;
}

// Extracts the text between the first "<title>" and the first "</title>" of
// read_result into *title.
// Returns false when either tag is missing or the closing tag precedes the end
// of the opening tag; *title is left untouched in that case.
bool ParseTitle(const std::string& read_result, std::string* title) {
    const std::string open_tag = "<title>";
    const std::string close_tag = "</title>";

    const size_t open_pos = read_result.find(open_tag);
    if(std::string::npos == open_pos) return false;

    const size_t close_pos = read_result.find(close_tag);
    if(std::string::npos == close_pos) return false;

    // First character after "<title>".
    const size_t text_begin = open_pos + open_tag.size();
    if(text_begin > close_pos) return false;

    title->assign(read_result, text_begin, close_pos - text_begin);
    return true;
}
// Strips tags from read_result and appends the remaining text to *content.
// Scanning starts in the "inside a tag" state, so everything up to and
// including the first '>' is discarded. Newlines in the text are replaced by
// spaces because '\n' is used later as the separator between documents.
// Always returns true.
bool ParseContent(const std::string& read_result, std::string* content) {
    bool inside_tag = true;
    for(const char ch : read_result) {
        if(inside_tag) {
            // Only '>' ends a tag; every other character of the tag is dropped.
            if('>' == ch) inside_tag = false;
            continue;
        }
        if('<' == ch) {
            inside_tag = true;
            continue;
        }
        content->push_back('\n' == ch ? ' ' : ch);
    }
    return true;
}
// Builds the official-site URL of a document: the part of file_path that
// follows the src_path prefix is appended to the Boost 1.89.0 html root.
// Always returns true.
bool ParseUrl(const std::string& file_path,std::string* url) {
    // Path of the file relative to the raw-html directory (keeps the leading '/').
    const std::string relative_part = file_path.substr(src_path.size());
    const std::string site_root = "https://www.boost.org/doc/libs/1_89_0/doc/html";

    *url = site_root + relative_part;
    return true;
}
// for debug
// Debug helper: prints the three fields of a parsed document to stdout,
// one per line.
static void ShowDoc(const DocInfo_t& doc) {
    std::cout << "title: " << doc.title << std::endl
              << "content: " << doc.content << std::endl
              << "url: " << doc.url << std::endl;
}
bool ParseHtml(const std::vector<std::string>& files_list, std::vector<DocInfo_t>* results) {
    for(const std::string& file_path : files_list) {
        // 1. 读取文件
        std::string read_result;
        if(!ns_util::FileUtil::ReadFile(file_path, &read_result)) continue;

        DocInfo_t doc;
        // 2. 解析指定文件,获取title
        if(!ParseTitle(read_result, &doc.title)) continue;
        // 3. 解析指定文件,获取content,本质是去标签
        if(!ParseContent(read_result, &doc.content)) continue;
        // 4. 解析指定文件,获取url
        if(!ParseUrl(file_path, &doc.url)) continue;

        // 5. 保存doc
        results->push_back(std::move(doc)); // move,减少拷贝

        // for debug
        // ShowDoc(doc);
        // break;
    }
    return true;
}

// Writes every document of results to the file named by output, one record per
// line in the form title\3content\3url\n.
// Returns false when the file cannot be opened or when writing/closing fails.
bool SaveHtml(const std::vector<DocInfo_t>& results, const std::string& output) {
    // Field separator inside one record; a control character that is very
    // unlikely to appear in document text.
    const char field_sep = '\3';

    // Open the file the caller asked for (not the file-level default path).
    std::ofstream out(output, std::ios::out | std::ios::binary);
    if(!out.is_open()) {
        std::cerr << "open file: " << output << " error!" << std::endl;
        return false;
    }

    for(const auto& item : results) {
        std::string record;
        record.reserve(item.title.size() + item.content.size() + item.url.size() + 3);
        record += item.title;
        record += field_sep;
        record += item.content;
        record += field_sep;
        record += item.url;
        record += '\n';

        out.write(record.c_str(), record.size());
    }

    out.close();
    // Report a failed write or close instead of claiming success.
    return !out.fail();
}
1.2.2 util.hpp
cpp 复制代码
#pragma once

#include <iostream>
#include <string>
#include <fstream>

namespace ns_util {
    // File helpers.
    class FileUtil{
    public:
        // Appends the content of file_path to *out, line by line.
        // Line terminators are not kept: the lines are concatenated directly.
        // Returns false (after logging) when the file cannot be opened.
        static bool ReadFile(const std::string& file_path, std::string* out) {
            std::ifstream in(file_path, std::ios::in);
            if(!in.is_open()) {
                std::cerr << "open file " << file_path << " error" << std::endl;
                return false;
            }

            // std::getline clears `line` before filling it and returns the
            // stream, which converts to false once no further line can be read.
            for(std::string line; std::getline(in, line); ) {
                out->append(line);
            }

            // The stream is closed by its destructor when it goes out of scope.
            return true;
        }
    };
}

1.3 简单测试

  • 存放html文件 测试。
  • 解析(title+content+url) 测试:
    • 一个解析结果:
    • 原html文档:
    • url正确:
  • 解析完的html文档保存到一个目标文件 测试:
    • html文档的个数相同
    • 保存的第一个解析结果:
    • url正确:

2、建立索引的模块(Index)

2.1 思路

  • 构建正排索引doc_id -> DocInfo {title, content, url, doc_id}
    • 解析line -> title, content, url;
    • 填充 DocInfo,并插入正排索引的vector。
  • 构建倒排索引:word -> 倒排拉链(一个或多个InvertedElem {doc_id, word, weight})
    • 对title&&content分词 (使用第三方库cppjieba),并统计 title和content的词频
      *

      cpp 复制代码
      git clone https://gitcode.com/gh_mirrors/cp/cppjieba.git
      // 下面这个,可能要开一下加速器Watt Toolkit
      git clone https://github.com/yanyiwu/limonp.git
      
      // 把limonp/include/limonp,拷贝到cppjieba/include/cppjieba/
      cp -rf limonp/include/limonp cppjieba/include/cppjieba/
      
      // 建立软链接,方便找到头文件。删除软链接时,使用unlink
      ln -s ./thirdpart/cppjieba/include/cppjieba cppjieba
      ln -s ./thirdpart/cppjieba/dict dict
      
      // 最后包含头文件
      #include "cppjieba/Jieba.hpp"
      • 说明:如果一个词出现在标题中,title_cnt会统计一次;而content是对整个html去标签得到的,其中也包含标题文本,所以content_cnt里还会再统计一次,即内容的词频多统计了一次。另外,词频是按cppjieba的分词结果统计的,也与具体分出来的词有关。总体上来说,问题不大。
    • 填充 InvertedElem,计算 weight(认为,词在标题中出现,文章相关性高一些,在内容中出现,文章相关性低一些),并插入倒排索引的vector。

2.2 代码实现

2.2.1 index.hpp
cpp 复制代码
#pragma once

#include <iostream>
#include <string>
#include <vector>
#include <unordered_map>
#include <fstream>
#include <mutex>
#include "util.hpp"

namespace ns_index{
    // Forward-index record: one cleaned document.
    struct DocInfo{
        std::string title;
        std::string content;
        std::string url;
        uint64_t doc_id; // position of this document in the forward index; copied into InvertedElem::doc_id
    };
    // One posting of the inverted index.
    struct InvertedElem{
        uint64_t doc_id;
        std::string word; // the word this posting belongs to
        int weight;
    };
    // Posting list: one or more InvertedElem{doc_id, word, weight}.
    using InvertedList = std::vector<InvertedElem>;

    // Process-wide index holding the forward index (doc_id -> DocInfo) and the
    // inverted index (word -> InvertedList). Obtain it through GetInstance().
    class Index{
    private:
        Index(){}
        Index(const Index& ) = delete;
        Index& operator=(const Index& ) = delete;
        static std::mutex mutex;
        static Index* instance;
    public:
        ~Index(){}
        // Returns the single Index object, creating it on first use.
        // The mutex is taken on every call: reading the plain pointer outside
        // the lock (double-checked locking) would be a data race.
        static Index* GetInstance() {
            std::lock_guard<std::mutex> lock(mutex);
            if(nullptr == instance) {
                instance = new Index();
            }
            return instance;
        }
        // Returns the document stored under doc_id, or nullptr (after logging)
        // when doc_id is out of range.
        DocInfo* GetDocInfo(uint64_t doc_id) {
            if(doc_id >= forward_index.size()) {
                std::cerr << doc_id << " is out of range, error!" << std::endl;
                return nullptr;
            }
            return &forward_index[doc_id];
        }
        // Returns the posting list of word, or nullptr (after logging) when the
        // word is not in the inverted index.
        InvertedList* GetInvertedList(const std::string& word) {
            auto it = inverted_index.find(word);
            if(it == inverted_index.end()) {
                std::cerr << word << " has no InvertedList, error!" << std::endl;
                return nullptr;
            }
            return &(it->second);
        }
        // Builds both indexes from the cleaned file (one document per line),
        // e.g. ./data/cleaned_html/cleaned.txt.
        // Lines that cannot be parsed are logged and skipped.
        // Returns false only when input cannot be opened.
        bool BuildIndex(const std::string& input) {
            std::ifstream in(input, std::ios::in | std::ios::binary);
            if(!in.is_open()) {
                std::cerr << input << " open error!" << std::endl;
                return false;
            }
            std::string line;
            int count = 0;
            while(std::getline(in, line)) {
                DocInfo* doc = BuildForwardIndex(line);
                if(nullptr == doc) {
                    std::cerr << line << " build error!" << std::endl; // for debug
                    continue;
                }
                BuildInvertedIndex(*doc);
                ++count;
                if(count % 100 == 0) {
                    std::cout << "当前建立索引的文档序号:> " << count << std::endl;
                }
            }
            return true;
        }
    private:
        // Parses one line (title\3content\3url) and appends it to the forward index.
        // Returns a pointer to the stored document, or nullptr when the line does
        // not contain the three expected fields. The pointer is only valid until
        // the next insertion into forward_index.
        DocInfo* BuildForwardIndex(const std::string& line) {
            // 1. Split the line into title, content, url.
            std::vector<std::string> results;
            const std::string sep = "\3"; // separator inside one line
            ns_util::StringUtil::Split(line, &results, sep);
            // A malformed line yields fewer than three fields; indexing
            // results[0..2] in that case would read out of bounds.
            if(results.size() < 3) {
                return nullptr;
            }
            // 2. Fill a DocInfo and append it; its doc_id is its vector index.
            DocInfo doc;
            doc.title = std::move(results[0]);
            doc.content = std::move(results[1]);
            doc.url = std::move(results[2]);
            doc.doc_id = forward_index.size();

            forward_index.push_back(std::move(doc)); // move to avoid copying the strings

            return &forward_index.back();
        }
        // Tokenizes the title and content of doc, counts how often each
        // (lower-cased) word occurs in each of them and appends one
        // InvertedElem per distinct word to that word's posting list.
        // Always returns true.
        bool BuildInvertedIndex(const DocInfo& doc){
            struct word_cnt{
                int title_cnt;
                int content_cnt;
                word_cnt() : title_cnt(0), content_cnt(0) {}
            };
            // Weight of one occurrence in the title / in the content: a hit in
            // the title is treated as more relevant than a hit in the body.
            const int title_weight = 10;
            const int content_weight = 1;

            // 1. Word frequencies for title and content (words come from cppjieba).
            std::unordered_map<std::string, word_cnt> word_map;

            std::vector<std::string> title_words;
            ns_util::JiebaUtil::CutString(doc.title, &title_words);
            for(auto& word : title_words) {
                boost::to_lower(word); // lower-case so that lookups are case-insensitive
                word_map[std::move(word)].title_cnt++;
            }

            std::vector<std::string> content_words;
            ns_util::JiebaUtil::CutString(doc.content, &content_words);
            for(auto& word : content_words) {
                boost::to_lower(word); // lower-case so that lookups are case-insensitive
                word_map[std::move(word)].content_cnt++;
            }
            // 2. One posting per distinct word, appended to the word's list.
            for(const auto& word_pair : word_map) {
                InvertedElem item;
                item.doc_id = doc.doc_id;
                item.word = word_pair.first; // map keys are const, so this is a copy
                item.weight = title_weight*word_pair.second.title_cnt
                            + content_weight*word_pair.second.content_cnt;
                InvertedList& inverted_list = inverted_index[item.word];
                inverted_list.push_back(std::move(item));
            }
            return true;
        }
    private:
        std::vector<DocInfo> forward_index; // forward index, addressed by doc_id (the vector index)
        std::unordered_map<std::string, InvertedList> inverted_index; // inverted index
    };
    Index* Index::instance = nullptr;
    std::mutex Index::mutex;
}
2.2.2 util.hpp
cpp 复制代码
#pragma once

#include <iostream>
#include <string>
#include <vector>
#include <fstream>
#include <boost/algorithm/string.hpp>
#include "cppjieba/Jieba.hpp"

namespace ns_util {
    // File helpers.
    class FileUtil{
    public:
        // Appends the content of file_path to *out, line by line.
        // Line terminators are not kept: the lines are concatenated directly.
        // Returns false (after logging) when the file cannot be opened.
        static bool ReadFile(const std::string& file_path, std::string* out) {
            std::ifstream in(file_path, std::ios::in);
            if(!in.is_open()) {
                std::cerr << "open file " << file_path << " error" << std::endl;
                return false;
            }

            // std::getline clears `line` before filling it and returns the
            // stream, which converts to false once no further line can be read.
            for(std::string line; std::getline(in, line); ) {
                out->append(line);
            }

            // The stream is closed by its destructor when it goes out of scope.
            return true;
        }
    };

    // String helpers.
    class StringUtil{
    public:
        // Splits target at any character contained in sep and stores the pieces
        // in *out. Adjacent separators are merged (token_compress_on).
        static void Split(const std::string& target, std::vector<std::string>* out, const std::string sep) {
            boost::split(*out, target, boost::is_any_of(sep), boost::token_compress_on);
        }
    };

    // Directory that holds the cppjieba dictionary files.
    #define DICT_DIR "./dict"
    // Word segmentation through one shared cppjieba::Jieba object.
    class JiebaUtil{
    public:
        // Cuts src into words with Jieba's CutForSearch and stores them in *out.
        static void CutString(const std::string& src, std::vector<std::string>* out) {
            jieba.CutForSearch(src, *out);
        }
    private:
          static cppjieba::Jieba jieba;
    };
    // The segmenter is constructed once from the dictionaries under DICT_DIR.
    cppjieba::Jieba JiebaUtil::jieba(DICT_DIR "/jieba.dict.utf8",
                                    DICT_DIR "/hmm_model.utf8",
                                    DICT_DIR "/user.dict.utf8",
                                    DICT_DIR "/idf.utf8",
                                    DICT_DIR "/stop_words.utf8");
}

3、编写搜索引擎的模块(Searcher)

3.1 思路

  • void InitSearcher(const std::string& input)

    • 获取索引对象

    • 通过索引对象,建立索引

  • void Search(const std::string& query, std::string* json_string)

    • 分词

    • 根据word,在倒排索引中搜索并保存InvertedElem {doc_id, words, weight};

    • InvertedElem按权重降序排序

    • 通过InvertedElem.doc_id在正排索引中获取DocInfo构建json串

      cpp 复制代码
      sudo apt install -y libjsoncpp-dev
      // 包含的头文件
      #include <jsoncpp/json/json.h>
      // 编译时,带-ljsoncpp选项
  • 注意:
    • 倒排索引和query,都是转换成了小写(为了不区分大小写),但是,当用小写的词在原文中查desc时,原文不一定是小写,所以将文档的内容转成小写比较。

3.2 代码实现

3.2.1 searcher.hpp
cpp 复制代码
#pragma once

#include "index.hpp"
#include <algorithm>
#include <jsoncpp/json/json.h>

namespace ns_searcher{
    struct InvertedElem{
        uint64_t doc_id;
        std::vector<std::string> words; // 哪些word索引过来的
        int weight; // 权重相加
        InvertedElem() : weight(0) {}
    };
    class Searcher{
    public:
        void InitSearcher(const std::string& input) {
            // 1. 获取索引对象
            index = ns_index::Index::GetInstance();
            std::cout << "获取index单例成功..." << std::endl;
            // 2. 通过索引对象,建立索引
            index->BuildIndex(input);
            std::cout << "建立正倒排索引成功..." << std::endl;
        }
        void Search(const std::string& query, std::string* json_string){
            // 1. 分词
            std::vector<std::string> query_words;
            ns_util::JiebaUtil::CutString(query, &query_words);
            // 2. 根据word,在倒排索引中,搜索并保存InvertedElem
            std::unordered_map<uint64_t, InvertedElem> doc_id_map; // 文档去重
            std::vector<InvertedElem> inverted_list_all;
            for(auto& word : query_words) {
                boost::to_lower(word); // 转成小写,表示不区分大小写
                ns_index::InvertedList* inverted_list = index->GetInvertedList(word);
                if(nullptr == inverted_list) {
                    continue;
                }
                for(auto& elem : *inverted_list) {
                    auto& item = doc_id_map[elem.doc_id];
                    item.doc_id = elem.doc_id; // item可能是新建的
                    item.words.push_back(elem.word);
                    item.weight += elem.weight;
                }
            }
            for(auto& elem : doc_id_map) {
                inverted_list_all.push_back(std::move(elem.second));
            }
            // 3. InvertedElem按权重的降序排序
            std::sort(inverted_list_all.begin(), inverted_list_all.end(), 
            [](const InvertedElem& e1, const InvertedElem& e2){
                return e1.weight > e2.weight;
            });
            // 4. 通过InvertedElem.doc_id,在正排索引中,获取DocInfo,构建json串
            Json::Value root;
            for(auto& InvertedElem : inverted_list_all) {
                ns_index::DocInfo* doc = index->GetDocInfo(InvertedElem.doc_id);
                if(nullptr == doc) {
                    continue;
                }
                Json::Value item;
                item["title"] = doc->title;
                item["desc"] = GetDesc(doc->content, InvertedElem.words[0]);
                item["url"] = doc->url;
                // for debug
                item["id"] = doc->doc_id;
                item["weight"] = InvertedElem.weight;
                
                root.append(item);
            }
            Json::StyledWriter writer;
            *json_string = writer.write(root);
        }
    private:
        std::string GetDesc(const std::string& html_content, const std::string& word) {
            const int prev_step = 50;
            const int next_step = 100;

            // size_t pos = html_content.find(word); // bug:索引里的word已经是小写了,content里面不一定是小写
            // if(pos == std::string::npos) {
            //     return "None1";
            // }

            // 将文档的内容转成小写比较
            auto it = std::search(html_content.begin(), html_content.end(), word.begin(), word.end(), [](int x, int y){
                return std::tolower(x) == std::tolower(y);
            });
            if(it == html_content.end()) {
                return "None1";
            }
            int pos = std::distance(html_content.begin(), it);

            int start = 0, end = html_content.size() - 1; // [ , ]
            // 取pos+前50个字节(没有50个字节,就从start开始)+后100个字节(没有100个字节,就在end结束)
            if(pos - prev_step > start) start = pos - prev_step;
            if(pos + next_step < end) end = pos + next_step;

            if(start > end) return "None2";
            return html_content.substr(start, end - start + 1);
        }
    private:
        ns_index::Index* index;
    };
}

3.3 简单测试

cpp 复制代码
#include "searcher.hpp"

// Cleaned document file the index is built from.
const static std::string input = "./data/cleaned_html/cleaned.txt";

// Interactive smoke test: builds the index, then answers one query per input
// word and prints the JSON result together with the query's segmentation.
// The loop ends when standard input is exhausted or fails.
int main(){
    // The searcher lives on the stack, so it needs no manual cleanup.
    ns_searcher::Searcher searcher;
    searcher.InitSearcher(input);

    std::string query;
    std::string json_string;
    while(true) {
        std::cout << "Please enter your query statement# ";
        // Stop on EOF / read error instead of spinning forever on a dead stream.
        if(!(std::cin >> query)) break;
        searcher.Search(query, &json_string);

        std::cout << json_string <<std::endl;

        // Show how the query was segmented.
        std::vector<std::string> cut_strings;
        ns_util::JiebaUtil::CutString(query, &cut_strings);
        std::cout << query << " 分词情况:> " << cut_strings.size() << "个分词"<<  std::endl;
        int count = 1;
        for(auto& str : cut_strings) {
            std::cout << "第" << count++ << "个分词:> "<< str << std::endl;
        }
    }

    return 0;
}
  • 测试结果:

4、编写http_server模块

4.1 调用cpp-httplib库

4.1.1 http_server_lib.cc
  • 使用cpp-httplib库。
cpp 复制代码
git clone https://gitee.com/openworking/cpp-httplib.git
// 建立软链接
ln -s ./thirdpart/cpp-httplib cpp-httplib
// 要求:gcc/g++是比较新的版本。
// 编译时,带-pthread选项
cpp 复制代码
#include "searcher.hpp"
#include "cpp-httplib/httplib.h"

// Cleaned document file the index is built from.
const static std::string input = "./data/cleaned_html/cleaned.txt";
// Directory the static pages are served from.
const static std::string root_path = "./wwwroot";

// HTTP front end built on cpp-httplib: serves the static pages under root_path
// and answers GET /s?word=<query> with the search result as JSON.
// Returns 1 when the server cannot listen on 0.0.0.0:8080.
int main() {
    ns_searcher::Searcher searcher;
    searcher.InitSearcher(input);

    httplib::Server server;
    server.set_base_dir(root_path);
    server.Get("/s", [&searcher](const httplib::Request& request, httplib::Response& response){
        // A search request must carry the keyword.
        if(!request.has_param("word")) {
            response.set_content("必须要有搜索关键字!", "text/plain; charset=utf-8");
            return;
        }
        std::string word = request.get_param_value("word");
        std::cout << "用户正在搜索: " << word << std::endl;

        std::string json_string;
        searcher.Search(word, &json_string);

        response.set_content(json_string, "application/json");
    });
    // listen() blocks while serving and returns false when it cannot bind or
    // listen; report that instead of exiting with success.
    if(!server.listen("0.0.0.0", 8080)) {
        std::cerr << "listen on 0.0.0.0:8080 failed" << std::endl;
        return 1;
    }
    return 0;
}

4.2 自己实现http

  • HTTP请求报文和应答报文的格式:首行(请求行/状态行)+ 报头 + 空行 + 正文。
  • 下面是几个主要的文件:完整的代码在本文的最后。
4.2.1 http_server_self.cc
cpp 复制代码
#include "http.hpp"
#include <memory>

// ./myhttp server_port
// Entry point of the self-made HTTP server.
// Usage: ./myhttp server_port
// Exits with USAGE_ERROR when the argument count is wrong or server_port is not
// a decimal number in [0, 65535].
int main(int argc, char* argv[])
{

    if(argc != 2)
    {
        std::cerr << "Usage: " << argv[0] << " server_port" << std::endl;
        exit(USAGE_ERROR);
    }

    // Validate the port before using it: a non-numeric argument must not raise
    // an exception and an out-of-range value must not be truncated silently.
    char* parse_end = nullptr;
    const long port_value = std::strtol(argv[1], &parse_end, 10);
    const bool is_number = (parse_end != argv[1]) && (*parse_end == '\0');
    if(!is_number || port_value < 0 || port_value > 65535)
    {
        std::cerr << "Usage: " << argv[0] << " server_port" << std::endl;
        exit(USAGE_ERROR);
    }

    Enable_Console_Log_Strategy();

    uint16_t server_port = static_cast<uint16_t>(port_value);

    std::unique_ptr<Http> httpsvrp = std::make_unique<Http>(server_port);
    httpsvrp->Start();
    
    return 0;
}
4.2.2 http.hpp
cpp 复制代码
#pragma once

#include "socket.hpp"
#include "tcp_server.hpp"
#include "util.hpp"
#include "log.hpp"
#include <string>
#include <unordered_map>
#include <functional>

// Separators used when building and parsing HTTP messages.
const static std::string sep_space = " ";
const static std::string sep_line = "\r\n";
const static std::string sep_kv = ": ";

// Web root directory and the paths of the home page and the 404 page inside it.
const static std::string wwwroot = "./wwwroot";
const static std::string home_page = "/index.html";
const static std::string page_404 = "/404.html";

// Cleaned document file the search index is built from.
const static std::string input = "./data/cleaned_html/cleaned.txt";

// Callback that runs a search: takes the query and writes the JSON result to json_string.
using search = std::function<void(const std::string& query, std::string* json_string)>;

class HttpRequest
{
public:
    void ParseReqLine(std::string &req_line)
    {
        // GET / HTTP/1.1
        std::stringstream ss(req_line);
        ss >> _method >> _url >> _http_version;
    }

    bool Deserialize(std::string& req_str)
    {
        // 请求行
        std::string req_line;
        bool res = ns_util::FileUtil::ReadOneLine(req_str, &req_line, sep_line);
        if(!res)
            return false;
        ParseReqLine(req_line);
        LOG(LogLevel::DEBUG) << "reqline: " << req_line;

        if(_url == "/") {
            _url = wwwroot + home_page; // "./wwwroot/index.html";
        } else {
            const std::string search = "/s?word=";
            if(_url.substr(0, search.size()) == search) {
                _args = _url.substr(search.size()); // 搜索
            } else {
                _url = wwwroot + _url; // "./wwwroot/..."
            }
        }

        std::cout << "解析reqline: " << std::endl;
        LOG(LogLevel::DEBUG) << "_method: " << _method;
        LOG(LogLevel::DEBUG) << "_url: " << _url;
        LOG(LogLevel::DEBUG) << "_http_version: " << _http_version;

        // 请求报头 请求正文 ...

        return true;
    }

    std::string GetUrl()
    {
        return _url;
    }
    std::string GetArgs() {
        return _args;
    }

private:
    std::string _method;
    std::string _url;
    std::string _args; // 搜索关键词
    std::string _http_version;

    std::unordered_map<std::string, std::string> _headers;

    std::string _blank_line;

    std::string _text;
};

// HTTP response: status line, headers, blank line and body, plus the request
// data (target file / search keyword) needed to build it.
class HttpResponse
{
public:
    HttpResponse()
    :_http_version("HTTP/1.1")
    ,_code(0) // no status chosen yet; set by SetCode()
    ,_blank_line(sep_line)
    {}

    // Serializes the response by hand (no third-party library):
    // status line + headers + blank line + body.
    std::string Serialize()
    {
        std::string status_line = _http_version + sep_space + std::to_string(_code) + sep_space + _code_desc + sep_line;
        std::string headers;
        for(auto& header : _headers)
        {
            headers += header.first + sep_kv + header.second + sep_line;
        }

        return status_line + headers + _blank_line + _text;
    }

    // Stores the status code and, for 200/302/404, the matching reason phrase.
    // For any other code the reason phrase is left unchanged.
    void SetCode(int code)
    {
        _code = code;
        switch(code)
        {
            case 200:
                _code_desc = "OK";
                break;
            case 302:
                _code_desc = "Found";
                break;
            case 404:
                _code_desc = "Not Found";
                break;
            default:
                break;
        }
    }

    // Adds a header. A header that is already present keeps its first value.
    void SetHeaders(const std::string& key, const std::string& value)
    {
        auto it = _headers.find(key);
        if(it == _headers.end())
            _headers.insert({key,value});
    }
    
    // Maps the file suffix of targetfile to a Content-Type value.
    // A file without suffix is treated as html; an unknown suffix yields "".
    std::string Uri2Suffix(const std::string &targetfile)
    {
        // ./wwwroot/a/b/c.html
        auto pos = targetfile.rfind(".");
        if (pos == std::string::npos) // no suffix: treated as a .htm/.html file
        {
            return "text/html";
        }

        // suffix includes the leading dot, so every comparison must include it too.
        std::string suffix = targetfile.substr(pos);
        if (suffix == ".html" || suffix == ".htm")
            return "text/html";
        else if (suffix == ".jpg")
            return "image/jpeg";
        else if (suffix == ".png")
            return "image/png";
        else
            return "";
    }

    // File to serve when the request is not a search.
    void SetTargetFile(const std::string& file)
    {
        _target_file = file;
    }

    // Search keyword; a non-empty value turns the response into a search result.
    void SetArgs(const std::string args) {
        _args = args;
    }

    // Response body.
    void SetText(const std::string text)
    {
        _text = text;
    }

    // Fills in status, headers and body.
    //  - favicon request: ignored, returns false (nothing is to be sent);
    //  - search keyword set: body is the JSON produced by Search;
    //  - otherwise: body is the target file, or a 302 redirect to the 404 page
    //    on `port` when the file cannot be read.
    // Returns true when a response is ready to be serialized and sent.
    bool MakeResponse(uint16_t port, search Search)
    {
        // Browsers ask for the icon on their own; ignore that request.
        if(_target_file == "./wwwroot/favicon.ico")
        {
            LOG(LogLevel::INFO) << "用户请求图标: " << _target_file << "忽略他";
            return false;
        }

        // Run the search.
        if(!_args.empty()) {
            std::cout << "用户正在搜索: " << _args << std::endl;
            SetCode(200); // success
            Search(_args, &_text);
            SetHeaders("Content-Type", "application/json");
            // Tell the client how long the body is, as the file branch does.
            SetHeaders("Content-Length", std::to_string(_text.size()));
            return true;
        }
        
        // Serve a page.
        bool res = ns_util::FileUtil::ReadFileContent(_target_file, &_text);
        if(!res)
        {
            LOG(LogLevel::WARNING) << "没有文件: " << _target_file << " , 404";

            SetCode(302); // file missing: temporary redirect to the 404 page
            // NOTE(review): the redirect target uses a hard-coded host address.
            SetHeaders("Location","http://124.221.189.63:"+std::to_string(port)+"/404.html");
        }
        else
        {
            // A successful read is routine information, not a warning.
            LOG(LogLevel::INFO) << "读取文件: " << _target_file; 

            SetCode(200); // success
            int file_size = ns_util::FileUtil::FileSize(_target_file);
            std::string suffix = Uri2Suffix(_target_file);
            SetHeaders("Content-Type", suffix);
            SetHeaders("Content-Length", std::to_string(file_size));
        }

        return true;
    }
    
private:
    std::string _http_version;
    int _code;
    std::string _code_desc;

    std::unordered_map<std::string, std::string> _headers;

    std::string _blank_line;

    std::string _text;

    // Request data used to build the response.
    std::string _target_file;
    std::string _args;
};

// HTTP server: owns the TCP server and the searcher, and turns every accepted
// connection into one request/response exchange.
class Http
{
public:
    // Creates the TCP server for port and builds the search index from `input`.
    Http(uint16_t port)
    :_tsvrp(std::make_unique<TcpServer>(port))
    ,_port(port)
    {
        searcher.InitSearcher(input); // build the index
    }

    // Reads one request from sockfd, builds the response (page or search
    // result via Search) and sends it back. Nothing is sent when no data was
    // received, when the request line cannot be parsed, or when MakeResponse
    // declines the request.
    void HandleHttpRequest(std::shared_ptr<Socket>& sockfd, const InetAddr& client, search Search)
    {
        LOG(LogLevel::DEBUG) << "收到新连接,准备读取请求"; 
        
        std::string http_req;
        int n = sockfd->Recv(&http_req); // most likely the whole request arrives in one read
        if(n <= 0)
            return;

        std::cout << "##########################" << std::endl;
        std::cout << "请求正文: " << std::endl;
        std::cout << http_req;
        std::cout << "##########################" << std::endl;

        HttpRequest req;
        HttpResponse resp;

        // Do not answer from a request whose request line could not be read.
        if(!req.Deserialize(http_req))
        {
            LOG(LogLevel::WARNING) << "bad http request, ignore it";
            return;
        }
        resp.SetTargetFile(req.GetUrl());
        resp.SetArgs(req.GetArgs());
        if(resp.MakeResponse(_port, Search))
        {
            sockfd->Send(resp.Serialize());
        }
    }

    // Starts the TCP server; every connection is handled by HandleHttpRequest
    // with Searcher::Search bound as the search callback.
    void Start()
    {
        auto search = std::bind(&ns_searcher::Searcher::Search, 
                                &searcher, 
                                std::placeholders::_1, 
                                std::placeholders::_2);
        _tsvrp->Start([this, search](std::shared_ptr<Socket>& sockfd, const InetAddr& client){
            this->HandleHttpRequest(sockfd, client, search);
        });
    }

private:
    std::unique_ptr<TcpServer> _tsvrp;
    ns_searcher::Searcher searcher;
    uint16_t _port;
};
4.2.3 tcp_server.hpp
cpp 复制代码
#pragma once

#include "socket.hpp"
#include "searcher.hpp"
#include <functional>
#include <sys/wait.h>

// Callback that serves one accepted connection.
using ioservice_t = std::function<void(std::shared_ptr<Socket>&, const InetAddr&)>;

// Multi-process TCP server: accepts connections and serves each one in a
// grandchild process.
class TcpServer : public NoCopy
{
public:
    // Creates the listening socket for port (socket + bind + listen).
    TcpServer(uint16_t port)
        : _listen_socket(std::make_unique<TcpSocket>())
        , _running(false)
    {   
        // 1. create the socket
        // 2. bind it
        // 3. put it into listening state
        _listen_socket->BuildTcpServer(port);
    }

    // Accept loop (multi-process version). For every connection the process
    // forks twice: the child exits at once, so the parent's waitpid returns
    // immediately, and the grandchild runs service() and then exits.
    void Start(ioservice_t service)
    {
        _running = true;

        while (_running)
        {
            // 4. accept a connection
            InetAddr client;
            std::shared_ptr<Socket> sockfd = _listen_socket->Accept(&client);
            if(!sockfd)
                continue;

            pid_t pid = fork();
            if(pid < 0)
            {
                LOG(LogLevel::WARNING) << "fork failed";
                // Nobody will serve this connection: close it, as the parent
                // branch below does, instead of leaving the descriptor open.
                sockfd->Close();
                continue;
            }
            else if(pid == 0)
            {
                _listen_socket->Close(); // the child does not accept connections
                // child
                if(fork() > 0)
                    exit(OK);
                // grandchild
                service(sockfd,client);
                exit(OK);
            }
            else
            {
                sockfd->Close(); // the parent does not use the connection
                // parent: reap the child
                waitpid(pid,nullptr,0);
            }
        }
    }

private:
    std::unique_ptr<Socket> _listen_socket;
    bool _running;
};
4.2.4 socket.hpp
cpp 复制代码
#pragma once

#include "common.hpp"
#include "log.hpp"
#include <memory>

using namespace LogModule;

// Descriptor value meaning "no socket yet".
const static int default_sockfd = -1;
// Backlog passed to ListenOrDie by BuildTcpServer.
const static int default_backlog = 16;

// Abstract socket interface plus two helpers that combine the primitive steps
// into a ready-to-use server or client socket.
class Socket
{
public:
    // Socket objects are owned and destroyed through base-class smart
    // pointers, so the destructor must be virtual.
    virtual ~Socket() = default;

    virtual void SocketOrDie() = 0; // pure virtual: implemented by derived classes
    virtual void BindOrDie(uint16_t port) = 0;
    virtual void ListenOrDie(int backlog) = 0;
    virtual void ConnectOrDie(std::string &server_ip, uint16_t server_port) = 0;
    virtual std::shared_ptr<Socket> Accept(InetAddr* client) = 0;
    virtual int Recv(std::string* out) = 0;
    virtual int Send(const std::string& in) = 0;
    virtual void Close() = 0;
public:

    // Server setup: create the socket, bind it to port, start listening with
    // default_backlog.
    void BuildTcpServer(uint16_t port)
    {
        SocketOrDie();
        BindOrDie(port);
        ListenOrDie(default_backlog);
    }

    // Client setup: create the socket and connect it to server_ip:server_port.
    void BuildTcpClient(std::string &server_ip, uint16_t server_port)
    {
        SocketOrDie();
        ConnectOrDie(server_ip,server_port);
    }
};

class TcpSocket : public Socket
{
public:
    TcpSocket(int sockfd = default_sockfd)
        : _sockfd(sockfd)
    {
    }
    virtual void Close() override
    {
        if (_sockfd != default_sockfd)
            ::close(_sockfd); // ::表示调用 全局作用域 中的 close 函数
    }
    virtual void SocketOrDie() override
    {
        _sockfd = ::socket(AF_INET, SOCK_STREAM, 0);
        if (_sockfd < 0)
        {
            LOG(LogLevel::FATAL) << "socket error!";
            exit(SOCKET_ERROR);
        }
        LOG(LogLevel::INFO) << "socket success, socket: " << _sockfd;
    }

    virtual void BindOrDie(uint16_t port) override
    {
        InetAddr local(port);
        int n = ::bind(_sockfd, CONST_CONV(local.Addr()), local.AddrLen());
        if (n < 0)
        {
            LOG(LogLevel::FATAL) << "bind error!";
            exit(BIND_ERROR);
        }
        LOG(LogLevel::INFO) << "bind success, socket: " << _sockfd;
    }

    virtual void ListenOrDie(int backlog) override
    {
        int n = ::listen(_sockfd, default_backlog);
        if (n < 0)
        {
            LOG(LogLevel::FATAL) << "listen error!";
            exit(LISTEN_ERROR);
        }
        LOG(LogLevel::INFO) << "listen success, sockfd: " << _sockfd;
    }

    virtual void ConnectOrDie(std::string &server_ip, uint16_t server_port) override
    {
        InetAddr server(server_ip, server_port);
        int n = ::connect(_sockfd, CONST_CONV(server.Addr()), server.AddrLen());
        if (n < 0)
        {
            LOG(LogLevel::FATAL) << "connect error!";
            exit(CONNECT_ERROR);
        }
        LOG(LogLevel::INFO) << "connect success, sockfd: " << _sockfd;
    }

    virtual std::shared_ptr<Socket> Accept(InetAddr* client) override
    {
        std::cout << std::endl;

        struct sockaddr_in addr;
        socklen_t len = sizeof(addr);
        int fd = ::accept(_sockfd, CONV(addr), &len);
        if (fd < 0)
        {
            LOG(LogLevel::WARNING) << "accept failed";
            return nullptr;
        }
        client->SetAddr(addr);
        LOG(LogLevel::INFO) << "accept success, client: " << client->StringAddr();

        return std::make_shared<TcpSocket>(fd); // 这个server的sockfd就可以调用Recv和Send方法。
    }

    virtual int Recv(std::string* out) override
    {
        char buf[1024*8];
        ssize_t n = ::recv(_sockfd,buf,sizeof(buf)-1,0);
        if(n > 0)
        {
            buf[n] = 0;
            *out += buf; // += 可能要不断的读 
        }

        return n;
    }

    virtual int Send(const std::string& in) override
    {
        return ::send(_sockfd,in.c_str(),in.size(),0);
    }

private:
    int _sockfd; // 既可以是listen_sockfd,也可以是sockfd,复用代码。
};
4.2.5 inet_addr.hpp
cpp 复制代码
#pragma once

#include <iostream>
#include <string>
#include <cstring>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/// IPv4 endpoint kept in two forms: the network-order `sockaddr_in` used by
/// the socket calls, and the host-side textual IP plus host-order port.
class InetAddr
{
public:
    /// Empty address: zeroed sockaddr_in, empty IP string, port 0.
    /// Intended to be filled in later through SetAddr().
    InetAddr() = default;

    /// Wildcard (INADDR_ANY) address on `port`, for binding a server.
    /// The textual IP is left empty.
    InetAddr(uint16_t port) : _port(port)
    {
        // host -> network
        _addr.sin_family = AF_INET;
        _addr.sin_addr.s_addr = htonl(INADDR_ANY);
        _addr.sin_port = htons(_port);
    }

    /// Build from a network-order sockaddr_in (e.g. one filled by accept()).
    InetAddr(const struct sockaddr_in &addr)
    {
        SetAddr(addr);
    }

    /// Build from a dotted-decimal IPv4 string and a host-order port.
    InetAddr(const std::string &ip, uint16_t port)
        : _ip(ip), _port(port)
    {
        // host -> network
        _addr.sin_family = AF_INET;
        // NOTE(review): the inet_pton result is not reported to the caller;
        // for an unparsable string sin_addr is expected to keep its zeroed
        // value - confirm whether callers need to detect this.
        inet_pton(AF_INET, _ip.c_str(), &_addr.sin_addr);
        _addr.sin_port = htons(_port);
    }

    /// Replace the stored address with `addr` and refresh the textual IP and
    /// host-order port derived from it.
    void SetAddr(const struct sockaddr_in &addr)
    {
        // network -> host
        _addr = addr;

        char buf[INET_ADDRSTRLEN];
        const char *text = inet_ntop(AF_INET, &_addr.sin_addr, buf, sizeof(buf));
        // On failure inet_ntop leaves buf unspecified, so never read it then.
        _ip = (text != nullptr) ? text : "";

        _port = ntohs(_addr.sin_port);
    }

    /// Textual IP (empty when none has been set).
    std::string Ip() const
    {
        return _ip;
    }

    /// Port in host byte order.
    uint16_t Port() const
    {
        return _port;
    }

    /// Network-order address for bind/connect/etc.
    const struct sockaddr_in& Addr() const
    {
        return _addr;
    }

    /// Size of the stored sockaddr_in.
    socklen_t AddrLen() const
    {
        return sizeof(_addr);
    }

    /// "ip:port" rendering, used for logging.
    std::string StringAddr() const
    {
        return _ip + ":" + std::to_string(_port);
    }

    /// Two addresses are equal only when both IP and port match, so several
    /// ports on the same IP count as distinct peers.
    bool operator==(const InetAddr &addr) const
    {
        return _ip == addr._ip && _port == addr._port;
    }

private:
    struct sockaddr_in _addr{}; // network byte order
    std::string _ip;            // textual IPv4 address
    uint16_t _port{0};          // host byte order
};
4.2.6 服务器日志

5、结果展示及完整代码

6、项目总结

  • 不足:
    • 没有做到全站搜索,还有一些html文档不在doc/html路径下,并且只支持搜索一个版本的Boost库。
    • cppjieba分词后的结果中可能包含停用词(暂停词),目前尚未过滤。
  • 可优化:
    • 可以利用爬虫技术,获取文档资源,并定期更新重新构建索引。
    • 可以进行热词统计,显示搜索关键词。
    • 可以添加登录注册的功能,引入对MySQL的使用。