Java8 API 文档搜索引擎_2.索引模块（程序）

文档搜索引擎模块划分（第一篇）见系列文章：

https://blog.csdn.net/m0_63299495/article/details/145805937

本文为索引模块程序篇。

[3.1 索引模块](#3.1 索引模块)

[3.1.1 Parser类](#3.1.1 Parser类)

[3.1.2 Index类](#3.1.2 Index类)

[1. DocInfo类和Weight类](#1. DocInfo类和Weight类)

[2. 成员方法](#2. 成员方法)

3.1 索引模块

Parser相当于是制作索引的入口，Index相当于实现了索引的数据结构，提供了一些API，Parser调用Index类，即可制作整个索引。

3.1.1 Parser类

Parser类用于读取和解析下载的文档，制作并输出索引到文件中：

从指定目录中枚举所有子文件，读取每个文件，从文件中解析出HTML的标题、正文、URL；

java 复制代码

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

// Parser类用于读取和解析下载的文档，并制作索引
/**
 * Entry point for building the index: enumerates the downloaded HTML documents,
 * extracts title, URL and body text from each one, feeds them to an
 * {@link Index}, and finally asks the index to persist itself.
 */
public class Parser {
    /** Root directory of the locally downloaded documentation that is scanned. */
    private static final String INPUT_PATH="E:/SearchEngineProject/jdk-8u441-docs-all/docs/api/";
    /** Base address of the online documentation; local relative paths are appended to it. */
    private static final String ONLINE_BASE_URL = "https://docs.oracle.com/javase/8/docs/api/";
    /** File-name suffix of the documents that are indexed. */
    private static final String HTML_SUFFIX = ".html";

    /** The index that receives every parsed document. */
    private final Index index = new Index();

    /**
     * Builds the whole index: enumerate files, parse each one, save the index.
     * Progress and elapsed times are printed to standard output.
     */
    public void run(){
        long beg = System.currentTimeMillis();
        System.out.println("开始构造索引");

        // 1. Recursively collect every HTML file below INPUT_PATH.
        ArrayList<File> fileList = new ArrayList<>();
        enumFile(INPUT_PATH, fileList);
        long enumFileEnd = System.currentTimeMillis();
        System.out.println("枚举文件耗时："+(enumFileEnd-beg)+" ms");

        // 2. Parse each file and add it to the in-memory index.
        for (File f : fileList) {
            System.out.println("开始解析 "+f.getAbsolutePath());
            parseHTML(f);
        }
        long forEnd = System.currentTimeMillis();
        System.out.println("遍历文件耗时："+(forEnd-enumFileEnd)+" ms");

        // 3. Persist the in-memory index structures.
        index.save();
        System.out.println("完成构造索引");
        long end = System.currentTimeMillis();
        System.out.println("构建索引耗时："+(end - beg)+" ms ");
    }

    /**
     * Parses a single HTML file and adds the result to the index.
     *
     * @param f the HTML file to parse
     */
    private void parseHTML(File f) {
        String title = parseTitle(f);
        String url = parseURL(f);
        String content = parseContent(f);
        index.addDoc(title, url, content);
    }

    /**
     * Extracts the text of an HTML file by dropping everything between
     * {@code '<'} and {@code '>'}. Line-feed and carriage-return characters
     * outside tags are replaced by a single space each.
     *
     * @param f the HTML file to read
     * @return the text outside tags, or an empty string if the file cannot be read
     */
    public String parseContent(File f) {
        // NOTE(review): FileReader decodes with the platform default charset —
        // confirm this matches the encoding of the downloaded documents.
        try (BufferedReader reader = new BufferedReader(new FileReader(f))) {
            // true while outside a tag (characters are copied), false while inside one
            boolean isCopy = true;
            StringBuilder content = new StringBuilder();
            int ret;
            // read() returns -1 at end of file
            while ((ret = reader.read()) != -1) {
                char c = (char) ret;
                if (isCopy) {
                    if (c == '<') {
                        isCopy = false;
                        continue;
                    }
                    // flatten line breaks so the text contains no blank lines
                    if (c == '\n' || c == '\r') {
                        c = ' ';
                    }
                    content.append(c);
                } else if (c == '>') {
                    isCopy = true;
                }
            }
            return content.toString();
        } catch (IOException e) {
            // Best effort: a file that cannot be read is indexed with empty content.
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Builds the online URL of a document: the fixed online base address plus
     * the file's path relative to {@code INPUT_PATH}.
     *
     * @param f a file located below {@code INPUT_PATH}
     * @return the URL of the corresponding online page
     */
    private String parseURL(File f) {
        String relativePath = f.getAbsolutePath().substring(INPUT_PATH.length());
        // The absolute path uses the platform separator ('\' on Windows); URLs need '/'.
        return ONLINE_BASE_URL + relativePath.replace(File.separatorChar, '/');
    }

    /**
     * Derives the document title from the file name by removing the
     * {@code .html} suffix.
     *
     * @param f a file whose name ends with {@code .html}
     * @return the file name without its suffix
     */
    private String parseTitle(File f) {
        String name = f.getName();
        return name.substring(0, name.length() - HTML_SUFFIX.length());
    }

    /**
     * Recursively collects every {@code .html} file below a directory.
     *
     * @param inputPath directory at which the traversal starts
     * @param fileList  receives the files that were found
     */
    private void enumFile(String inputPath, ArrayList<File> fileList) {
        File[] files = new File(inputPath).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            System.err.println("无法读取目录："+inputPath);
            return;
        }
        for (File f : files) {
            if (f.isDirectory()) {
                // descend into the sub-directory
                enumFile(f.getAbsolutePath(), fileList);
            } else if (f.getAbsolutePath().endsWith(HTML_SUFFIX)) {
                // only HTML files are part of the result
                fileList.add(f);
            }
        }
    }

    /**
     * Builds the index.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // run() declares no checked exception, so no try/catch is needed here.
        new Parser().run();
    }
}

3.1.2 Index类

Index类用于在内存中制作索引。

首先实现两个封装类：DocInfo类表示一篇文档的信息；Weight类封装文档id以及该文档与关键词的相关性权重：

1. DocInfo类和Weight类

DocInfo类：

java 复制代码

/**
 * Information about one indexed document: its id, title, URL and body text.
 * Plain mutable bean with a getter and setter per field.
 */
public class DocInfo {
    private int docId;
    private String title;
    private String url;
    private String content;

    /** @return the document id */
    public int getDocId() {
        return docId;
    }

    /** @param docId the document id */
    public void setDocId(int docId) {
        this.docId = docId;
    }

    /** @return the document title */
    public String getTitle() {
        return title;
    }

    /** @param title the document title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the document URL */
    public String getUrl() {
        return url;
    }

    /** @param url the document URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the document body text */
    public String getContent() {
        return content;
    }

    /** @param content the document body text */
    public void setContent(String content) {
        this.content = content;
    }
}

Weight类：

java 复制代码

/**
 * Pairs a document id with the weight describing how relevant that document
 * is to a search term. Plain mutable bean with a getter and setter per field.
 */
public class Weight {
    // id of the document
    private int docId;
    // relevance of the document to the term: a larger value means more relevant
    private int weight;

    /** @return the document id */
    public int getDocId() {
        return docId;
    }

    /** @param docId the document id */
    public void setDocId(int docId) {
        this.docId = docId;
    }

    /** @return the relevance weight */
    public int getWeight() {
        return weight;
    }

    /** @param weight the relevance weight */
    public void setWeight(int weight) {
        this.weight = weight;
    }
}

2. 成员方法

（1）给定一个docId，在正排索引中查询文档的详细信息：

正排索引采用一个数组实现，数组下标表示文档id，数组元素类型为DocInfo：

java 复制代码

    // Forward index: the position of an element in the list is its docId.
    private ArrayList<DocInfo> forwardIndex = new ArrayList<>();

    /**
     * Looks up the details of a document in the forward index.
     *
     * @param docId position of the document in the forward index
     * @return the document stored at that position
     */
    public DocInfo getDocInfo(int docId){
        final DocInfo found = forwardIndex.get(docId);
        return found;
    }

（2）给定一个词，在倒排索引中查找哪些文档与这个词关联：

倒排索引使用哈希表表示：key是关键词，value是一个数组，数组元素为与该关键词关联的各文档对应的Weight对象：

java 复制代码

    // Inverted index: maps a term to the list of Weight entries of the
    // documents associated with that term.
    private HashMap<String, ArrayList<Weight>> invertedIndex = new HashMap<>();

    /**
     * Looks up the documents associated with a term in the inverted index.
     * A whole user query has to be split into terms first; each term is then
     * looked up separately.
     *
     * @param term the term to look up
     * @return the Weight entries stored for the term, or {@code null} when the
     *         term is not a key of the inverted index
     */
    public List<Weight> getInverted(String term){
        final List<Weight> postings = invertedIndex.get(term);
        return postings;
    }

（3）往索引中新增文档，保存正排索引与倒排索引：

java 复制代码

    /**
     * Adds a document to the index. The document is first appended to the
     * forward index, then registered in the inverted index.
     *
     * @param title   document title
     * @param url     document URL
     * @param content document body text
     */
    public void addDoc(String title, String url, String content){
        // buildForward returns the stored DocInfo, which buildInverted then consumes.
        buildInverted(buildForward(title, url, content));
    }

其中，构建正排索引的方法如下：

java 复制代码

// 构建正排索引
    /**
     * Creates a DocInfo for the given document and appends it to the forward index.
     *
     * @param title   document title
     * @param url     document URL
     * @param content document body text
     * @return the DocInfo that was stored, with its docId assigned
     */
    private DocInfo buildForward(String title, String url, String content) {
        final DocInfo entry = new DocInfo();
        entry.setUrl(url);
        entry.setTitle(title);
        entry.setContent(content);
        // Reading the current size as the new docId and appending the entry happen
        // under the same lock, so the id equals the entry's position in the list.
        synchronized (locker1){
            final int nextId = forwardIndex.size();
            entry.setDocId(nextId);
            forwardIndex.add(entry);
        }
        return entry;
    }

构建倒排索引的方法如下：

java 复制代码

// 构建倒排索引
    /**
     * Registers one document in the inverted index.
     *
     * <p>The title and the body are tokenized, the occurrences of every word are
     * counted separately for title and body, and one Weight per distinct word is
     * appended to that word's list in the inverted index. The weight is
     * {@code titleCount * 10 + contentCount}.
     *
     * @param docInfo the document to register; its docId, title and content are read
     */
    private void buildInverted(DocInfo docInfo) {
        // Per-word tally for this document.
        class WordCount{
            // occurrences of the word in the title
            public int titleCount;
            // occurrences of the word in the body
            public int contentCount;
        }
        HashMap<String, WordCount> wordCountHashMap = new HashMap<>();

        // 1. Tokenize the title and count every word.
        //    (Original author's note: the ansj tokenizer also lower-cases the tokens.)
        List<Term> terms = ToAnalysis.parse(docInfo.getTitle()).getTerms();
        for (Term term : terms) {
            // A fresh WordCount starts with both counters at 0.
            wordCountHashMap.computeIfAbsent(term.getName(), k -> new WordCount()).titleCount += 1;
        }

        // 2. Tokenize the body and count every word.
        terms = ToAnalysis.parse(docInfo.getContent()).getTerms();
        for (Term term : terms) {
            wordCountHashMap.computeIfAbsent(term.getName(), k -> new WordCount()).contentCount += 1;
        }

        // 3. Append one Weight per distinct word to the inverted index.
        for (Map.Entry<String, WordCount> entry : wordCountHashMap.entrySet()) {
            WordCount count = entry.getValue();
            // Built outside the lock: it only touches data local to this call.
            Weight weight = new Weight();
            weight.setDocId(docInfo.getDocId());
            // Assumed weighting: occurrences in the title * 10 + occurrences in the body * 1.
            weight.setWeight(count.titleCount*10 + count.contentCount);
            synchronized (locker2){
                // Creates the list on first sight of the word, then appends to it.
                invertedIndex.computeIfAbsent(entry.getKey(), k -> new ArrayList<>()).add(weight);
            }
        }
    }

（4）把内存中的索引结构保存到磁盘中：

java 复制代码

    /**
     * Writes the in-memory index to disk: the forward index to
     * {@code forward.txt} and the inverted index to {@code inverted.txt}, both
     * inside {@code INDEX_PATH}. The directory is created first if it does not exist.
     *
     * <p>If writing fails, the stack trace is printed and the method returns
     * without printing the completion message.
     */
    public void save(){
        long beg = System.currentTimeMillis();
        System.out.println("开始保存索引");
        // Make sure the target directory exists. A failed mkdirs() surfaces
        // below as an IOException from the first write.
        File indexPathFile = new File(INDEX_PATH);
        if(!indexPathFile.exists()){
            indexPathFile.mkdirs();
        }
        // One file per index structure.
        File forwardIndexFile = new File(INDEX_PATH+"forward.txt");
        File invertedIndexFile = new File(INDEX_PATH+"inverted.txt");
        try {
            objectMapper.writeValue(forwardIndexFile, forwardIndex);
            objectMapper.writeValue(invertedIndexFile, invertedIndex);
        } catch (IOException e) {
            e.printStackTrace();
            // Do not claim success when the index was not written.
            return;
        }
        System.out.println("完成保存索引");
        long end = System.currentTimeMillis();
        System.out.println("索引保存耗时："+(end-beg)+" ms ");
    }

（5）把磁盘中的索引数据加载到内存中：

java 复制代码

    /**
     * Loads the index from disk into memory: the forward index from
     * {@code forward.txt} and the inverted index from {@code inverted.txt},
     * both inside {@code INDEX_PATH}.
     *
     * <p>Both files are read before either field is replaced, so a failure
     * leaves the in-memory index as it was. On failure the stack trace is
     * printed and the method returns without printing the completion message.
     */
    public void load(){
        long beg = System.currentTimeMillis();
        System.out.println("开始加载索引");
        File forwardIndexFile = new File(INDEX_PATH+"forward.txt");
        File invertedIndexFile = new File(INDEX_PATH+"inverted.txt");
        try{
            // The anonymous TypeReference subclasses carry the full generic type
            // of each structure to the deserializer.
            ArrayList<DocInfo> loadedForward =
                    objectMapper.readValue(forwardIndexFile, new TypeReference<ArrayList<DocInfo>>() {});
            HashMap<String, ArrayList<Weight>> loadedInverted =
                    objectMapper.readValue(invertedIndexFile, new TypeReference<HashMap<String, ArrayList<Weight>>>() {});
            forwardIndex = loadedForward;
            invertedIndex = loadedInverted;
        }catch (IOException e){
            e.printStackTrace();
            // Do not claim success when the index was not loaded.
            return;
        }
        System.out.println("完成加载索引");
        long end = System.currentTimeMillis();
        System.out.println("索引加载耗时："+(end-beg) +" ms ");
    }

程序详细解释见后续文章。