UVa 1597 Searching the Web

题目分析

问题描述

本题要求实现一个小型搜索引擎，基于倒排索引结构处理布尔查询。系统需要从给定的文档集合中构建索引，并支持四种查询类型：

单个词查询
AND 查询（两个词的交集）
OR 查询（两个词的并集）
NOT 查询（不包含某个词的文档）

输入格式

文档数量 $N$（$0 < N < 100$）
$N$ 个文档，每个文档以 ********** 结束
查询数量 $M$（$0 < M \leq 50000$）
$M$ 个查询，每行一个

输出要求

对于普通查询，输出包含查询词的行
对于 NOT 查询，输出整个文档
文档间用 ---------- 分隔
每个查询结果以 ========== 结束
无结果时输出 Sorry, I found nothing.

关键约束

词法分析：非字母字符分隔单词，忽略大小写
停用词："the", "a", "to", "and", "or", "not" 不参与索引和查询
不考虑词形变化和连字符组合

解题思路

1. 数据结构设计

使用以下核心数据结构：

vector<vector<string>> documents：存储原始文档内容
unordered_map<string, vector<pair<int, int>>> invertedIndex：倒排索引，键为词，值为 (文档ID, 行号) 列表
unordered_set<string> stopWords：停用词集合

2. 索引构建流程

文档读取 ：逐行读取文档，遇到 ********** 结束
词法分析 ：
- 转换为小写
- 按非字母字符分词
- 过滤停用词
索引更新 ：对每个有效词，记录其出现的 (文档ID, 行号)

3. 查询处理策略

单个词查询

直接从倒排索引获取对应列表，按文档和行号排序输出

AND 查询

分别获取两个词的文档集合
求交集（两个词都出现的文档）
合并匹配行并去重输出

OR 查询

分别获取两个词的文档集合
求并集（任一词出现的文档）
合并匹配行并去重输出

NOT 查询

获取查询词的文档集合
输出所有不包含该词的完整文档

4. 输出处理优化

使用 set 自动去重和排序
维护文档顺序和行号顺序
正确处理分隔符的插入时机

参考代码

cpp 复制代码

// Searching the Web
// UVa ID: 1597
// Verdict: Accepted
// Submission Date: 2025-11-01
// UVa Run Time: 0.030s
//
// 版权所有（C）2025，邱秋。metaphysis # yeah dot net

#include <bits/stdc++.h>

using namespace std;

// Stop words: these are never added to the inverted index.
const std::unordered_set<std::string> STOP_WORDS{
    "the", "a", "to", "and", "or", "not",
};

// Returns a copy of `inputStr` with every character passed through tolower().
//
// Each character is converted to unsigned char before the call: passing a
// negative plain-char value (any byte >= 0x80 where char is signed) to
// tolower() is undefined behaviour.
std::string toLowerCase(const std::string& inputStr) {
    std::string result = inputStr;
    for (char& ch : result) {
        ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
    }
    return result;
}

// Splits `lineText` into lowercase words.
//
// A word is a maximal run of characters for which isalpha() is true; every
// other character acts as a separator and is discarded. Words are returned in
// the order they appear; an input without letters yields an empty vector.
//
// Characters are converted to unsigned char before being handed to
// isalpha()/tolower(), because calling those functions with a negative
// plain-char value (bytes >= 0x80 where char is signed) is undefined behaviour.
std::vector<std::string> tokenizeLine(const std::string& lineText) {
    std::vector<std::string> tokens;
    std::string currentToken;
    for (const char rawCh : lineText) {
        const unsigned char ch = static_cast<unsigned char>(rawCh);
        if (std::isalpha(ch)) {
            currentToken.push_back(static_cast<char>(std::tolower(ch)));
        } else if (!currentToken.empty()) {
            // A separator closes the word collected so far.
            tokens.push_back(currentToken);
            currentToken.clear();
        }
    }
    // The line may end in the middle of a word.
    if (!currentToken.empty()) {
        tokens.push_back(currentToken);
    }
    return tokens;
}

// One occurrence of a word: (document id, zero-based line index in that document).
using Posting = pair<int, int>;
// Occurrences of one word. main() appends them in document order, then line
// order, and never stores the same posting twice, so every list is sorted and
// free of duplicates.
using PostingList = vector<Posting>;
using InvertedIndex = unordered_map<string, PostingList>;
using DocumentSet = vector<vector<string>>;

// Returns `text` without leading/trailing whitespace (including a stray '\r').
static string trimWhitespace(const string& text) {
    const char kBlanks[] = " \t\r\n\f\v";
    const size_t first = text.find_first_not_of(kBlanks);
    if (first == string::npos) return string();
    const size_t last = text.find_last_not_of(kBlanks);
    return text.substr(first, last - first + 1);
}

// Trims and lowercases a query term so it compares equal to indexed words.
static string normalizeTerm(const string& rawTerm) {
    string term = trimWhitespace(rawTerm);
    for (char& ch : term) {
        ch = static_cast<char>(tolower(static_cast<unsigned char>(ch)));
    }
    return term;
}

// Looks up `term` without inserting it; unknown terms map to an empty list.
static const PostingList& findPostings(const InvertedIndex& index, const string& term) {
    static const PostingList kNoPostings;
    const auto it = index.find(term);
    return it != index.end() ? it->second : kNoPostings;
}

// Returns one flag per document, set when the document appears in `postings`.
static vector<char> markDocuments(const PostingList& postings, size_t docCount) {
    vector<char> present(docCount, 0);
    for (const Posting& posting : postings) present[posting.first] = 1;
    return present;
}

// Merges two sorted, duplicate-free posting lists into one sorted, duplicate-free list.
static PostingList mergePostings(const PostingList& lhs, const PostingList& rhs) {
    PostingList merged;
    merged.reserve(lhs.size() + rhs.size());
    set_union(lhs.begin(), lhs.end(), rhs.begin(), rhs.end(), back_inserter(merged));
    return merged;
}

// Prints the lines named by `hits` (sorted by document, then line), with a
// dashed separator between documents, or the "nothing found" message.
static void printMatchedLines(const DocumentSet& documents, const PostingList& hits) {
    if (hits.empty()) {
        cout << "Sorry, I found nothing.\n";
        return;
    }
    int prevDocId = -1;
    for (const Posting& hit : hits) {
        if (hit.first != prevDocId) {
            if (prevDocId != -1) cout << "----------\n";
            prevDocId = hit.first;
        }
        cout << documents[hit.first][hit.second] << '\n';
    }
}

// Prints, in full, every document that does not appear in `postings`.
static void printDocumentsWithout(const DocumentSet& documents, const PostingList& postings) {
    const vector<char> excluded = markDocuments(postings, documents.size());
    bool printedAny = false;
    for (size_t docId = 0; docId < documents.size(); ++docId) {
        if (excluded[docId]) continue;
        if (printedAny) cout << "----------\n";
        for (const string& line : documents[docId]) {
            cout << line << '\n';
        }
        printedAny = true;
    }
    if (!printedAny) {
        cout << "Sorry, I found nothing.\n";
    }
}

// Evaluates one query line ("a", "a AND b", "a OR b" or "NOT a") and prints
// its result, without the closing "==========" line.
static void answerQuery(const DocumentSet& documents, const InvertedIndex& index,
                        const string& query) {
    // Operators are matched together with their surrounding blanks, so the
    // position used for detection is the same one used for splitting.
    const size_t andPos = query.find(" AND ");
    const size_t orPos = query.find(" OR ");

    if (andPos != string::npos) {
        const PostingList& first = findPostings(index, normalizeTerm(query.substr(0, andPos)));
        const PostingList& second = findPostings(index, normalizeTerm(query.substr(andPos + 5)));
        const vector<char> inFirst = markDocuments(first, documents.size());
        const vector<char> inSecond = markDocuments(second, documents.size());

        // Keep the lines of either term, but only inside documents that contain both.
        PostingList hits = mergePostings(first, second);
        hits.erase(remove_if(hits.begin(), hits.end(),
                             [&](const Posting& hit) {
                                 return !(inFirst[hit.first] && inSecond[hit.first]);
                             }),
                   hits.end());
        printMatchedLines(documents, hits);
    } else if (orPos != string::npos) {
        const PostingList& first = findPostings(index, normalizeTerm(query.substr(0, orPos)));
        const PostingList& second = findPostings(index, normalizeTerm(query.substr(orPos + 4)));
        printMatchedLines(documents, mergePostings(first, second));
    } else if (query.compare(0, 4, "NOT ") == 0) {
        printDocumentsWithout(documents, findPostings(index, normalizeTerm(query.substr(4))));
    } else {
        printMatchedLines(documents, findPostings(index, normalizeTerm(query)));
    }
}

// Reads the documents, builds the inverted index, then answers every query.
//
// Input (stdin): document count; that many documents, each terminated by a
// line "**********"; query count; one query per line.
// Output (stdout): per query, the matching lines (whole documents for NOT)
// with "----------" between documents, then a "==========" line.
int main() {
    ios::sync_with_stdio(false);
    cin.tie(nullptr);

    int docCount = 0;
    if (!(cin >> docCount) || docCount < 0) return 0;
    // Drop the rest of the count line, not just a single character.
    cin.ignore(numeric_limits<streamsize>::max(), '\n');

    DocumentSet documents(docCount);
    InvertedIndex invertedIndex;

    for (int docId = 0; docId < docCount; ++docId) {
        string line;
        while (getline(cin, line)) {
            // Tolerate CRLF input so the end marker is still recognised.
            if (!line.empty() && line.back() == '\r') line.pop_back();
            if (line == "**********") break;

            const int lineIdx = static_cast<int>(documents[docId].size());
            documents[docId].push_back(line);

            for (const string& word : tokenizeLine(line)) {
                if (STOP_WORDS.count(word) != 0) continue;
                PostingList& postings = invertedIndex[word];
                const Posting here(docId, lineIdx);
                // All postings of one line are appended back to back, so
                // comparing with the last entry is enough to skip repeats.
                if (postings.empty() || postings.back() != here) {
                    postings.push_back(here);
                }
            }
        }
    }

    int queryCount = 0;
    if (!(cin >> queryCount)) return 0;
    cin.ignore(numeric_limits<streamsize>::max(), '\n');

    string rawQuery;
    for (int queryIdx = 0; queryIdx < queryCount && getline(cin, rawQuery); ++queryIdx) {
        answerQuery(documents, invertedIndex, trimWhitespace(rawQuery));
        // Every query result ends with this marker.
        cout << "==========\n";
    }

    return 0;
}