深入解析大语言模型的网页总结原理

一、引言

现在豆包、Kimi 这些 AI 应用都推出了浏览器插件，可以非常方便地进行网页总结。但大语言模型本身并不支持直接获取互联网信息，这类功能都是由应用层先获取网页内容，再组织成提示词交给大语言模型回答。

这里通过我之前封装的

浏览器引擎-Playwright工具封装

大语言模型客户端的工厂模式实践

然后配上 DeepSeek 的 API 进行大语言模型网页总结实践。

二、网页总结

裸 LLM 不支持获取网页内容，也不知道最新的互联网信息。

python 复制代码

import asyncio

from src.llm.client import OpenAIClient
from src.llm.config import OpenAIConfig
from src.llm.factory import LLMFactory
from src.llm.schemas import LLMType, OpenAIModel
from src.settings import llm_setting


def get_llm_client() -> OpenAIClient:
    """Build the LLM client used by this demo.

    The API key and base URL come from ``llm_setting`` (DeepSeek values), and
    the client is produced by ``LLMFactory`` with ``LLMType.OPENAI``.
    """
    deepseek_config = OpenAIConfig(
        api_key=llm_setting.deepseek_api_key,
        base_url=llm_setting.deepseek_base_url,
        llm_model=OpenAIModel.DEEPSEEK_CODER,
    )
    return LLMFactory.build(llm_type=LLMType.OPENAI, llm_config=deepseek_config)


async def main():
    """Send two questions straight to the LLM and print each reply.

    No web content is attached to the questions; the replies are printed as-is.
    """
    llm_client = get_llm_client()

    # Each entry pairs the label printed before the question with the question.
    prompts = (
        ("query:", "总结这篇文章 https://juejin.cn/post/7283532551473725497"),
        ("\nquery:", "厦门今天天气"),
    )
    for label, question in prompts:
        print(label, question)
        answer = await llm_client.aask(question)
        print("resp:", answer)


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

这里通过 LLMFactory 工厂类构造大语言模型客户端，测试结果如图

总结文章时，LLM 在乱回答，给出的内容与文章的实际内容完全不符
查询天气时，LLM 无法回答，只会让你自己去查询天气信息

浏览器引擎获取网页内容

通过浏览器引擎- playwright 来获取网页内容，然后增强 llm

python 复制代码

import asyncio

from py_tools.utils import RegexUtil

from src.llm.client import OpenAIClient
from src.llm.config import OpenAIConfig
from src.llm.factory import LLMFactory
from src.llm.schemas import LLMType, OpenAIModel
from src.settings import llm_setting
from src.tools.browser_engine import BrowserEngine


def get_llm_client() -> OpenAIClient:
    """Create the LLM client for the web-page summary demo.

    Reads the DeepSeek API key and base URL from ``llm_setting`` and asks
    ``LLMFactory`` for a client of type ``LLMType.OPENAI``.
    """
    client_config = OpenAIConfig(
        api_key=llm_setting.deepseek_api_key,
        base_url=llm_setting.deepseek_base_url,
        llm_model=OpenAIModel.DEEPSEEK_CODER,
    )
    return LLMFactory.build(llm_type=LLMType.OPENAI, llm_config=client_config)


async def main():
    """Summarize a web page by appending its text to the question.

    Links are extracted from the question with ``RegexUtil``, fetched through
    ``BrowserEngine``, and each page's ``inner_text`` is appended to the
    question before the answer is streamed to stdout.
    """
    llm_client = get_llm_client()

    question = "总结这篇文章 https://juejin.cn/post/7283532551473725497"
    print("query:", question)

    links = RegexUtil.find_http_links(question)
    print(links)

    engine = BrowserEngine(headless=True)
    pages = await engine.fetch_page_content(links, timeout=3)
    # Each page's text followed by a newline, in the order pages were returned.
    page_text = "".join(f"{page.inner_text}\n" for page in pages)

    prompt = f"{question}文章内容如下 {page_text}"
    print("answer:")
    for token in llm_client.ask(prompt, stream=True):
        print(token, end="")


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

先通过正则把用户的问题的网页链接提取出来
然后用 BrowserEngine 浏览器引擎获取网页链接内容
用户问题拼接上网页内容后再提问

注意：使用浏览器引擎 Playwright 获取网页内容时，由于它自带浏览器内核，需要先渲染网页再提取内容，整体耗时相对较长。使用 requests、aiohttp 等库直接请求的性能会更好，但现在的网页很少直接返回完整的 HTML 结构，大部分内容都是通过 JS 动态渲染的，因此这些库很难直接拿到浏览器渲染之后的网页内容。

封装网页总结优化提示词

上面已经实现了网页总结功能，但网页总结逻辑太分散了，这里通过封装一个 WebPageSummaryAction 来把网页总结逻辑组合，并配置比较完善的提示词模板进行组织。

python 复制代码

import asyncio

from py_tools.utils import RegexUtil

from src.actions import WebPageSummaryAction
from src.llm.client import OpenAIClient
from src.llm.config import OpenAIConfig
from src.llm.factory import LLMFactory
from src.llm.schemas import LLMType, OpenAIModel
from src.settings import llm_setting
from src.tools.browser_engine import BrowserEngine


def get_llm_client() -> OpenAIClient:
    """Return an LLM client built from the DeepSeek settings.

    ``llm_setting`` supplies the API key and base URL; ``LLMFactory`` builds
    the client for ``LLMType.OPENAI``.
    """
    settings_config = OpenAIConfig(
        api_key=llm_setting.deepseek_api_key,
        base_url=llm_setting.deepseek_base_url,
        llm_model=OpenAIModel.DEEPSEEK_CODER,
    )
    return LLMFactory.build(llm_type=LLMType.OPENAI, llm_config=settings_config)


async def main():
    """Summarize one article through ``WebPageSummaryAction`` and print the tokens."""
    llm_client = get_llm_client()

    question = "总结这篇文章 https://juejin.cn/post/7283532551473725497"
    print("query:", question)

    summarizer = WebPageSummaryAction(llm_client=llm_client)
    tokens = await summarizer.run(question, stream=True)
    # Print tokens back to back so the streamed answer reads as one text.
    for piece in tokens:
        print(piece, end="")


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

网页总结的操作都封装到了 WebPageSummaryAction 类，调用起来就非常方便了

python 复制代码

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author: Hui
# @File: content_summarize.py
# @Desc: { 内容总结：网页、文档总结等 }
# @Date: 2024/09/20 15:26
from py_tools.utils import RegexUtil

from src.llm.base import BaseLLMClient
from src.prompts import action_prompts
from src.tools.browser_engine import BrowserEngine


class WebPageSummaryAction:
    """Summarize the web page(s) linked in a user question.

    Links found in the question are fetched through ``browser_engine``; the
    pages' text is inserted into ``prompt_template`` and sent to the LLM client.
    """

    # System prompt passed to the client's ``setup_system_content`` in __init__.
    sys_prompt = action_prompts.WEBPAGE_SUMMARIZE_SYS_PROMPT
    # User prompt; ``{query}`` and ``{page_context}`` are filled in by ``run``.
    prompt_template = """用户问题：{query}
    已获取到以下网页内容：
    
    网页内容：
    {page_context}
    
    请根据用户的问题和网页内容，为我总结其主要信息
    """
    # Class-level attribute: the same engine object is visible to all instances.
    browser_engine = BrowserEngine(headless=True)

    def __init__(self, llm_client: BaseLLMClient):
        """Store the client and hand it this action's system prompt.

        Args:
            llm_client: Client used to query the model. ``setup_system_content``
                is called on it here, which matters when the client is shared
                with other actions.
        """
        self.client = llm_client
        self.client.setup_system_content(self.sys_prompt)

    async def _get_page_context(self, query: str) -> str:
        """Return the text of every page linked in ``query``.

        Each page's ``inner_text`` is followed by a newline. An empty string is
        returned when ``query`` contains no links.
        """
        urls = RegexUtil.find_http_links(query)
        if not urls:
            return ""

        web_pages = await self.browser_engine.fetch_page_content(urls, timeout=3)
        return "".join(f"{web_page.inner_text}\n" for web_page in web_pages)

    async def run(self, query: str, stream: bool = False):
        """Fetch the linked page content and ask the LLM for a summary.

        Args:
            query: The user's question, expected to contain the page link(s).
            stream: Forwarded unchanged to the client's ``aask``.

        Returns:
            Whatever the client's ``aask`` returns for the formatted prompt.
        """
        page_context = await self._get_page_context(query)
        prompt = self.prompt_template.format(query=query, page_context=page_context)
        return await self.client.aask(query=prompt, stream=stream)

其内部维护了一个 llm 客户端与浏览器引擎，以及相关的系统提示词与用户提示词模板

网页总结系统提示词如下

python 复制代码

# Web page summarization: system prompt read by WebPageSummaryAction.sys_prompt.
WEBPAGE_SUMMARIZE_SYS_PROMPT = """你是一个智能系统，专门用于对网页内容进行总结。你的任务是根据提供的网页内容生成简明的摘要。总结应抓住核心要点，突出文章的主要内容，保持简洁易懂。以下是任务的具体要求：

- 读取网页内容并理解其主要信息。
- 提取出最重要的观点、事实或结论。

请基于提供的网页内容完成摘要任务。
"""

整个逻辑就是

正则获取用户需要总结的网页地址
通过浏览器引擎（BrowserEngine）获取网页地址内容
内设 llm 的系统提示词，把网页内容作为用户的上下文信息进行组织用户提示词模板
根据新的提示词向 llm 提问

提示词对于大语言模型来说其实很重要，好的提示词可以让llm生成较准确的内容

加上意图识别

上面的网页总结能够直接使用，是因为已经明确了具体的使用场景。像豆包、Kimi 的浏览器插件助手就明确了网页总结这一场景；但它们网页端的会话是比较通用的问答，并不知道用户的意图。因此需要封装一个意图识别模块，根据识别出的意图动态选择不同的工具来增强 LLM。

意图枚举如下

python 复制代码

class IntentType(Enum):
    """Intent categories that a user question can be classified into."""

    # Plain question answered by the bare LLM.
    NORMAL = "NORMAL"
    # Internet search.
    SEARCH = "SEARCH"
    # Web page summarization.
    WEBPAGE_SUMMARIZE = "WEBPAGE_SUMMARIZE"
    # Document summarization.
    DOC_SUMMARIZE = "DOC_SUMMARIZE"

意图识别Action封装

python 复制代码

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author: Hui
# @File: intent_recognition.py
# @Desc: { 意图识别模块 }
# @Date: 2024/08/24 22:55
from py_tools.logging import logger

from src.actions.schemas import IntentType
from src.llm.base import BaseLLMClient
from src.prompts import action_prompts


class IntentRecognitionAction:
    """Classify a user question into an ``IntentType`` by asking the LLM."""

    # System prompt passed to the client's ``setup_system_content`` in __init__.
    sys_prompt = action_prompts.INTENT_RECOGNITION_SYS_PROMPT
    # User prompt; ``{query}`` is replaced with the user's question in ``run``.
    prompt_template = """请根据用户的问题返回最匹配的意图类型名称。问题: {query}"""

    def __init__(self, llm_client: BaseLLMClient):
        """Store the client and hand it this action's system prompt.

        Args:
            llm_client: Client used to query the model. ``setup_system_content``
                is called on it here, which matters when the client is shared
                with other actions.
        """
        self.client = llm_client
        self.client.setup_system_content(self.sys_prompt)

    async def run(self, query: str, stream: bool = False) -> IntentType:
        """Return the intent recognized for ``query``.

        Args:
            query: The user's question.
            stream: Forwarded unchanged to the client's ``aask``.

        Returns:
            The ``IntentType`` whose value equals the stripped reply, or
            ``IntentType.NORMAL`` when anything goes wrong.
        """
        # NOTE(review): with stream=True the reply is presumably a token
        # iterator, whose str() would not match any IntentType value and would
        # therefore fall back to NORMAL - confirm against the client.
        try:
            prompt = self.prompt_template.format(query=query)
            reply = await self.client.aask(query=prompt, stream=stream)
            intent_type = IntentType(str(reply).strip())
        except Exception as e:
            # Best-effort by design: a failed LLM call or an unrecognized reply
            # is logged and treated as a normal question.
            logger.warning(str(e))
            intent_type = IntentType.NORMAL
        return intent_type

意图识别系统提示词

python 复制代码

# Intent recognition: system prompt read by IntentRecognitionAction.sys_prompt.
# The intent names listed in it match the IntentType enum values.
INTENT_RECOGNITION_SYS_PROMPT = """你是一个意图识别模型，需要根据用户的输入判断其意图类型。以下是你可以返回的意图类型及其定义：

- NORMAL: 用户的问题是日常询问，不需要搜索或对内容进行总结。
- SEARCH: 用户的问题表明需要进行信息检索或搜索操作来找到答案。
- WEBPAGE_SUMMARIZE: 用户的问题表明需要对一个网页的内容进行总结。
- DOC_SUMMARIZE: 用户的问题表明需要对一个文档内容进行总结。

任务：
1. 仔细阅读用户的问题。
2. 根据问题内容，判断最符合的意图类型。
3. 返回意图类型名称（例如：NORMAL, SEARCH, WEBPAGE_SUMMARIZE, DOC_SUMMARIZE）。

例子：
- 用户输入："你好"
  返回：NORMAL
  
- 用户输入："厦门天气情况"
  返回：SEARCH

- 用户输入："帮我总结这篇文章 https://juejin.cn/post/7283532551473725497"
  返回：WEBPAGE_SUMMARIZE

- 用户输入："这个PDF文档的主要内容是什么？"
  返回：DOC_SUMMARIZE

请根据这些指引对用户的问题进行意图识别。
"""

效果测试

python 复制代码

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author: Hui
# @File: main.py
# @Desc: { 模块描述 }
# @Date: 2024/08/20 14:24
import asyncio

from src.actions import IntentRecognitionAction, InternetSearchAction, WebPageSummaryAction
from src.actions.schemas import IntentType
from src.llm.client import OpenAIClient
from src.llm.config import OpenAIConfig
from src.llm.factory import LLMFactory
from src.llm.schemas import LLMType, OpenAIModel
from src.settings import llm_setting


def get_llm_client() -> OpenAIClient:
    """Build the LLM client shared by the actions in this script.

    The DeepSeek API key and base URL are read from ``llm_setting``, and the
    client is created by ``LLMFactory`` for ``LLMType.OPENAI``.
    """
    shared_config = OpenAIConfig(
        api_key=llm_setting.deepseek_api_key,
        base_url=llm_setting.deepseek_base_url,
        llm_model=OpenAIModel.DEEPSEEK_CODER,
    )
    return LLMFactory.build(llm_type=LLMType.OPENAI, llm_config=shared_config)


async def internet_recognition_action(query: str):
    """Recognize the intent of ``query`` and answer it with the matching action.

    The question, the recognized intent, and the answer are printed to stdout.
    A single LLM client is shared by the intent recognizer and the chosen
    action.

    Args:
        query: The user's question.
    """
    print("query:", query)
    client = get_llm_client()

    # Step 1: intent recognition.
    recognizer = IntentRecognitionAction(llm_client=client)
    intent = await recognizer.run(query)
    print("intent type:", intent)

    # Step 2: dispatch on the recognized intent.
    if intent == IntentType.WEBPAGE_SUMMARIZE:
        # Web page summary, streamed token by token.
        summarizer = WebPageSummaryAction(llm_client=client)
        for piece in await summarizer.run(query, stream=True):
            print(piece, end="")

    elif intent == IntentType.DOC_SUMMARIZE:
        # Document summary: only a placeholder reply is printed.
        print("resp: 暂无")

    elif intent == IntentType.SEARCH:
        # Internet search.
        answer = await InternetSearchAction(llm_client=client).run(query)
        print("resp:", answer)

    else:
        # Bare LLM: the client is shared, so the system prompt configured by
        # the intent recognizer has to be cleared first.
        client.setup_system_content()
        answer = await client.aask(query=query)
        print("resp:", answer)

    print()


async def main():
    """Run the intent-routing demo over a fixed set of sample questions."""
    sample_queries = (
        "深圳天气情况",
        "总结这pdf",
        "你是谁",
        "总结这篇文章 https://juejin.cn/post/7283532551473725497",
    )
    # Questions are handled one at a time, in the order listed.
    for sample in sample_queries:
        await internet_recognition_action(sample)


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

三、总结

大语言模型并非万能，许多细节需要通过专业的提示词设计和应用层技术手段来增强大语言模型功能，从而实现网页总结、联网搜索、私有知识库检索、AI智能体等应用场景。

四、源代码

AGI-Demo：AGI技术练习案例 Github：github.com/HuiDBK/AGI-...