AI - 谈谈RAG中的查询分析（2）

大家好，RAG中的查询分析是比较有趣的一个点，内容丰富，并不是一句话能聊得清楚的。今天接着上一篇，继续探讨RAG中的查询分析，并在功能层面和代码层面持续改进。

功能层面

如果用户问了一个不着边际的问题，也就是和工具无关的问题，那么无须调用工具，直接生成答案。否则，就调用工具，检索本地知识库，生成答案。

代码方面

考虑到在对答聊天中，对话状态是如此重要，所以我们可以直接使用LangGraph内置的MessagesState，而不用再自己定义如下所示的State类。
python 复制代码
```
# Hand-written graph state carried over from the previous article; this article
# replaces it with the prebuilt MessagesState.
class State(TypedDict):
    question: str  # presumably the user's question text
    query: Search  # structured query built with the previous article's Search tool
    context: List[Document]  # presumably the documents returned by retrieval
    answer: str  # presumably the generated answer
```
上一篇的Search工具主要用于结构化输出，工具本身没有实质性内容，所以本篇会将retrieve作为一个工具，既可以绑定到LLM，也可以通过LangGraph内置的组件ToolNode，形成一个Graph节点，在收到LLM的应答之后，开始执行从本地知识库语义搜索的动作，最终生成一个ToolMessage。

实例代码

备注：对于本文中的代码片段，主体来源于LangChain官网，有兴趣的读者可以去官网查看。

python 复制代码

import os
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph, MessagesState
from typing_extensions import List, TypedDict
from langchain_core.tools import tool
from langchain_core.messages import SystemMessage
from langgraph.graph import END
from langgraph.prebuilt import ToolNode, tools_condition

# --- Credentials -------------------------------------------------------------
# Placeholder value: replace it with a real key before running. Note that this
# assignment overwrites any OPENAI_API_KEY already present in the environment.
os.environ["OPENAI_API_KEY"] = "your_openai_api_key"

# --- Models and storage ------------------------------------------------------
# Embedding model handed to the vector store below.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Vector store constructed with that embedding model; retrieve() searches it.
vector_store = InMemoryVectorStore(embeddings)

# Chat model used by both query_or_respond() and generate().
llm = ChatOpenAI(model="gpt-4o-mini")

# --- Source document ---------------------------------------------------------
# Load one blog post, restricting parsing to elements with the listed classes.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    },
)
docs = loader.load()

# --- Chunking and indexing ---------------------------------------------------
# Split into chunks of up to 1000 characters with a 200-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)

# Add the chunks to the vector store; the return value is not needed.
_ = vector_store.add_documents(documents=all_splits)

# Define a retrieval tool to get relevant documents for a query
@tool(response_format="content_and_artifact")
def retrieve(query: str):
    """Retrieve information related to a query."""
    # NOTE: the docstring above is kept verbatim because the @tool decorator
    # uses it as the tool description shown to the model.

    # Fetch the two chunks most similar to the query.
    top_chunks = vector_store.similarity_search(query, k=2)

    # Render every chunk as a "Source / Content" pair.
    rendered_chunks = []
    for chunk in top_chunks:
        rendered_chunks.append(
            f"Source: {chunk.metadata}\n" f"Content: {chunk.page_content}"
        )

    # First element: the text content; second: the raw documents (artifact).
    return "\n\n".join(rendered_chunks), top_chunks

# Step 1: Function to generate a tool call or respond based on the state
def query_or_respond(state: MessagesState):
    """Let the model either request the retrieve tool or answer directly.

    Args:
        state: Graph state; only its "messages" list is read.

    Returns:
        A state update whose "messages" list holds the model's single reply.
    """
    # Bind the retrieve tool, then send the conversation so far to the model.
    ai_reply = llm.bind_tools([retrieve]).invoke(state["messages"])
    return {"messages": [ai_reply]}

# Step 2: Wrap the retrieve tool in a prebuilt ToolNode so it can be added to
# the graph as a node (the graph below refers to that node as "tools").
tools = ToolNode([retrieve])

# Step 3: Function to generate a response using retrieved content
def generate(state: MessagesState):
    """Answer the user's question from the retrieved context.

    Args:
        state: Graph state; only its "messages" list is read.

    Returns:
        A state update whose "messages" list holds the model's answer.
    """
    history = state["messages"]

    # Walk back from the end of the history to find where the trailing run of
    # tool messages begins; the slice keeps that run in its original order.
    first_tool_index = len(history)
    while first_tool_index > 0 and history[first_tool_index - 1].type == "tool":
        first_tool_index -= 1
    tool_messages = history[first_tool_index:]

    # The answering instructions followed by the retrieved text form the
    # system message.
    docs_content = "\n\n".join(
        tool_message.content for tool_message in tool_messages
    )
    system_message_content = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer "
        "the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep the "
        "answer concise."
        "\n\n"
        f"{docs_content}"
    )

    # Keep human/system turns and plain AI replies; drop tool messages and AI
    # messages that carry tool calls.
    conversation_messages = []
    for message in history:
        is_plain_ai_reply = message.type == "ai" and not message.tool_calls
        if message.type in ("human", "system") or is_plain_ai_reply:
            conversation_messages.append(message)

    prompt = [SystemMessage(system_message_content), *conversation_messages]

    # This call uses the plain model (no tools bound), unlike query_or_respond.
    return {"messages": [llm.invoke(prompt)]}

# Assemble the graph; MessagesState is the state schema passed to StateGraph.
graph_builder = StateGraph(MessagesState)
for node in (query_or_respond, tools, generate):
    graph_builder.add_node(node)

# Every run starts at query_or_respond.
graph_builder.set_entry_point("query_or_respond")

# tools_condition picks the next hop after query_or_respond; the mapping lists
# the two possible targets: END or the "tools" node.
graph_builder.add_conditional_edges(
    "query_or_respond",
    tools_condition,
    {END: END, "tools": "tools"},
)

# After the tools node, always generate the answer and then finish.
graph_builder.add_edge("tools", "generate")
graph_builder.add_edge("generate", END)

graph = graph_builder.compile()

# Run the two sample inputs in turn. For every streamed step, print the last
# entry of that step's "messages" list.
for input_message in ("Hello", "What is Task Decomposition?"):
    initial_state = {"messages": [{"role": "user", "content": input_message}]}
    for step in graph.stream(initial_state, stream_mode="values"):
        step["messages"][-1].pretty_print()

代码详解

导入必要的库

python 复制代码

import os
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph, MessagesState
from typing_extensions import List, TypedDict
from langchain_core.tools import tool
from langchain_core.messages import SystemMessage
from langgraph.graph import END
from langgraph.prebuilt import ToolNode, tools_condition

我们首先导入了需要的库，这些库提供了处理语言和存储向量的工具。

设置环境变量

python 复制代码

# Placeholder value: replace it with a real key before running. This assignment
# overwrites any OPENAI_API_KEY that is already set in the environment.
os.environ["OPENAI_API_KEY"] = 'your_openai_api_key'

设置环境变量OPENAI_API_KEY，用于OpenAI API的身份验证（运行前请替换为你自己的密钥）。

初始化嵌入模型和向量存储

python 复制代码

# Embedding model handed to the vector store below.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
# Vector store constructed with that embedding model.
vector_store = InMemoryVectorStore(embeddings)

我们使用OpenAI的嵌入模型来创建文本嵌入，并在内存中初始化一个向量存储，用于后续的向量操作。

python 复制代码

# Chat model, selected by the model name "gpt-4o-mini".
llm = ChatOpenAI(model="gpt-4o-mini")

初始化gpt-4o-mini语言模型，用于后续的AI对话生成。

加载和分割文档

python 复制代码

# Load one blog post, restricting parsing to elements with the listed classes.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    },
)
docs = loader.load()

# Split into chunks of up to 1000 characters with a 200-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)

加载指定网页的内容，并对页面内容进行解析和分割。分割后的文本块将用于嵌入和向量存储。

向量存储文档

python 复制代码

# Add the chunks to the vector store; the return value is discarded via "_".
_ = vector_store.add_documents(documents=all_splits)

将分割后的文档片段添加到向量存储中，以供后续检索操作。

定义检索工具

python 复制代码

@tool(response_format="content_and_artifact")
def retrieve(query: str):
    """Retrieve information related to a query."""
    # NOTE: the docstring above is kept verbatim because the @tool decorator
    # uses it as the tool description shown to the model.

    # Fetch the two chunks most similar to the query.
    top_chunks = vector_store.similarity_search(query, k=2)

    # Render every chunk as a "Source / Content" pair.
    rendered_chunks = []
    for chunk in top_chunks:
        rendered_chunks.append(
            f"Source: {chunk.metadata}\n" f"Content: {chunk.page_content}"
        )

    # First element: the text content; second: the raw documents (artifact).
    return "\n\n".join(rendered_chunks), top_chunks

定义一个检索工具函数retrieve，该函数可以根据查询在向量存储中进行相似性搜索，并返回检索到的文档内容。

定义步骤：生成工具调用或直接回复

python 复制代码

def query_or_respond(state: MessagesState):
    """Let the model either request the retrieve tool or answer directly.

    Args:
        state: Graph state; only its "messages" list is read.

    Returns:
        A state update whose "messages" list holds the model's single reply.
    """
    # Bind the retrieve tool, then send the conversation so far to the model.
    ai_reply = llm.bind_tools([retrieve]).invoke(state["messages"])
    return {"messages": [ai_reply]}

该函数根据当前的消息状态生成调用检索工具的请求或直接生成回复。

定义步骤：执行检索工具

python 复制代码

# Wrap the retrieve tool in a prebuilt ToolNode so it can be added to the graph
# as a node.
tools = ToolNode([retrieve])

定义一个执行检索工具的步骤。

定义步骤：生成回答

python 复制代码

def generate(state: MessagesState):
    """Answer the user's question from the retrieved context.

    Args:
        state: Graph state; only its "messages" list is read.

    Returns:
        A state update whose "messages" list holds the model's answer.
    """
    history = state["messages"]

    # Walk back from the end of the history to find where the trailing run of
    # tool messages begins; the slice keeps that run in its original order.
    first_tool_index = len(history)
    while first_tool_index > 0 and history[first_tool_index - 1].type == "tool":
        first_tool_index -= 1
    tool_messages = history[first_tool_index:]

    # The answering instructions followed by the retrieved text form the
    # system message.
    docs_content = "\n\n".join(
        tool_message.content for tool_message in tool_messages
    )
    system_message_content = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer "
        "the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep the "
        "answer concise."
        "\n\n"
        f"{docs_content}"
    )

    # Keep human/system turns and plain AI replies; drop tool messages and AI
    # messages that carry tool calls.
    conversation_messages = []
    for message in history:
        is_plain_ai_reply = message.type == "ai" and not message.tool_calls
        if message.type in ("human", "system") or is_plain_ai_reply:
            conversation_messages.append(message)

    prompt = [SystemMessage(system_message_content), *conversation_messages]

    # This call uses the plain model (no tools bound), unlike query_or_respond.
    return {"messages": [llm.invoke(prompt)]}

该函数生成最后的回答。它会首先收集最近的工具消息，并结合这些消息内容生成系统消息，然后与现有对话消息一起作为提示，最终调用LLM生成回复。

构建状态图

python 复制代码

# Assemble the graph; MessagesState is the state schema passed to StateGraph.
graph_builder = StateGraph(MessagesState)
for node in (query_or_respond, tools, generate):
    graph_builder.add_node(node)

# Every run starts at query_or_respond.
graph_builder.set_entry_point("query_or_respond")

# tools_condition picks the next hop after query_or_respond; the mapping lists
# the two possible targets: END or the "tools" node.
graph_builder.add_conditional_edges(
    "query_or_respond",
    tools_condition,
    {END: END, "tools": "tools"},
)

# After the tools node, always generate the answer and then finish.
graph_builder.add_edge("tools", "generate")
graph_builder.add_edge("generate", END)

graph = graph_builder.compile()

使用状态图构建器创建一个消息状态图，并添加节点和条件边，确定消息的流转逻辑。

与状态图进行交互

python 复制代码

# Run the two sample inputs in turn. For every streamed step, print the last
# entry of that step's "messages" list.
for input_message in ("Hello", "What is Task Decomposition?"):
    initial_state = {"messages": [{"role": "user", "content": input_message}]}
    for step in graph.stream(initial_state, stream_mode="values"):
        step["messages"][-1].pretty_print()

我们通过给定的输入消息与状态图进行互动，流式处理消息，并最终打印出生成的回复。

LLM消息抓取

以上整个过程中，我们都是通过LangChain API与LLM进行交互，对底层实际发送的请求细节一无所知。在某些场景下，我们还是需要探究这些具体细节，以便有一个全面的了解。下面我们看一下具体的发送内容，以上代码共涉及三次LLM交互：交互1对应输入“Hello”，LLM直接作答；交互2和交互3对应输入“What is Task Decomposition?”，LLM先发起retrieve工具调用，再基于检索结果生成最终答案。

交互1

提问

json 复制代码

{
  "messages": [
    [
      {
        "lc": 1,
        "type": "constructor",
        "id": [
          "langchain",
          "schema",
          "messages",
          "HumanMessage"
        ],
        "kwargs": {
          "content": "Hello",
          "type": "human",
          "id": "da95e909-50bb-4204-8aad-4181dcccbffb"
        }
      }
    ]
  ]
}

回答

json 复制代码

{
  "generations": [
    [
      {
        "text": "Hello! How can I assist you today?",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        },
        "type": "ChatGeneration",
        "message": {
          "lc": 1,
          "type": "constructor",
          "id": [
            "langchain",
            "schema",
            "messages",
            "AIMessage"
          ],
          "kwargs": {
            "content": "Hello! How can I assist you today?",
            "additional_kwargs": {
              "refusal": null
            },
            "response_metadata": {
              "token_usage": {
                "completion_tokens": 10,
                "prompt_tokens": 44,
                "total_tokens": 54,
                "completion_tokens_details": {
                  "accepted_prediction_tokens": 0,
                  "audio_tokens": 0,
                  "reasoning_tokens": 0,
                  "rejected_prediction_tokens": 0
                },
                "prompt_tokens_details": {
                  "audio_tokens": 0,
                  "cached_tokens": 0
                }
              },
              "model_name": "gpt-4o-mini-2024-07-18",
              "system_fingerprint": "fp_3de1288069",
              "finish_reason": "stop",
              "logprobs": null
            },
            "type": "ai",
            "id": "run-611efcc9-1fe5-47e4-83fc-f42623556d93-0",
            "usage_metadata": {
              "input_tokens": 44,
              "output_tokens": 10,
              "total_tokens": 54,
              "input_token_details": {
                "audio": 0,
                "cache_read": 0
              },
              "output_token_details": {
                "audio": 0,
                "reasoning": 0
              }
            },
            "tool_calls": [],
            "invalid_tool_calls": []
          }
        }
      }
    ]
  ],
  "llm_output": {
    "token_usage": {
      "completion_tokens": 10,
      "prompt_tokens": 44,
      "total_tokens": 54,
      "completion_tokens_details": {
        "accepted_prediction_tokens": 0,
        "audio_tokens": 0,
        "reasoning_tokens": 0,
        "rejected_prediction_tokens": 0
      },
      "prompt_tokens_details": {
        "audio_tokens": 0,
        "cached_tokens": 0
      }
    },
    "model_name": "gpt-4o-mini-2024-07-18",
    "system_fingerprint": "fp_3de1288069"
  },
  "run": null,
  "type": "LLMResult"
}

交互2

提问

json 复制代码

{
  "messages": [
    [
      {
        "lc": 1,
        "type": "constructor",
        "id": [
          "langchain",
          "schema",
          "messages",
          "HumanMessage"
        ],
        "kwargs": {
          "content": "What is Task Decomposition?",
          "type": "human",
          "id": "6a790b36-fafd-4ff3-b293-9bb3ac9f4157"
        }
      }
    ]
  ]
}

回答

json 复制代码

{
  "generations": [
    [
      {
        "text": "",
        "generation_info": {
          "finish_reason": "tool_calls",
          "logprobs": null
        },
        "type": "ChatGeneration",
        "message": {
          "lc": 1,
          "type": "constructor",
          "id": [
            "langchain",
            "schema",
            "messages",
            "AIMessage"
          ],
          "kwargs": {
            "content": "",
            "additional_kwargs": {
              "tool_calls": [
                {
                  "id": "call_RClqnmrtp2sbwIbb2jHm0VeQ",
                  "function": {
                    "arguments": "{\"query\":\"Task Decomposition\"}",
                    "name": "retrieve"
                  },
                  "type": "function"
                }
              ],
              "refusal": null
            },
            "response_metadata": {
              "token_usage": {
                "completion_tokens": 15,
                "prompt_tokens": 49,
                "total_tokens": 64,
                "completion_tokens_details": {
                  "accepted_prediction_tokens": 0,
                  "audio_tokens": 0,
                  "reasoning_tokens": 0,
                  "rejected_prediction_tokens": 0
                },
                "prompt_tokens_details": {
                  "audio_tokens": 0,
                  "cached_tokens": 0
                }
              },
              "model_name": "gpt-4o-mini-2024-07-18",
              "system_fingerprint": "fp_0705bf87c0",
              "finish_reason": "tool_calls",
              "logprobs": null
            },
            "type": "ai",
            "id": "run-056b1c5a-cd5c-40cf-940c-bbf98512615d-0",
            "tool_calls": [
              {
                "name": "retrieve",
                "args": {
                  "query": "Task Decomposition"
                },
                "id": "call_RClqnmrtp2sbwIbb2jHm0VeQ",
                "type": "tool_call"
              }
            ],
            "usage_metadata": {
              "input_tokens": 49,
              "output_tokens": 15,
              "total_tokens": 64,
              "input_token_details": {
                "audio": 0,
                "cache_read": 0
              },
              "output_token_details": {
                "audio": 0,
                "reasoning": 0
              }
            },
            "invalid_tool_calls": []
          }
        }
      }
    ]
  ],
  "llm_output": {
    "token_usage": {
      "completion_tokens": 15,
      "prompt_tokens": 49,
      "total_tokens": 64,
      "completion_tokens_details": {
        "accepted_prediction_tokens": 0,
        "audio_tokens": 0,
        "reasoning_tokens": 0,
        "rejected_prediction_tokens": 0
      },
      "prompt_tokens_details": {
        "audio_tokens": 0,
        "cached_tokens": 0
      }
    },
    "model_name": "gpt-4o-mini-2024-07-18",
    "system_fingerprint": "fp_0705bf87c0"
  },
  "run": null,
  "type": "LLMResult"
}

交互3

提问

json 复制代码

{
  "messages": [
    [
      {
        "lc": 1,
        "type": "constructor",
        "id": [
          "langchain",
          "schema",
          "messages",
          "SystemMessage"
        ],
        "kwargs": {
          "content": "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Use three sentences maximum and keep the answer concise.\n\nSource: {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}\nContent: Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to \"think step by step\" to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model's thinking process.\n\nSource: {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}\nContent: Tree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like \"Steps for XYZ.\\n1.\", \"What are the subgoals for achieving XYZ?\", (2) by using task-specific instructions; e.g. \"Write a story outline.\" for writing a novel, or (3) with human inputs.",
          "type": "system"
        }
      },
      {
        "lc": 1,
        "type": "constructor",
        "id": [
          "langchain",
          "schema",
          "messages",
          "HumanMessage"
        ],
        "kwargs": {
          "content": "What is Task Decomposition?",
          "type": "human",
          "id": "6a790b36-fafd-4ff3-b293-9bb3ac9f4157"
        }
      }
    ]
  ]
}

回答

json 复制代码

{
  "generations": [
    [
      {
        "text": "Task Decomposition is the process of breaking down a complicated task into smaller, more manageable steps. It often involves techniques like Chain of Thought (CoT), where the model is prompted to think step-by-step, enhancing performance on complex tasks. This approach helps to clarify the model's thinking process and makes it easier to tackle difficult problems.",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        },
        "type": "ChatGeneration",
        "message": {
          "lc": 1,
          "type": "constructor",
          "id": [
            "langchain",
            "schema",
            "messages",
            "AIMessage"
          ],
          "kwargs": {
            "content": "Task Decomposition is the process of breaking down a complicated task into smaller, more manageable steps. It often involves techniques like Chain of Thought (CoT), where the model is prompted to think step-by-step, enhancing performance on complex tasks. This approach helps to clarify the model's thinking process and makes it easier to tackle difficult problems.",
            "additional_kwargs": {
              "refusal": null
            },
            "response_metadata": {
              "token_usage": {
                "completion_tokens": 67,
                "prompt_tokens": 384,
                "total_tokens": 451,
                "completion_tokens_details": {
                  "accepted_prediction_tokens": 0,
                  "audio_tokens": 0,
                  "reasoning_tokens": 0,
                  "rejected_prediction_tokens": 0
                },
                "prompt_tokens_details": {
                  "audio_tokens": 0,
                  "cached_tokens": 0
                }
              },
              "model_name": "gpt-4o-mini-2024-07-18",
              "system_fingerprint": "fp_0705bf87c0",
              "finish_reason": "stop",
              "logprobs": null
            },
            "type": "ai",
            "id": "run-b3565b23-18d5-439d-a87b-f836ee281d91-0",
            "usage_metadata": {
              "input_tokens": 384,
              "output_tokens": 67,
              "total_tokens": 451,
              "input_token_details": {
                "audio": 0,
                "cache_read": 0
              },
              "output_token_details": {
                "audio": 0,
                "reasoning": 0
              }
            },
            "tool_calls": [],
            "invalid_tool_calls": []
          }
        }
      }
    ]
  ],
  "llm_output": {
    "token_usage": {
      "completion_tokens": 67,
      "prompt_tokens": 384,
      "total_tokens": 451,
      "completion_tokens_details": {
        "accepted_prediction_tokens": 0,
        "audio_tokens": 0,
        "reasoning_tokens": 0,
        "rejected_prediction_tokens": 0
      },
      "prompt_tokens_details": {
        "audio_tokens": 0,
        "cached_tokens": 0
      }
    },
    "model_name": "gpt-4o-mini-2024-07-18",
    "system_fingerprint": "fp_0705bf87c0"
  },
  "run": null,
  "type": "LLMResult"
}

总结

本文通过OpenAI语言模型和自定义检索工具，构建了一个智能问答系统。首先，从网络上加载和分割文档内容，并将其存储到向量数据库中。然后，定义一个检索工具，可以根据查询请求从数据库中寻找相关文档。使用状态图管理对话流程，根据不同条件，系统会决定是否调用检索工具或者直接生成回复。最终，通过与状态图交互，实现智能应答。这样一个系统大大增强了自动化问答的能力，通过结合嵌入模型和语言模型，能够处理更为复杂和多样化的用户查询。