AI Agent学习：MetaGPT项目之RAG

examples文件夹下分析rag_pipeline.py

用到的类RAGExample

全流程run_pipeline

来看函数run_pipeline

python 复制代码

class RAGExample:
    """Demonstrate an end-to-end RAG run built on ``SimpleEngine``."""

    def __init__(self, engine: SimpleEngine = None, use_llm_ranker: bool = True):
        # An engine may be injected; otherwise one is built on first access
        # through the ``engine`` property.
        self._use_llm_ranker = use_llm_ranker
        self._engine = engine

    @property
    def engine(self):
        """Return the engine, creating a default one the first time it is needed.

        The default engine is built from ``DOC_PATH`` with a FAISS retriever
        config and, when ``use_llm_ranker`` was set, an LLM ranker config.
        """
        if self._engine:
            return self._engine

        if self._use_llm_ranker:
            ranker_configs = [LLMRankerConfig()]
        else:
            ranker_configs = None

        self._engine = SimpleEngine.from_docs(
            input_files=[DOC_PATH],
            retriever_configs=[FAISSRetrieverConfig()],
            ranker_configs=ranker_configs,
        )
        return self._engine

    @engine.setter
    def engine(self, value: SimpleEngine):
        """Replace the engine used by this example."""
        self._engine = value

    @handle_exception
    async def run_pipeline(self, question=QUESTION, print_title=True):
        """Retrieve nodes for ``question``, then query the engine, printing both results.

        With the default engine (FAISS retriever plus LLM ranker) the output
        looks something like:

        Retrieve Result:
        0. Productivi..., 10.0
        1. I wrote cu..., 7.0
        2. I highly r..., 5.0

        Query Result:
        Passion, adaptability, open-mindedness, creativity, discipline, and empathy are key qualities to be a good writer.
        """
        if print_title:
            self._print_title("Run Pipeline")

        # Step 1: retrieval only, to show which nodes were found and their scores.
        retrieved = await self.engine.aretrieve(question)
        self._print_retrieve_result(retrieved)

        # Step 2: the full query, which returns the synthesized answer.
        response = await self.engine.aquery(question)
        self._print_query_result(response)

这个 engine 综合了文档读取、embedding 配置、indexing 索引、retrieving 召回、ranking 排序的工作流。

OK 我们从这个engine开始分析RAG的过程

文档读取

首先是文档读取

python 复制代码

class SimpleEngine(RetrieverQueryEngine):
    """A small query engine that wires a whole RAG workflow together.

    Document reading, embedding, indexing, retrieving and ranking are combined
    into a single, straightforward workflow, so a search engine can be set up
    quickly from a collection of documents.
    """

    def __init__(
        self,
        retriever: BaseRetriever,
        response_synthesizer: Optional[BaseSynthesizer] = None,
        node_postprocessors: Optional[list[BaseNodePostprocessor]] = None,
        callback_manager: Optional[CallbackManager] = None,
        transformations: Optional[list[TransformComponent]] = None,
    ) -> None:
        """Initialize the underlying query engine and remember the transformations.

        Args:
            retriever: Forwarded to the parent constructor.
            response_synthesizer: Forwarded to the parent constructor.
            node_postprocessors: Forwarded to the parent constructor.
            callback_manager: Forwarded to the parent constructor.
            transformations: Kept on the instance; the default transformations
                are used when this is empty or None.
        """
        super().__init__(
            retriever=retriever,
            response_synthesizer=response_synthesizer,
            node_postprocessors=node_postprocessors,
            callback_manager=callback_manager,
        )
        if not transformations:
            transformations = self._default_transformations()
        self._transformations = transformations
        self._filenames = set()

    @classmethod
    def from_docs(
        cls,
        input_dir: str = None,
        input_files: list[str] = None,
        transformations: Optional[list[TransformComponent]] = None,
        embed_model: BaseEmbedding = None,
        llm: LLM = None,
        retriever_configs: list[BaseRetrieverConfig] = None,
        ranker_configs: list[BaseRankerConfig] = None,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> "SimpleEngine":
        """Build an engine from files on disk.

        At least one of `input_dir` and `input_files` is required.

        Args:
            input_dir: Path to the directory.
            input_files: List of file paths to read (Optional; overrides input_dir, exclude).
            transformations: Parse documents to nodes. Default [SentenceSplitter].
            embed_model: Parse nodes to embedding. Must be supported by llama index. Default OpenAIEmbedding.
            llm: Must be supported by llama index. Default OpenAI.
            retriever_configs: Configuration for retrievers. If more than one config, will use SimpleHybridRetriever.
            ranker_configs: Configuration for rankers.
            fs: File system to use.

        Raises:
            ValueError: If neither `input_dir` nor `input_files` is given.
        """
        if not (input_dir or input_files):
            raise ValueError("Must provide either `input_dir` or `input_files`.")

        # Read the raw files into Document objects.
        reader = SimpleDirectoryReader(
            input_dir=input_dir,
            input_files=input_files,
            file_extractor=cls._get_file_extractor(),
            fs=fs,
        )
        docs = reader.load_data()
        cls._fix_document_metadata(docs)

        # Turn the documents into nodes, then hand everything to _from_nodes.
        if not transformations:
            transformations = cls._default_transformations()

        return cls._from_nodes(
            nodes=run_transformations(docs, transformations=transformations),
            transformations=transformations,
            embed_model=embed_model,
            llm=llm,
            retriever_configs=retriever_configs,
            ranker_configs=ranker_configs,
        )

这里用到 llama_index 库的 SimpleDirectoryReader（from llama_index.core import SimpleDirectoryReader）来读取文档。

关于这个 reader 的使用，可以参考 SimpleDirectoryReader | LlamaIndex Python Documentation。

然后把读取到的 documents 转换为 llama_index 自行定义的标准 node 格式。严格来说，这些 documents 本身就是 nodes，这里的 transform 主要是修改格式；不过从下面的输出可以看到，默认的 SentenceSplitter 还会把较长的文档（writer.txt）切分成多个相互重叠的 TextNode。run_transformations 用到的是这里的函数：https://developers.llamaindex.ai/python/framework-api-reference/ingestion/?h=run_transformations#llama_index.core.ingestion.pipeline.run_transformations

我们这里简单跑reader来看node表示

python 复制代码

from llama_index.core import SimpleDirectoryReader
# Directory holding the sample corpus read by this demo.
DOC_PATH = "/home/metagpt_test/MetaGPT/examples/data/rag/"
# Prompt hint defined alongside the example; it is not used in this snippet.
LLM_TIP = "If you not sure, just answer I don't know."

# Load every file under DOC_PATH into Document objects and print their repr.
reader = SimpleDirectoryReader(input_dir=DOC_PATH)
documents = reader.load_data()
print(documents)

这个文件夹下面有两个 txt 文件：travel.txt 和 writer.txt。看到的输出是这样的：

python 复制代码

[Document(id_='3eec2155-918e-4b24-8464-4a9057bda062', embedding=None, metadata={'file_path': '/home/metagpt_test/MetaGPT/examples/data/rag/travel.txt', 'file_name': '/home/metagpt_test/MetaGPT/examples/data/rag/travel.txt', 'file_type': 'text/plain', 'file_size': 20, 'creation_date': '2025-12-10', 'last_modified_date': '2025-12-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Bob likes traveling.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), 
 Document(id_='1606c97d-5bb8-4a10-ab7a-8ed7a677c8b6', embedding=None, metadata={'file_path': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_name': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_type': 'text/plain', 'file_size': 12318, 'creation_date': '2025-12-10', 'last_modified_date': '2025-12-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Open-Mindedness and curiosity are essential to writers\n\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')]

然后经过transform以后变成textnode

python 复制代码

[TextNode(id_='6e705d42-c2a2-4dfa-96ca-90919ef648f6', embedding=None, metadata={'file_path': '/home/metagpt_test/MetaGPT/examples/data/rag/travel.txt', 'file_name': '/home/metagpt_test/MetaGPT/examples/data/rag/travel.txt', 'file_type': 'text/plain', 'file_size': 20, 'creation_date': '2025-12-10', 'last_modified_date': '2025-12-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='c1d3f4dc-901e-48a5-a20d-f8799cb5d880', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/home/metagpt_test/MetaGPT/examples/data/rag/travel.txt', 'file_name': '/home/metagpt_test/MetaGPT/examples/data/rag/travel.txt', 'file_type': 'text/plain', 'file_size': 20, 'creation_date': '2025-12-10', 'last_modified_date': '2025-12-10'}, hash='1f6079d5b40377549681028ea0ebe0ed4e223d6e2a322d55ebd245006ffb573f'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='b9f234a0-a8fd-4b1a-aac9-bf9d2443900b', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='56d0fd37006cf2d9446a6c639aa75876581d219506413b590dec12960ca0a63d')}, text='Bob likes traveling.', start_char_idx=0, end_char_idx=20, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), 
 TextNode(id_='b9f234a0-a8fd-4b1a-aac9-bf9d2443900b', embedding=None, metadata={'file_path': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_name': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_type': 'text/plain', 'file_size': 12318, 'creation_date': '2025-12-10', 'last_modified_date': '2025-12-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='b840d481-e6f4-4f8f-a291-bb930e706310', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_name': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_type': 'text/plain', 'file_size': 12318, 'creation_date': '2025-12-10', 'last_modified_date': '2025-12-10'}, hash='e745b08cbc2b3a56cebadf5d44b64e21dabc0af99436bbf7946540d5857e76f5'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='6e705d42-c2a2-4dfa-96ca-90919ef648f6', node_type=<ObjectType.TEXT: '1'>, metadata={'file_path': '/home/metagpt_test/MetaGPT/examples/data/rag/travel.txt', 'file_name': '/home/metagpt_test/MetaGPT/examples/data/rag/travel.txt', 'file_type': 'text/plain', 'file_size': 20, 'creation_date': '2025-12-10', 'last_modified_date': '2025-12-10'}, hash='1f6079d5b40377549681028ea0ebe0ed4e223d6e2a322d55ebd245006ffb573f'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='df415a4c-411a-434e-b9f0-f3ea80aa0a3d', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='78e5050e315122646264c5e828bf4fb1dec557080685781389884ef52140815e')}, text='Productivity\nI think I am at least somewhat more productive than average, and people sometimes ask me for productivity tips.  
So I decided to just write them all down in one place.\n\nCompound growth gets discussed as a financial concept, but it works in careers as well, and it is magic.  A small productivity gain, compounded over 50 years, is worth a lot.  So it's worth figuring out how to optimize productivity. If you get 10% more done and 1% better every day compared to someone else, the compounded difference is massive. \n\nWhat you work on\n\nFamous writers have some essential qualities, creativity and discipline\n\nIt doesn't matter how fast you move if it's in a worthless direction.  Picking the right thing to work on is the most important element of productivity and usually almost ignored.  So think about it more!  Independent thought is hard but it's something you can get better at with practice.\n\nThe most impressive people I know have strong beliefs about the world, which is rare in the general population.  If you find yourself always agreeing with whomever you last spoke with, that's bad.  You will of course be wrong sometimes, but develop the confidence to stick with your convictions.  It will let you be courageous when you're right about something important that most people don't see.\n\nI make sure to leave enough time in my schedule to think about what to work on.  The best ways for me to do this are reading books, hanging out with interesting people, and spending time in nature.\n\nI've learned that I can't be very productive working on things I don't care about or don't like.  So I just try not to put myself in a position where I have to do them (by delegating, avoiding, or something else).  Stuff that you don't like is a painful drag on morale and momentum.\n\nBy the way, here is an important lesson about delegation: remember that everyone else is also most productive when they're doing what they like, and do what you'd want other people to do for you---try to figure out who likes (and is good at) doing what, and delegate that way.  
\n\nIf you find yourself not liking what you're doing for a long period of time, seriously consider a major job change.  Short-term burnout happens, but if it isn't resolved with some time off, maybe it's time to do something you're more interested in. \n\nI've been very fortunate to find work I like so much I'd do it for free, which makes it easy to be really productive.\n\nIt's important to learn that you can learn anything you want, and that you can get better quickly.  This feels like an unlikely miracle the first few times it happens, but eventually you learn to trust that you can do it.\n\nDoing great work usually requires colleagues of some sort.  Try to be around smart, productive, happy, and positive people that don't belittle your ambitions.  I love being around people who push me and inspire me to be better.  To the degree you able to, avoid the opposite kind of people---the cost of letting them take up your mental cycles is horrific. \n\nYou have to both pick the right problem and do the work.  There aren't many shortcuts.  If you're going to do something really important, you are very likely going to work both smart and hard.  The biggest prizes are heavily competed for.  This isn't true in every field (there are great mathematicians who never spend that many hours a week working) but it is in most.\n\n\ufeffPrioritization\n\nWriters have to work hard to be successful \n\nMy system has three key pillars: "Make sure to get the important shit done", "Don't waste time on stupid shit", and "make a lot of lists".\n\nI highly recommend using lists.  I make lists of what I want to accomplish each year, each month, and each day.  Lists are very focusing, and they help me with multitasking because I don't have to keep as much in my head.  If I'm not in the mood for some particular task, I can always find something else I'm excited to do.\n\nI prefer lists written down on paper.  It's easy to add and remove tasks.  
I can access them during meetings without feeling rude.  I re-transcribe lists frequently, which forces me to think about everything on the list and gives me an opportunity to add and remove items.\n\nI don't bother with categorization or trying to size tasks or anything like that (the most I do is put a star next to really important items).  \n\nI try to prioritize in a way that generates momentum.  The more I get done, the better I feel, and then the more I get done.  I like to start and end each day with something I can really make progress on.', start_char_idx=0, end_char_idx=4545, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
   TextNode(id_='df415a4c-411a-434e-b9f0-f3ea80aa0a3d', embedding=None, metadata={'file_path': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_name': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_type': 'text/plain', 'file_size': 12318, 'creation_date': '2025-12-10', 'last_modified_date': '2025-12-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='b840d481-e6f4-4f8f-a291-bb930e706310', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_name': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_type': 'text/plain', 'file_size': 12318, 'creation_date': '2025-12-10', 'last_modified_date': '2025-12-10'}, hash='e745b08cbc2b3a56cebadf5d44b64e21dabc0af99436bbf7946540d5857e76f5'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='b9f234a0-a8fd-4b1a-aac9-bf9d2443900b', node_type=<ObjectType.TEXT: '1'>, metadata={'file_path': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_name': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_type': 'text/plain', 'file_size': 12318, 'creation_date': '2025-12-10', 'last_modified_date': '2025-12-10'}, hash='3e0f5d5b2d4667d679108191ea6f22965422f551fe2781abb8d1437b60cc5564'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='b04b01ba-e0e8-4a49-bdb3-2eeb54d9f3fb', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='0eb17c3b636582b79998ab00b9132d4c4b94cab9f0d8160aa1b4031c4531dc12')}, text='Lists are very focusing, and they help me with multitasking because I don't have to keep as much in my head.  
If I'm not in the mood for some particular task, I can always find something else I'm excited to do.\n\nI prefer lists written down on paper.  It's easy to add and remove tasks.  I can access them during meetings without feeling rude.  I re-transcribe lists frequently, which forces me to think about everything on the list and gives me an opportunity to add and remove items.\n\nI don't bother with categorization or trying to size tasks or anything like that (the most I do is put a star next to really important items).  \n\nI try to prioritize in a way that generates momentum.  The more I get done, the better I feel, and then the more I get done.  I like to start and end each day with something I can really make progress on.\n\nI am relentless about getting my most important projects done---I've found that if I really want something to happen and I push hard enough, it usually happens. \n\nI try to be ruthless about saying no to stuff, and doing non-critical things in the quickest way possible.  I probably take this too far---for example, I am almost sure I am terse to the point of rudeness when replying to emails.\n\nPassion and adaptability are key qualities to writers\n\nI generally try to avoid meetings and conferences as I find the time cost to be huge---I get the most value out of time in my office.  However, it is critical that you keep enough space in your schedule to allow for chance encounters and exposure to new people and ideas.  Having an open network is valuable; though probably 90% of the random meetings I take are a waste of time, the other 10% really make up for it.\n\nI find most meetings are best scheduled for 15-20 minutes, or 2 hours.  The default of 1 hour is usually wrong, and leads to a lot of wasted time.\n\nI have different times of day I try to use for different kinds of work.  The first few hours of the morning are definitely my most productive time of the day, so I don't let anyone schedule anything then.  
I try to do meetings in the afternoon.  I take a break, or switch tasks, whenever I feel my attention starting to fade. \n\nI don't think most people value their time enough---I am surprised by the number of people I know who make $100 an hour and yet will spend a couple of hours doing something they don't want to do to save $20.\n\nAlso, don't fall into the trap of productivity porn---chasing productivity for its own sake isn't helpful.  Many people spend too much time thinking about how to perfectly optimize their system, and not nearly enough asking if they're working on the right problems.  It doesn't matter what system you use or if you squeeze out every second if you're working on the wrong thing.\n\nThe right goal is to allocate your year optimally, not your day.\n\nPhysical factors\n\nVery likely what is optimal for me won't be optimal for you.  You'll have to experiment to find out what works best for your body.  It's definitely worth doing---it helps in all aspects of life, and you'll feel a lot better and happier overall.\n\nIt probably took a little bit of my time every week for a few years to arrive at what works best for me, but my sense is if I do a good job at all the below I'm at least 1.5x more productive than if not.\n\nSleep seems to be the most important physical factor in productivity for me.  Some sort of sleep tracker to figure out how to sleep best is helpful.  I've found the only thing I'm consistent with are in the set-it-and-forget-it category, and I really like the Emfit QS+Active.\n\nI like a cold, dark, quiet room, and a great mattress (I resisted spending a bunch of money on a great mattress for years, which was stupid---it makes a huge difference to my sleep quality.  I love this one).  Not eating a lot in the few hours before sleep helps.  
Not drinking alcohol helps a lot, though I'm not willing to do that all the time.\n\nI use a Chili Pad to be cold while I sleep if I can't get the room cold enough, which is great but loud (I set it up to have the cooler unit outside my room).\n\nWhen traveling, I use an eye mask and ear plugs.\n\nWriters usually have empathy to write good books.\n\nThis is likely to be controversial, but I take a low dose of sleeping pills (like a third of a normal dose) or a very low dose of cannabis whenever I can't sleep.  I am a bad sleeper in general, and a particularly bad sleeper when I travel.', start_char_idx=3709, end_char_idx=8126, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), 
   TextNode(id_='b04b01ba-e0e8-4a49-bdb3-2eeb54d9f3fb', embedding=None, metadata={'file_path': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_name': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_type': 'text/plain', 'file_size': 12318, 'creation_date': '2025-12-10', 'last_modified_date': '2025-12-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='b840d481-e6f4-4f8f-a291-bb930e706310', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_name': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_type': 'text/plain', 'file_size': 12318, 'creation_date': '2025-12-10', 'last_modified_date': '2025-12-10'}, hash='e745b08cbc2b3a56cebadf5d44b64e21dabc0af99436bbf7946540d5857e76f5'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='df415a4c-411a-434e-b9f0-f3ea80aa0a3d', node_type=<ObjectType.TEXT: '1'>, metadata={'file_path': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_name': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_type': 'text/plain', 'file_size': 12318, 'creation_date': '2025-12-10', 'last_modified_date': '2025-12-10'}, hash='513aaabda2c7e8bfd139b913c577e8ca428c741ecee4f092b20073e5f43901f1'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='82fb358d-1ca4-425f-8382-df94aebadcbb', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='70fd3d8105c24dc4afb6e34e5343db64756f608707e461ade3bb894fe0600478')}, text='I like a cold, dark, quiet room, and a great mattress (I resisted spending a bunch of money on a great mattress for years, which was stupid---it makes a huge difference to my sleep quality.  I love this one).  
Not eating a lot in the few hours before sleep helps.  Not drinking alcohol helps a lot, though I'm not willing to do that all the time.\n\nI use a Chili Pad to be cold while I sleep if I can't get the room cold enough, which is great but loud (I set it up to have the cooler unit outside my room).\n\nWhen traveling, I use an eye mask and ear plugs.\n\nWriters usually have empathy to write good books.\n\nThis is likely to be controversial, but I take a low dose of sleeping pills (like a third of a normal dose) or a very low dose of cannabis whenever I can't sleep.  I am a bad sleeper in general, and a particularly bad sleeper when I travel.  It likely has tradeoffs, but so does not sleeping well.  If you can already sleep well, I wouldn't recommend this.\n\nI use a full spectrum LED light most mornings for about 10-15 minutes while I catch up on email.  It's great---if you try nothing else in here, this is the thing I'd try.  It's a ridiculous gain for me.  I like this one, and it's easy to travel with.\n\nExercise is probably the second most important physical factor.  I tried a number of different exercise programs for a few months each and the one that seemed best was lifting heavy weights 3x a week for an hour, and high intensity interval training occasionally.  In addition to productivity gains, this is also the exercise program that makes me feel the best overall.  \n\nThe third area is nutrition.  I very rarely eat breakfast, so I get about 15 hours of fasting most days (except an espresso when I wake up).  I know this is contrary to most advice, and I suspect it's not optimal for most people, but it definitely works well for me.\n\nEating lots of sugar is the thing that makes me feel the worst and that I try hardest to avoid.  I also try to avoid foods that aggravate my digestion or spike up inflammation (for example, very spicy foods).  
I don't have much willpower when it comes to sweet things, so I mostly just try to keep junk food out of the house.\n\nI have one big shot of espresso immediately when I wake up and one after lunch.  I assume this is about 200mg total of caffeine per day.  I tried a few other configurations; this was the one that worked by far the best.  I otherwise aggressively avoid stimulants, but I will have more coffee if I'm super tired and really need to get something done.\n\nIf a writer want to be super, then should include innovative thinking.\n\nI'm vegetarian and have been since I was a kid, and I supplement methyl B-12, Omega-3, Iron, and Vitamin D-3.  I got to this list with a year or so of quarterly blood tests; it's worked for me ever since (I re-test maybe every year and a half or so).  There are many doctors who will happily work with you on a super comprehensive blood test (and services like WellnessFX).  I also go out of my way to drink a lot of protein shakes, which I hate and I wouldn't do if I weren't vegetarian.\n\n\ufeffOther stuff\n\nHere's what I like in a workspace: natural light, quiet, knowing that I won't be interrupted if I don't want to be, long blocks of time, and being comfortable and relaxed (I've got a beautiful desk with a couple of 4k monitors on it in my office, but I spend almost all my time on my couch with my laptop).\n\nI wrote custom software for the annoying things I have to do frequently, which is great.  I also made an effort to learn to type really fast and the keyboard shortcuts that help with my workflow.\n\nLike most people, I sometimes go through periods of a week or two where I just have no motivation to do anything (I suspect it may have something to do with nutrition).  This sucks and always seems to happen at inconvenient times.  I have not figured out what to do about it besides wait for the fog to lift, and to trust that eventually it always does.  
And I generally try to avoid people and situations that put me in bad moods, which is good advice whether you care about productivity or not.\n\nIn general, I think it's good to overcommit a little bit.  I find that I generally get done what I take on, and if I have a little bit too much to do it makes me more efficient at everything, which is a way to train to avoid distractions (a great habit to build!).', start_char_idx=7279, end_char_idx=11657, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), 
   TextNode(id_='82fb358d-1ca4-425f-8382-df94aebadcbb', embedding=None, metadata={'file_path': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_name': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_type': 'text/plain', 'file_size': 12318, 'creation_date': '2025-12-10', 'last_modified_date': '2025-12-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='b840d481-e6f4-4f8f-a291-bb930e706310', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_name': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_type': 'text/plain', 'file_size': 12318, 'creation_date': '2025-12-10', 'last_modified_date': '2025-12-10'}, hash='e745b08cbc2b3a56cebadf5d44b64e21dabc0af99436bbf7946540d5857e76f5'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='b04b01ba-e0e8-4a49-bdb3-2eeb54d9f3fb', node_type=<ObjectType.TEXT: '1'>, metadata={'file_path': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_name': '/home/metagpt_test/MetaGPT/examples/data/rag/writer.txt', 'file_type': 'text/plain', 'file_size': 12318, 'creation_date': '2025-12-10', 'last_modified_date': '2025-12-10'}, hash='44381bdb8c4e4c940f37c2128acc4bc8e0bb13a48a327d599582aca158a2d8e9')}, text='I also made an effort to learn to type really fast and the keyboard shortcuts that help with my workflow.\n\nLike most people, I sometimes go through periods of a week or two where I just have no motivation to do anything (I suspect it may have something to do with nutrition).  This sucks and always seems to happen at inconvenient times.  
I have not figured out what to do about it besides wait for the fog to lift, and to trust that eventually it always does.  And I generally try to avoid people and situations that put me in bad moods, which is good advice whether you care about productivity or not.\n\nIn general, I think it's good to overcommit a little bit.  I find that I generally get done what I take on, and if I have a little bit too much to do it makes me more efficient at everything, which is a way to train to avoid distractions (a great habit to build!).  However, overcommitting a lot is disastrous.\n\nDon't neglect your family and friends for the sake of productivity---that's a very stupid tradeoff (and very likely a net productivity loss, because you'll be less happy).  Don't neglect doing things you love or that clear your head either.\n\nFinally, to repeat one more time: productivity in the wrong direction isn't worth anything at all.  Think more about what to work on.\n\nOpen-Mindedness and curiosity are essential to writers', start_char_idx=10788, end_char_idx=12134, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')]

得到这些TextNode后传入到_from_nodes方法

python 复制代码

class SimpleEngine(RetrieverQueryEngine):

...

    @classmethod
    def _from_nodes(
        cls,
        nodes: list[BaseNode],
        transformations: Optional[list[TransformComponent]] = None,
        embed_model: BaseEmbedding = None,
        llm: LLM = None,
        retriever_configs: list[BaseRetrieverConfig] = None,
        ranker_configs: list[BaseRankerConfig] = None,
    ) -> "SimpleEngine":
        """Assemble an engine from nodes that have already been parsed.

        Resolves the embedding model and the LLM, builds the retriever and the
        rankers from their configs, and passes them to the constructor together
        with a response synthesizer created for the same LLM.
        """
        embed_model = cls._resolve_embed_model(embed_model, retriever_configs)
        if not llm:
            llm = get_rag_llm()

        engine_kwargs = dict(
            retriever=get_retriever(configs=retriever_configs, nodes=nodes, embed_model=embed_model),
            # Rankers are plugged in as node postprocessors; the original
            # code notes that get_rankers defaults to [].
            node_postprocessors=get_rankers(configs=ranker_configs, llm=llm),
            response_synthesizer=get_response_synthesizer(llm=llm),
            transformations=transformations,
        )
        return cls(**engine_kwargs)

OK，这里就要开始embedding retriever ranker三件套的配置

embedding

先看embedding

入口是 metagpt/rag/engines/simple.py 的 _resolve_embed_model(embed_model, retriever_configs)。

这个函数会指路到 metagpt/rag/factories/embedding.py 的 get_rag_embedding 函数：

python 复制代码

def get_rag_embedding(key: EmbeddingType = None, config: Optional[Config] = None):
    """Create a RAG embedding through a freshly built ``RAGEmbeddingFactory``.

    Args:
        key: Embedding type forwarded to the factory's ``get_rag_embedding``.
        config: Configuration the factory is constructed with.

    Returns:
        Whatever ``RAGEmbeddingFactory.get_rag_embedding`` returns for ``key``.
    """
    factory = RAGEmbeddingFactory(config=config)
    return factory.get_rag_embedding(key)

python 复制代码

class RAGEmbeddingFactory(GenericFactory):
    """Factory that turns MetaGPT's embedding config into a LlamaIndex embedding."""

    def __init__(self, config: Optional[Config] = None):
        """Register one creator per supported key and remember the config.

        Args:
            config: Configuration to read embedding/LLM settings from; a falsy
                value falls back to ``Config.default()``.
        """
        embedding_creators = {
            EmbeddingType.OPENAI: self._create_openai,
            EmbeddingType.AZURE: self._create_azure,
            EmbeddingType.GEMINI: self._create_gemini,
            EmbeddingType.OLLAMA: self._create_ollama,
        }
        # For backward compatibility, LLM api types are accepted as keys too.
        legacy_creators = {
            LLMType.OPENAI: self._create_openai,
            LLMType.AZURE: self._create_azure,
        }
        super().__init__({**embedding_creators, **legacy_creators})
        self.config = config or Config.default()

    def get_rag_embedding(self, key: EmbeddingType = None) -> BaseEmbedding:
        """Return the embedding registered for ``key``.

        When ``key`` is falsy, the type is derived from the config through
        ``_resolve_embedding_type``.
        """
        if not key:
            key = self._resolve_embedding_type()
        return super().get_instance(key)

    def _resolve_embedding_type(self) -> EmbeddingType | LLMType:
        """Pick the embedding type from the config.

        The embedding section's ``api_type`` wins when it is set. Otherwise,
        for backward compatibility, the LLM ``api_type`` is used if it is
        OPENAI or AZURE.

        Raises:
            TypeError: If neither source yields a usable type.
        """
        embedding_api_type = self.config.embedding.api_type
        if embedding_api_type:
            return embedding_api_type

        llm_api_type = self.config.llm.api_type
        if llm_api_type in (LLMType.OPENAI, LLMType.AZURE):
            return llm_api_type

        raise TypeError("To use RAG, please set your embedding in config2.yaml.")

    def _create_ollama(self) -> "OllamaEmbedding":
        """Build an ``OllamaEmbedding`` from the embedding section of the config."""
        # Imported inside the method, so the package is only needed when this creator runs.
        from llama_index.embeddings.ollama import OllamaEmbedding

        ollama_kwargs = {"base_url": self.config.embedding.base_url}
        self._try_set_model_and_batch_size(ollama_kwargs)
        return OllamaEmbedding(**ollama_kwargs)

    def _try_set_model_and_batch_size(self, params: dict):
        """Copy the model name and batch size into ``params`` when configured.

        ``params`` is mutated in place; falsy config values are skipped so the
        corresponding keys are simply left out.
        """
        embedding_cfg = self.config.embedding

        if model := embedding_cfg.model:
            params["model_name"] = model

        if batch_size := embedding_cfg.embed_batch_size:
            params["embed_batch_size"] = batch_size

（embedding 配置为 ollama 时）到这里就返回一个 OllamaEmbedding 实例

retriever

到retriever

python 复制代码

# Excerpt from SimpleEngine._from_nodes: the retriever is built from the
# retriever configs, the nodes and the resolved embedding model.
retriever = get_retriever(configs=retriever_configs, nodes=nodes, embed_model=embed_model)

指路到metagpt/rag/factories/retriever.py

python 复制代码

from metagpt.rag.retrievers.faiss_retriever import FAISSRetriever
class RetrieverFactory(ConfigBasedFactory):
    """Factory that maps each retriever config type to the method that builds it."""

    def __init__(self):
        """Register the creator method for every supported retriever config."""
        config_to_creator = {
            FAISSRetrieverConfig: self._create_faiss_retriever,
            BM25RetrieverConfig: self._create_bm25_retriever,
            ChromaRetrieverConfig: self._create_chroma_retriever,
            # Both Elasticsearch config types share the same creator.
            ElasticsearchRetrieverConfig: self._create_es_retriever,
            ElasticsearchKeywordRetrieverConfig: self._create_es_retriever,
        }
        super().__init__(config_to_creator)

    def get_retriever(self, configs: list[BaseRetrieverConfig] = None, **kwargs) -> RAGRetriever:
        """Build the retriever described by ``configs``.

        With no configs, the default retriever is returned. With exactly one
        resulting retriever, it is returned directly. With several, they are
        combined in a ``SimpleHybridRetriever``.
        """
        if not configs:
            return self._create_default(**kwargs)

        built = super().get_instances(configs, **kwargs)
        if len(built) > 1:
            return SimpleHybridRetriever(*built)
        return built[0]

    def _create_default(self, **kwargs) -> RAGRetriever:
        """Return ``as_retriever()`` of an extracted index, or of a default index if none is found."""
        index = self._extract_index(None, **kwargs)
        if not index:
            index = self._build_default_index(**kwargs)

        return index.as_retriever()

    def _create_faiss_retriever(self, config: FAISSRetrieverConfig, **kwargs) -> FAISSRetriever:
        """Attach a FAISS index to ``config`` and build the retriever from its dumped fields."""
        # The config object is mutated: the built index is stored on it before dumping.
        config.index = self._build_faiss_index(config, **kwargs)
        retriever_kwargs = config.model_dump()

        return FAISSRetriever(**retriever_kwargs)

这里 rag_pipeline.py 通过 FAISSRetrieverConfig 指定使用 FAISSRetriever：先用文件内容生成 index，再把 index 传给 FAISSRetriever

FAISSRetriever 来自 from metagpt.rag.retrievers.faiss_retriever import FAISSRetriever

python 复制代码

"""FAISS retriever."""

from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.schema import BaseNode


class FAISSRetriever(VectorIndexRetriever):
    """Vector index retriever extended with node insertion and persistence."""

    def add_nodes(self, nodes: list[BaseNode], **kwargs) -> None:
        """Insert ``nodes`` into the underlying index, forwarding ``kwargs``."""
        index = self._index
        index.insert_nodes(nodes, **kwargs)

    def persist(self, persist_dir: str, **kwargs) -> None:
        """Persist the index's storage context to ``persist_dir``.

        ``kwargs`` is accepted but not forwarded to the storage context.
        """
        storage_context = self._index.storage_context
        storage_context.persist(persist_dir)

除了 FAISSRetrieverConfig，工厂里还注册了 BM25RetrieverConfig、ChromaRetrieverConfig、ElasticsearchRetrieverConfig、ElasticsearchKeywordRetrieverConfig，对应其他类型的 retriever

ranker

ranker 同理，这里用到的是 llama_index 预置的 LLMRerank（from llama_index.core.postprocessor import LLMRerank）

python 复制代码

from llama_index.core.postprocessor import LLMRerank
class RankerFactory(ConfigBasedFactory):
    """Factory that maps each ranker config type to the method that builds it."""

    def __init__(self):
        """Register the creator method for every supported ranker config."""
        config_to_creator = {
            LLMRankerConfig: self._create_llm_ranker,
            ColbertRerankConfig: self._create_colbert_ranker,
            ObjectRankerConfig: self._create_object_ranker,
            CohereRerankConfig: self._create_cohere_rerank,
            BGERerankConfig: self._create_bge_rerank,
        }
        super().__init__(config_to_creator)

    def get_rankers(self, configs: list[BaseRankerConfig] = None, **kwargs) -> list[BaseNodePostprocessor]:
        """Build the rankers described by ``configs``.

        Returns an empty list when ``configs`` is falsy; otherwise delegates
        to the parent's ``get_instances``.
        """
        return super().get_instances(configs, **kwargs) if configs else []

    def _create_llm_ranker(self, config: LLMRankerConfig, **kwargs) -> LLMRerank:
        """Attach the extracted LLM to ``config`` and build an ``LLMRerank`` from its dumped fields."""
        # The config object is mutated: the LLM is stored on it before dumping.
        config.llm = self._extract_llm(config, **kwargs)
        rerank_kwargs = config.model_dump()

        return LLMRerank(**rerank_kwargs)

还有 ColbertRerank，ObjectRanker，CohereRerank， BGERerank

RetrieverQueryEngine

回到 metagpt/rag/engines/simple.py 中 _from_nodes 的 return cls(...)

cls 即 SimpleEngine，它继承自 RetrieverQueryEngine，所以这里返回的就是一个 RetrieverQueryEngine（子类 SimpleEngine）的实例

class SimpleEngine(RetrieverQueryEngine):
    ...
        return cls(
            retriever=retriever,
            node_postprocessors=rankers,
            response_synthesizer=get_response_synthesizer(llm=llm),
            transformations=transformations,
        )

到这里读取文档 embedding index retriever reranker内容配齐

python 复制代码

@handle_exception
async def run_pipeline(self, question=QUESTION, print_title=True):
    """Run the RAG pipeline once: retrieve, print the hits, query, print the answer.

    The example uses the FAISS retriever and the LLM ranker and prints
    something like:

    Retrieve Result:
    0. Productivi..., 10.0
    1. I wrote cu..., 7.0
    2. I highly r..., 5.0

    Query Result:
    Passion, adaptability, open-mindedness, creativity, discipline, and empathy are key qualities to be a good writer.

    Args:
        question: Query passed to both ``engine.aretrieve`` and ``engine.aquery``.
        print_title: When true, a "Run Pipeline" title is printed first.
    """
    if print_title:
        self._print_title("Run Pipeline")

    # Retrieval and querying are separate engine calls with the same question.
    nodes = await self.engine.aretrieve(question)
    self._print_retrieve_result(nodes)

    answer = await self.engine.aquery(question)
    self._print_query_result(answer)

到这里self.engine.aretrieve

python 复制代码

    
class SimpleEngine(RetrieverQueryEngine):
    """Query engine whose ``aretrieve`` also accepts a plain string query."""

    async def aretrieve(self, query: QueryType) -> list[NodeWithScore]:
        """Retrieve nodes for ``query``, wrapping a ``str`` in a ``QueryBundle`` first.

        After the parent's ``aretrieve`` returns, object nodes are
        reconstructed in place by ``_try_reconstruct_obj``.
        """
        if isinstance(query, str):
            bundle = QueryBundle(query)
        else:
            bundle = query

        retrieved = await super().aretrieve(bundle)
        self._try_reconstruct_obj(retrieved)
        return retrieved

    @staticmethod
    def _try_reconstruct_obj(nodes: list[NodeWithScore]):
        """Rebuild the original object for every node flagged with ``is_obj``.

        The class is imported from ``obj_cls_name`` / ``obj_mod_name``, built
        from the JSON in ``obj_json``, and stored under ``metadata["obj"]``.
        Nodes without a truthy ``is_obj`` flag are left untouched.
        """
        for node in nodes:
            meta = node.metadata
            if not meta.get("is_obj", False):
                continue

            obj_cls = import_class(meta["obj_cls_name"], meta["obj_mod_name"])
            meta["obj"] = obj_cls(**json.loads(meta["obj_json"]))

super().aretrieve 调用的是父类 RetrieverQueryEngine 的 aretrieve

python 复制代码

class RetrieverQueryEngine(BaseQueryEngine):
    """
    Retriever query engine.

    Args:
        retriever (BaseRetriever): A retriever object.
        response_synthesizer (Optional[BaseSynthesizer]): A BaseSynthesizer
            object.
        callback_manager (Optional[CallbackManager]): A callback manager.

    """

    def __init__(
        self,
        retriever: BaseRetriever,
        response_synthesizer: Optional[BaseSynthesizer] = None,
        node_postprocessors: Optional[List[BaseNodePostprocessor]] = None,
        callback_manager: Optional[CallbackManager] = None,
    ) -> None:
        """Store the retriever, synthesizer and postprocessors, sharing one callback manager.

        Args:
            retriever: Retriever object kept on the engine.
            response_synthesizer: Synthesizer to use; when falsy, one is created
                with ``get_response_synthesizer`` from ``Settings.llm``.
            node_postprocessors: Node postprocessors; a falsy value becomes an
                empty list.
            callback_manager: Callback manager to propagate; when falsy, the
                synthesizer's own callback manager is used instead.
        """
        self._retriever = retriever

        if not response_synthesizer:
            response_synthesizer = get_response_synthesizer(
                llm=Settings.llm,
                callback_manager=callback_manager or Settings.callback_manager,
            )
        self._response_synthesizer = response_synthesizer

        self._node_postprocessors = node_postprocessors or []

        if not callback_manager:
            callback_manager = self._response_synthesizer.callback_manager

        # Every postprocessor and the base class get the same callback manager.
        for postprocessor in self._node_postprocessors:
            postprocessor.callback_manager = callback_manager
        super().__init__(callback_manager=callback_manager)

    def _get_prompt_modules(self) -> PromptMixinType:
        """Return the prompt sub-modules: the response synthesizer, keyed by name."""
        prompt_modules = {"response_synthesizer": self._response_synthesizer}
        return prompt_modules

    @classmethod
    def from_args(
        cls,
        retriever: BaseRetriever,
        llm: Optional[LLM] = None,
        response_synthesizer: Optional[BaseSynthesizer] = None,
        node_postprocessors: Optional[List[BaseNodePostprocessor]] = None,
        callback_manager: Optional[CallbackManager] = None,
        # response synthesizer args
        response_mode: ResponseMode = ResponseMode.COMPACT,
        text_qa_template: Optional[BasePromptTemplate] = None,
        refine_template: Optional[BasePromptTemplate] = None,
        summary_template: Optional[BasePromptTemplate] = None,
        simple_template: Optional[BasePromptTemplate] = None,
        output_cls: Optional[Type[BaseModel]] = None,
        use_async: bool = False,
        streaming: bool = False,
        verbose: bool = False,
        **kwargs: Any,
    ) -> "RetrieverQueryEngine":
        """Build a RetrieverQueryEngine, creating a response synthesizer when none is given.

        Args:
            retriever (BaseRetriever): A retriever object.
            llm (Optional[LLM]): LLM for the synthesizer; falls back to
                ``Settings.llm`` when falsy.
            response_synthesizer (Optional[BaseSynthesizer]): Synthesizer to use
                as-is. When falsy, one is built from the synthesizer arguments below.
            node_postprocessors (Optional[List[BaseNodePostprocessor]]): A list of
                node postprocessors.
            callback_manager (Optional[CallbackManager]): A callback manager; falls
                back to ``Settings.callback_manager`` when falsy.
            response_mode (ResponseMode): A ResponseMode object.
            text_qa_template (Optional[BasePromptTemplate]): A BasePromptTemplate
                object.
            refine_template (Optional[BasePromptTemplate]): A BasePromptTemplate object.
            summary_template (Optional[BasePromptTemplate]): A BasePromptTemplate object.
            simple_template (Optional[BasePromptTemplate]): A BasePromptTemplate object.
            output_cls (Optional[Type[BaseModel]]): The pydantic model to pass to the
                response synthesizer.
            use_async (bool): Whether to use async.
            streaming (bool): Whether to use streaming.
            verbose (bool): Whether to print verbose output.
            **kwargs (Any): Accepted but not used by this method.

        Returns:
            RetrieverQueryEngine: A new instance of ``cls``.
        """
        # The LLM is resolved first, even when a synthesizer was supplied.
        if not llm:
            llm = Settings.llm

        if not response_synthesizer:
            response_synthesizer = get_response_synthesizer(
                llm=llm,
                text_qa_template=text_qa_template,
                refine_template=refine_template,
                summary_template=summary_template,
                simple_template=simple_template,
                response_mode=response_mode,
                output_cls=output_cls,
                use_async=use_async,
                streaming=streaming,
                verbose=verbose,
            )

        if not callback_manager:
            callback_manager = Settings.callback_manager

        return cls(
            retriever=retriever,
            response_synthesizer=response_synthesizer,
            callback_manager=callback_manager,
            node_postprocessors=node_postprocessors,
        )

    .....
    def retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes for ``query_bundle`` and run them through the node postprocessors."""
        retrieved = self._retriever.retrieve(query_bundle)
        postprocessed = self._apply_node_postprocessors(retrieved, query_bundle=query_bundle)
        return postprocessed

    async def aretrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Async variant: retrieve nodes, then apply the node postprocessors asynchronously."""
        retrieved = await self._retriever.aretrieve(query_bundle)
        postprocessed = await self._async_apply_node_postprocessors(retrieved, query_bundle=query_bundle)
        return postprocessed

最后打印即可。

过程

所以整体分为两个过程

1. 构建：配置 document reader、embedding model、retriever、reranker。
2. 使用：通过 RetrieverQueryEngine（即 SimpleEngine 本身）完成检索比对（aretrieve）和问答（aquery）。

组件

用到的组件metagpt/rag下面

engines 存放基于 RetrieverQueryEngine 的 SimpleEngine

factories 就是各种组件的初始化

parsers 用于文档读取

prompts 存取模板

rankers和retrievers存取各种类型的组件

schema.py设置参数

interface.py 没什么用存一些util函数