项目中遇到各种数据资源,想要加载进 LangChain 来构建本地知识 AI 系统,该怎么加载对应的文件格式呢?一起研究下。
引入Langchain
from langchain.document_loaders import UnstructuredWordDocumentLoader,PyPDFium2Loader,DirectoryLoader,PyPDFLoader,TextLoader
import os
pdf文件加载
def load_pdf(directory_path):
    """Load every ``.pdf`` file located directly inside ``directory_path``.

    Each matching file is read with ``PyPDFium2Loader``. The file name and the
    loader object are printed as progress output.

    Args:
        directory_path: Directory to scan. Only its direct entries are
            examined; sub-directories are not walked.

    Returns:
        A list with one entry per matching file, each entry being whatever
        ``loader.load()`` returned for that file. The list is empty when no
        file name ends with ``.pdf``.
    """
    data = []
    # os.listdir gives entries in arbitrary order; sorting keeps the result
    # order reproducible between runs and machines.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".pdf"):
            continue
        print(filename)
        # os.path.join avoids doubled or missing separators in the path.
        loader = PyPDFium2Loader(os.path.join(directory_path, filename))
        print(loader)
        data.append(loader.load())
    return data
Word 文档加载,如 doc 或者 docx 格式
def load_word(directory_path):
    """Load every ``.doc`` / ``.docx`` file directly inside ``directory_path``.

    Each matching file is read with ``UnstructuredWordDocumentLoader``.

    Args:
        directory_path: Directory to scan. Only its direct entries are
            examined; sub-directories are not walked.

    Returns:
        A list with one entry per matching file, each entry being whatever
        ``loader.load()`` returned for that file. The list is empty when no
        file name ends with ``.doc`` or ``.docx``.
    """
    data = []
    # os.listdir gives entries in arbitrary order; sorting keeps the result
    # order reproducible between runs and machines.
    for filename in sorted(os.listdir(directory_path)):
        # Only Word documents are handled here; endswith accepts a tuple, so
        # both suffixes are checked in one call.
        if not filename.endswith((".doc", ".docx")):
            continue
        # Word parsing is delegated to the loader shipped with LangChain.
        loader = UnstructuredWordDocumentLoader(
            os.path.join(directory_path, filename)
        )
        data.append(loader.load())
    return data
txt加载
def load_txt(directory_path):
    """Load every ``.txt`` file located directly inside ``directory_path``.

    Each matching file is read with ``TextLoader`` using that loader's default
    settings. The file name and the loader object are printed as progress
    output.

    Args:
        directory_path: Directory to scan. Only its direct entries are
            examined; sub-directories are not walked.

    Returns:
        A list with one entry per matching file, each entry being whatever
        ``loader.load()`` returned for that file. The list is empty when no
        file name ends with ``.txt``.
    """
    data = []
    # os.listdir gives entries in arbitrary order; sorting keeps the result
    # order reproducible between runs and machines.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".txt"):
            continue
        print(filename)
        # os.path.join avoids doubled or missing separators in the path.
        loader = TextLoader(os.path.join(directory_path, filename))
        print(loader)
        data.append(loader.load())
    return data
通过上述方法,常见的文档格式基本上都可以加载进去了,主要就是不同格式对应不同的加载方式;如果想简单一些,也可以直接加载整个目录:
def load_docs(directory):
    """Load the documents under ``directory`` with ``DirectoryLoader``.

    File discovery and parsing are delegated entirely to ``DirectoryLoader``
    constructed with its default settings; no filtering is done here.

    Args:
        directory: Directory handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader(directory).load()`` returns.
    """
    loader = DirectoryLoader(directory)
    return loader.load()