项目中遇到各种数据资源,想要加载进 LangChain 来构建本地知识 AI 系统,该怎么加载对应的文件格式呢?一起研究下。
import os

from langchain.document_loaders import (
    DirectoryLoader,
    PyPDFium2Loader,
    PyPDFLoader,
    TextLoader,
    UnstructuredWordDocumentLoader,
)
def load_pdf(directory_path):
    """Load every PDF file located directly inside ``directory_path``.

    Each matching file is opened with ``PyPDFium2Loader`` and the value
    returned by its ``load()`` call is appended as ONE element of the result,
    so the result has one entry per PDF file (it is not flattened).

    Args:
        directory_path: Directory to scan. Sub-directories are not descended
            into.

    Returns:
        A list with one element per PDF file, in sorted file-name order.

    Raises:
        OSError: Propagated from ``os.listdir`` when the directory cannot be
            read (for example, it does not exist).
    """
    data = []
    # os.listdir() yields entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(directory_path)):
        # Compare case-insensitively so "REPORT.PDF" is not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(directory_path, filename)
        # A directory can carry a ".pdf" suffix; only regular files are loadable.
        if not os.path.isfile(file_path):
            continue
        print(filename)  # print the file name
        loader = PyPDFium2Loader(file_path)
        print(loader)
        data.append(loader.load())
    return data
def load_word(directory_path):
    """Load every Word document located directly inside ``directory_path``.

    Files ending in ``.doc`` or ``.docx`` are opened with
    ``UnstructuredWordDocumentLoader`` and the value returned by its
    ``load()`` call is appended as ONE element of the result, so the result
    has one entry per Word file (it is not flattened).

    Args:
        directory_path: Directory to scan. Sub-directories are not descended
            into.

    Returns:
        A list with one element per Word file, in sorted file-name order.

    Raises:
        OSError: Propagated from ``os.listdir`` when the directory cannot be
            read (for example, it does not exist).
    """
    data = []
    # os.listdir() yields entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(directory_path)):
        # Keep only .doc / .docx files; compare case-insensitively so that
        # names such as "Report.DOCX" are not silently skipped.
        if not filename.lower().endswith((".doc", ".docx")):
            continue
        file_path = os.path.join(directory_path, filename)
        # A directory can carry a matching suffix; only regular files are loadable.
        if not os.path.isfile(file_path):
            continue
        # Word-document loader shipped with langchain.
        loader = UnstructuredWordDocumentLoader(file_path)
        data.append(loader.load())
    return data
def load_txt(directory_path):
    """Load every ``.txt`` file located directly inside ``directory_path``.

    Each matching file is opened with ``TextLoader`` and the value returned by
    its ``load()`` call is appended as ONE element of the result, so the
    result has one entry per text file (it is not flattened).

    Args:
        directory_path: Directory to scan. Sub-directories are not descended
            into.

    Returns:
        A list with one element per text file, in sorted file-name order.

    Raises:
        OSError: Propagated from ``os.listdir`` when the directory cannot be
            read (for example, it does not exist).
    """
    data = []
    # os.listdir() yields entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(directory_path)):
        # Compare case-insensitively so "NOTES.TXT" is not silently skipped.
        if not filename.lower().endswith(".txt"):
            continue
        file_path = os.path.join(directory_path, filename)
        # A directory can carry a ".txt" suffix; only regular files are loadable.
        if not os.path.isfile(file_path):
            continue
        print(filename)
        # NOTE(review): TextLoader is built with its default settings here;
        # confirm the default encoding suits non-ASCII input files.
        loader = TextLoader(file_path)
        print(loader)
        data.append(loader.load())
    return data
通过上述方法,常见的文档格式基本上都可以加载进去了,主要就是不同格式对应不同的加载方式;如果想简单一些,也可以直接加载整个目录:
def load_docs(directory):
    """Load the contents of ``directory`` using a default ``DirectoryLoader``.

    Args:
        directory: Path passed unchanged to ``DirectoryLoader``.

    Returns:
        The value produced by the loader's ``load()`` call.
    """
    # Which files get picked up is decided by DirectoryLoader's own defaults.
    return DirectoryLoader(directory).load()