|
from langchain_community.document_loaders import PyPDFLoader |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
import os |
|
|
|
def load_pdf_files(directory):
    """Load every PDF found directly inside *directory*.

    Only the top level of *directory* is scanned (no recursion). Entries are
    visited in sorted filename order so the result is reproducible, the
    ``.pdf`` extension is matched case-insensitively, and entries that are
    not regular files are skipped.

    Args:
        directory: Path of the folder to scan.

    Returns:
        A flat list holding everything returned by ``PyPDFLoader(path).load()``
        for each matching file, concatenated in filename order. Empty when
        the folder contains no PDF files.

    Raises:
        OSError: Propagated from ``os.listdir`` when *directory* is missing
            or unreadable. Errors raised by the loader for an individual
            file are propagated unchanged.
    """
    documents = []

    # os.listdir makes no ordering guarantee; sorting keeps the output
    # stable across runs and platforms.
    for filename in sorted(os.listdir(directory)):
        # Accept ".PDF" / ".Pdf" as well as ".pdf".
        if not filename.lower().endswith('.pdf'):
            continue

        file_path = os.path.join(directory, filename)

        # A sub-directory may happen to be named "something.pdf"; only real
        # files are handed to the loader.
        if not os.path.isfile(file_path):
            continue

        documents.extend(PyPDFLoader(file_path).load())

    return documents
|
|
|
def create_chunks(documents, *, chunk_size=1000, chunk_overlap=200):
    """Split *documents* into overlapping chunks.

    Builds a ``RecursiveCharacterTextSplitter`` with the given settings and
    returns the result of its ``split_documents`` call.

    Args:
        documents: The documents to split, e.g. the list produced by
            ``load_pdf_files``.
        chunk_size: Target maximum size of each chunk, forwarded to the
            splitter. Defaults to 1000. Must be positive.
        chunk_overlap: Amount of content shared between consecutive chunks,
            forwarded to the splitter. Defaults to 200. Must be
            non-negative and smaller than *chunk_size*.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for *documents*.

    Raises:
        ValueError: If *chunk_size* is not positive, or *chunk_overlap* is
            negative or not smaller than *chunk_size*.
    """
    # Fail fast with a clear message instead of building a splitter whose
    # settings cannot produce sensible chunks.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            "chunk_overlap must be >= 0 and smaller than chunk_size, "
            f"got chunk_overlap={chunk_overlap}, chunk_size={chunk_size}"
        )

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)