medVedaReportAnalysis / embeddings.py
rishi002's picture
Create embeddings.py
1fd98cd verified
raw
history blame contribute delete
629 Bytes
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
def load_pdf_files(directory):
    """Load every PDF file located directly inside *directory*.

    Entries are visited in sorted name order so the result is reproducible
    (``os.listdir`` makes no ordering guarantee). The extension check is
    case-insensitive, and entries that are not regular files (for example a
    sub-directory whose name happens to end in ``.pdf``) are skipped rather
    than being handed to the loader.

    Args:
        directory: Path of the folder to scan. Sub-folders are not searched.

    Returns:
        A list containing everything ``PyPDFLoader(path).load()`` returned
        for each matching file, concatenated in file-name order. The list is
        empty when the folder holds no PDF files.

    Raises:
        OSError: Propagated from ``os.listdir`` when *directory* cannot be
            read (missing, not a directory, permission denied).
    """
    documents = []
    # Sorting gives a stable order; listdir's order depends on the filesystem.
    for filename in sorted(os.listdir(directory)):
        # Accept ".pdf", ".PDF", ".Pdf", ... alike.
        if not filename.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(directory, filename)
        # A directory named "something.pdf" is not a document; skip it.
        if not os.path.isfile(file_path):
            continue
        documents.extend(PyPDFLoader(file_path).load())
    return documents
def create_chunks(documents, *, chunk_size=1000, chunk_overlap=200):
    """Split *documents* into overlapping chunks.

    The work is delegated to ``RecursiveCharacterTextSplitter.split_documents``.
    The two sizing knobs are exposed as keyword-only arguments; their defaults
    reproduce the values this function previously hard-coded, so existing
    ``create_chunks(documents)`` calls behave as before.

    Args:
        documents: The documents to split, e.g. the list returned by
            ``load_pdf_files``.
        chunk_size: Forwarded to the splitter as ``chunk_size``
            (default 1000).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``
            (default 200).

    Returns:
        The list produced by ``split_documents``; an empty list when
        *documents* is empty.
    """
    # Nothing to split: skip constructing the splitter altogether.
    if not documents:
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)