Spaces:
Running
Running
#RAG method | |
from PyPDF2 import PdfReader | |
from langchain.document_loaders import PyPDFLoader | |
from langchain.docstore.document import Document | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings | |
from langchain_core.vectorstores import InMemoryVectorStore | |
from dotenv import load_dotenv | |
import os | |
# Load variables from a local .env file (if any) into the environment before
# reading the token, so HF_TOKEN may come from either the shell or .env.
load_dotenv()

# Hugging Face API token used by create_retriever; None when HF_TOKEN is unset.
hf_token = os.environ.get("HF_TOKEN")
def load_and_chunk_pdfs(directory_path):
    """Read every PDF in a directory and split the text into overlapping chunks.

    Each PDF becomes one ``Document`` whose ``metadata["source"]`` is the file
    name; the documents are then split into chunks of at most 1000 characters
    with a 100-character overlap.

    Args:
        directory_path: Directory to scan (non-recursive) for PDF files.

    Returns:
        list: The chunked documents, or an empty list when the directory
        contains no PDF files.

    Raises:
        OSError: If ``directory_path`` cannot be listed (e.g. it does not exist).
    """
    docs = []
    # os.listdir returns entries in arbitrary order; sort so that chunk order
    # (and therefore vector-store contents) is reproducible across runs.
    for filename in sorted(os.listdir(directory_path)):
        # Compare case-insensitively so "REPORT.PDF" is not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        reader = PdfReader(os.path.join(directory_path, filename))
        # Join pages with a newline so the last word of one page does not fuse
        # with the first word of the next; `or ""` guards against pages for
        # which no text could be extracted.
        text = "\n".join(page.extract_text() or "" for page in reader.pages)
        docs.append(Document(page_content=text, metadata={"source": filename}))

    if not docs:
        # Nothing to split: avoid building a splitter for an empty corpus.
        return []

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    return text_splitter.split_documents(docs)
def create_retriever(
    documents: list,
    *,
    api_key=None,
    model_name: str = "sentence-transformers/all-MiniLM-l6-v2",
):
    """Create a retriever backed by Hugging Face embeddings and an in-memory vector store.

    Args:
        documents (list): Documents to embed and add to the vector store.
        api_key (str, optional): Hugging Face API key. When omitted, the
            module-level ``hf_token`` (read from the ``HF_TOKEN`` environment
            variable) is used.
        model_name (str): Embedding model identifier passed to the Hugging Face
            Inference API embeddings client.

    Returns:
        retriever: A retriever object to query the vector store.

    Raises:
        ValueError: If no API key is supplied and ``HF_TOKEN`` is not set.
    """
    # An explicit key wins; the fallback is only evaluated when none is given.
    token = api_key if api_key is not None else hf_token
    if not token:
        # Fail early with an actionable message instead of handing a missing
        # credential to the embeddings client.
        raise ValueError(
            "Hugging Face API key is missing: pass api_key or set HF_TOKEN."
        )

    embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=token, model_name=model_name)
    vectorstore = InMemoryVectorStore(embedding=embeddings)
    vectorstore.add_documents(documents)
    return vectorstore.as_retriever()