Spaces:
Running
Running
File size: 1,881 Bytes
eee9fe9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
# RAG method: load PDFs, split them into chunks, and build a retriever over their embeddings.
from PyPDF2 import PdfReader
from langchain.document_loaders import PyPDFLoader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from dotenv import load_dotenv
import os
# Let python-dotenv populate the process environment (typically from a local
# .env file), then read the Hugging Face token; hf_token is None when the
# HF_TOKEN variable is not set.
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")
def load_and_chunk_pdfs(directory_path, chunk_size=1000, chunk_overlap=100):
    """Read every PDF in a directory and split the extracted text into chunks.

    Each PDF file becomes one ``Document`` whose ``page_content`` is the text
    of all its pages concatenated in page order (no separator is inserted
    between pages) and whose metadata stores the file name under ``"source"``.
    The documents are then split with ``RecursiveCharacterTextSplitter``.

    Args:
        directory_path: Directory to scan for PDF files. Only direct children
            are considered; sub-directories are not traversed.
        chunk_size: Forwarded to the splitter as ``chunk_size``
            (default 1000).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``
            (default 100).

    Returns:
        The list of chunked documents produced by ``split_documents``.
    """
    docs = []
    # os.listdir() gives entries in arbitrary order; sorting keeps the
    # resulting document/chunk order reproducible across runs and platforms.
    for filename in sorted(os.listdir(directory_path)):
        # Case-insensitive match so files such as "REPORT.PDF" are not
        # silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(directory_path, filename)
        # A directory that happens to be named "*.pdf" cannot be parsed.
        if not os.path.isfile(file_path):
            continue
        reader = PdfReader(file_path)
        # join() avoids repeated string reallocation; `or ""` keeps the
        # concatenation safe if a page yields no text.
        text = "".join(page.extract_text() or "" for page in reader.pages)
        docs.append(Document(page_content=text, metadata={"source": filename}))

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)
def create_retriever(
    documents: list,
    api_key=None,
    model_name: str = "sentence-transformers/all-MiniLM-l6-v2",
):
    """Create a retriever backed by Hugging Face embeddings and an in-memory vector store.

    The documents are embedded with ``HuggingFaceInferenceAPIEmbeddings``,
    added to a fresh ``InMemoryVectorStore``, and the store's retriever is
    returned.

    Args:
        documents (list): The documents to embed and add to the vector store.
        api_key (str, optional): Hugging Face API key. When omitted, the
            module-level ``hf_token`` (read from the ``HF_TOKEN`` environment
            variable) is used.
        model_name (str): The model name for sentence-transformer embeddings.

    Returns:
        retriever: A retriever object to query the vector store.

    Raises:
        ValueError: If no API key is supplied and ``hf_token`` is unset/empty.
    """
    token = api_key if api_key is not None else hf_token
    # Fail early with an actionable message instead of letting a missing
    # token surface as an opaque error inside the embeddings client.
    if not token:
        raise ValueError(
            "No Hugging Face API token available: pass api_key or set HF_TOKEN."
        )

    embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=token, model_name=model_name)
    vectorstore = InMemoryVectorStore(embedding=embeddings)
    vectorstore.add_documents(documents)
    return vectorstore.as_retriever()
|