# MAgentRag / app.py

# ✅ Import Libraries
import os
import gradio as gr
import datasets
from tqdm import tqdm
from transformers import AutoTokenizer
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS  # ✅ FIXED IMPORT
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from smolagents import Tool, ToolCallingAgent, HfApiModel, DuckDuckGoSearchTool
from langchain_community.document_loaders import PyPDFLoader  # ✅ FIXED IMPORT
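
# Note: gradio and the smolagents classes above are presumably used further down in app.py
# (agent and UI setup); they are not needed for the document indexing steps below.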
# notebook_login() removed: interactive Hub login is for notebooks; set the HF_TOKEN
# environment variable (or run `huggingface-cli login`) if authentication is required.

# ✅ Step 2: Load PDF Documents for RAG
def load_documents(pdf_folder):
    """Loads PDFs and extracts their text for RAG."""
    docs = []
    if not os.path.exists(pdf_folder):
        raise ValueError(f"❌ Error: The folder {pdf_folder} does not exist!")
    for file in os.listdir(pdf_folder):
        if file.endswith(".pdf"):
            file_path = os.path.join(pdf_folder, file)
            print(f"📂 Loading: {file_path}")
            loader = PyPDFLoader(file_path)  # ✅ FIXED (each PDF page becomes one Document)
            docs.extend(loader.load())
    if not docs:
        raise ValueError("❌ Error: No valid PDFs found in the directory!")
    return docs

# ✅ Ensure PDF Folder Exists
pdf_folder = "/content"  # Colab default; change this for local or Space deployments
if os.path.exists(pdf_folder):
    documents = load_documents(pdf_folder)
else:
    documents = []

# ✅ Process Documents for Vector Search
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200, chunk_overlap=20, add_start_index=True, strip_whitespace=True
)
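# Note: chunk_size / chunk_overlap above are measured in characters, not tokens;
# 200-character chunks are quite small, so consider a larger size if retrieved context feels thin.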
docs_processed = text_splitter.split_documents(documents)

# ✅ Create FAISS Vector Database
if not docs_processed:
    raise ValueError("❌ Error: No document chunks to index! Check that pdf_folder points at your PDFs.")

embedding_model = HuggingFaceEmbeddings(model_name="thenlper/gte-small")
vector_db = FAISS.from_documents(
    documents=docs_processed,
    embedding=embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)
print("✅ FAISS Vector Database Successfully Created!")