"""Build a FAISS vector database from local PDF documents for RAG retrieval."""

import os

import gradio as gr
import datasets
from tqdm import tqdm
from transformers import AutoTokenizer
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from smolagents import Tool, ToolCallingAgent, HfApiModel, DuckDuckGoSearchTool


def load_documents(pdf_folder):
    """Load every PDF in *pdf_folder* and return the extracted pages for RAG.

    Args:
        pdf_folder: Path to a directory containing ``.pdf`` files.

    Returns:
        list: LangChain ``Document`` objects, one per extracted PDF page.

    Raises:
        ValueError: If the folder does not exist, or no valid PDFs are found.
    """
    if not os.path.exists(pdf_folder):
        raise ValueError(f"❌ Error: The folder {pdf_folder} does not exist!")

    docs = []
    # Sort for a deterministic load order; match the extension
    # case-insensitively so files like "report.PDF" are not skipped.
    for file in sorted(os.listdir(pdf_folder)):
        if file.lower().endswith(".pdf"):
            file_path = os.path.join(pdf_folder, file)
            print(f"📂 Loading: {file_path}")
            loader = PyPDFLoader(file_path)
            docs.extend(loader.load())

    if not docs:
        raise ValueError("❌ Error: No valid PDFs found in the directory!")
    return docs


# Folder holding the source PDFs (Colab default); change if needed.
pdf_folder = "/content"
documents = load_documents(pdf_folder) if os.path.exists(pdf_folder) else []

# Split pages into small overlapping chunks so each embedding stays well
# within the input limit of the "gte-small" encoder used below.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
    add_start_index=True,
    strip_whitespace=True,
)
docs_processed = text_splitter.split_documents(documents)

# Guard: FAISS.from_documents fails with an opaque IndexError on an empty
# list (e.g. when pdf_folder was missing), so fail early with a clear message.
if not docs_processed:
    raise ValueError("❌ Error: No document chunks to index — add PDFs to the folder first!")

embedding_model = HuggingFaceEmbeddings(model_name="thenlper/gte-small")
vector_db = FAISS.from_documents(
    documents=docs_processed,
    embedding=embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)
print("✅ FAISS Vector Database Successfully Created!")