|
|
|
import os |
|
import gradio as gr |
|
import datasets |
|
from tqdm import tqdm |
|
from transformers import AutoTokenizer |
|
from langchain.docstore.document import Document |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain_community.vectorstores import FAISS |
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
from langchain_community.vectorstores.utils import DistanceStrategy |
|
from smolagents import Tool, ToolCallingAgent, HfApiModel, DuckDuckGoSearchTool |
|
from langchain_community.document_loaders import PyPDFLoader |
|
|
|
|
|
|
|
|
|
|
|
def load_documents(pdf_folder):
    """Load every PDF in *pdf_folder* and return its extracted pages.

    Args:
        pdf_folder: Path to a directory containing ``.pdf`` files.

    Returns:
        list: One langchain ``Document`` per extracted PDF page, in
        sorted filename order.

    Raises:
        ValueError: If *pdf_folder* is not an existing directory, or if
            it contains no PDF files.
    """
    # isdir (not exists) so a regular-file path raises our ValueError
    # instead of an uncaught OSError from os.listdir below.
    if not os.path.isdir(pdf_folder):
        raise ValueError(f"β Error: The folder {pdf_folder} does not exist!")

    docs = []
    # Sort for a deterministic load order across OSes; match the
    # extension case-insensitively so "REPORT.PDF" is not skipped.
    for file in sorted(os.listdir(pdf_folder)):
        if file.lower().endswith(".pdf"):
            file_path = os.path.join(pdf_folder, file)
            print(f"π Loading: {file_path}")
            loader = PyPDFLoader(file_path)
            docs.extend(loader.load())

    if not docs:
        raise ValueError("β Error: No valid PDFs found in the directory!")
    return docs
|
|
|
|
|
# Bootstrap the corpus at import time: load PDFs from the Colab content
# directory when it is present, otherwise fall back to an empty corpus.
pdf_folder = "/content"
documents = load_documents(pdf_folder) if os.path.exists(pdf_folder) else []
|
|
|
|
|
# Chunking parameters: small overlapping chunks suit the short-context
# gte-small embedder; start indices let chunks be traced back to pages.
_splitter_config = {
    "chunk_size": 200,
    "chunk_overlap": 20,
    "add_start_index": True,
    "strip_whitespace": True,
}
text_splitter = RecursiveCharacterTextSplitter(**_splitter_config)
docs_processed = text_splitter.split_documents(documents)
|
|
|
|
|
# Embed the processed chunks and index them in FAISS with cosine
# distance so retrieval ranks by semantic similarity.
embedding_model = HuggingFaceEmbeddings(model_name="thenlper/gte-small")
vector_db = FAISS.from_documents(
    documents=docs_processed,
    embedding=embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

# NOTE(review): this string literal was broken across two source lines
# (an unterminated literal, i.e. a SyntaxError); rejoined on one line,
# keeping the file's existing "β "-prefix message style.
print("β FAISS Vector Database Successfully Created!")
|
|