# rites-pdf / tools.py
# Retrieval tool: embeds a query with Google Generative AI embeddings and
# searches the "rites-pdf" Pinecone index for matching document chunks.
from langchain_core.tools import tool
import pinecone
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os
from dotenv import load_dotenv
load_dotenv()
GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
PINECONE_API = os.getenv("PINECONE_API_KEY")
google_embeddings = GoogleGenerativeAIEmbeddings(
model="models/embedding-001", # Correct model name
google_api_key=GOOGLE_API_KEY
)
pc = pinecone.Pinecone(
api_key=PINECONE_API
)
PINECONE_INDEX = "rites-pdf"
index = pc.Index(PINECONE_INDEX)
@tool
def get_context(query: str) -> str:
"""
Retrieve context information by performing a semantic search on indexed document chunks.
This tool embeds the provided user query using a Google Generative AI embeddings model,
then queries a Pinecone index to fetch the top 10 matching document chunks. Each match
includes metadata such as the text chunk, starting page, ending page, and the source PDF URL.
The function aggregates these details into a formatted string.
Args:
query (str): A user query search string used for semantic matching against the document index.
Returns:
str: A formatted string containing the matched document chunks along with their associated metadata,
including start page, end page, and PDF URL.
"""
embedding = google_embeddings.embed_query(query)
search_results = index.query(
vector=embedding,
top_k=20, # Retrieve top 10 results
include_metadata=True
)
context = " "
count = 1
for match in search_results["matches"]:
chunk = match["metadata"].get("chunk")
url = match["metadata"].get("pdf_url")
start_page = match["metadata"].get("start_page")
end_page = match["metadata"].get("end_page")
context += f"""
Chunk {count}:
{chunk}
start_page: {start_page}
end_page: {end_page}
pdf_url: {url}
#########################################
"""
count += 1
return context