from langchain_core.tools import tool
import pinecone
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os
from dotenv import load_dotenv

# Load API keys from the local .env file
load_dotenv()
GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
PINECONE_API = os.getenv("PINECONE_API_KEY")

# Gemini embedding model used to embed user queries
google_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=GOOGLE_API_KEY
)

# Connect to the Pinecone index that holds the document chunks
pc = pinecone.Pinecone(api_key=PINECONE_API)
PINECONE_INDEX = "rites-pdf"
index = pc.Index(PINECONE_INDEX)
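
# --- Assumed indexing side (sketch, not part of the original file) ---
# get_context below expects every vector in "rites-pdf" to carry "chunk",
# "start_page", "end_page", and "pdf_url" metadata. A hypothetical helper
# that upserts one chunk with that schema might look like this; the name
# upsert_chunk and its parameters are illustrative assumptions.
def upsert_chunk(chunk_id: str, text: str, start_page: int, end_page: int, pdf_url: str) -> None:
    """Hypothetical helper: embed one chunk and store it with the metadata get_context reads."""
    vector = google_embeddings.embed_documents([text])[0]  # document-side embedding
    index.upsert(vectors=[{
        "id": chunk_id,
        "values": vector,
        "metadata": {
            "chunk": text,
            "start_page": start_page,
            "end_page": end_page,
            "pdf_url": pdf_url,
        },
    }])
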
@tool
def get_context(query: str) -> str:
""" | |
Retrieve context information by performing a semantic search on indexed document chunks. | |
This tool embeds the provided user query using a Google Generative AI embeddings model, | |
then queries a Pinecone index to fetch the top 10 matching document chunks. Each match | |
includes metadata such as the text chunk, starting page, ending page, and the source PDF URL. | |
The function aggregates these details into a formatted string. | |
Args: | |
query (str): A user query search string used for semantic matching against the document index. | |
Returns: | |
str: A formatted string containing the matched document chunks along with their associated metadata, | |
including start page, end page, and PDF URL. | |
""" | |
    # Embed the query and fetch the closest document chunks from Pinecone
    embedding = google_embeddings.embed_query(query)
    search_results = index.query(
        vector=embedding,
        top_k=20,  # Retrieve the top 20 matches
        include_metadata=True
    )
context = " " | |
count = 1 | |
for match in search_results["matches"]: | |
chunk = match["metadata"].get("chunk") | |
url = match["metadata"].get("pdf_url") | |
start_page = match["metadata"].get("start_page") | |
end_page = match["metadata"].get("end_page") | |
context += f""" | |
Chunk {count}: | |
{chunk} | |
start_page: {start_page} | |
end_page: {end_page} | |
pdf_url: {url} | |
######################################### | |
""" | |
count += 1 | |
return context | |
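
# --- Usage sketch (assumption, not part of the original file) ---
# Because get_context is wrapped with @tool, it is a LangChain tool and is
# called through .invoke() rather than as a plain function. The query string
# below is a placeholder.
if __name__ == "__main__":
    print(get_context.invoke("What are the key requirements described in the document?"))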