from langchain_core.tools import tool
from pinecone import Pinecone
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os
from dotenv import load_dotenv

load_dotenv()
GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
PINECONE_API = os.getenv("PINECONE_API_KEY")

google_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",  # Correct model name
    google_api_key=GOOGLE_API_KEY
)

pc = Pinecone(api_key=PINECONE_API)

PINECONE_INDEX = "rites-pdf"
index = pc.Index(PINECONE_INDEX)
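# Note: the index's vector dimension must match the embedding model's output
# (768 dimensions for models/embedding-001).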

@tool
def get_context(query: str) -> str:
    """
    Retrieve context information by performing a semantic search on indexed document chunks.

    This tool embeds the provided user query using a Google Generative AI embeddings model,
    then queries a Pinecone index to fetch the top 10 matching document chunks. Each match
    includes metadata such as the text chunk, starting page, ending page, and the source PDF URL.
    The function aggregates these details into a formatted string.

    Args:
        query (str): A user query search string used for semantic matching against the document index.

    Returns:
        str: A formatted string containing the matched document chunks along with their associated metadata,
             including start page, end page, and PDF URL.
    """
    embedding = google_embeddings.embed_query(query)
    search_results = index.query(
        vector=embedding,
        top_k=20,  # Retrieve the top 20 matches
        include_metadata=True
    )
    context = " "
    count = 1
    for match in search_results["matches"]:
        chunk = match["metadata"].get("chunk")
        url = match["metadata"].get("pdf_url")
        start_page = match["metadata"].get("start_page")
        end_page = match["metadata"].get("end_page")

        context += f"""
        Chunk {count}:
        {chunk}
        start_page: {start_page}
        end_page: {end_page}
        pdf_url: {url}
        #########################################
        """
        count += 1

    return context
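
# --- Usage sketch (not part of the tool itself) ---
# A minimal smoke test, assuming GEMINI_API_KEY and PINECONE_API_KEY are set in
# the environment and the "rites-pdf" index exists. Tools created with @tool are
# invoked with a dict of their arguments rather than called directly; the query
# below is an illustrative placeholder.
if __name__ == "__main__":
    result = get_context.invoke(
        {"query": "What does the document say about inspection procedures?"}
    )
    print(result)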