import numpy as np
from langchain_community.embeddings import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec
from tqdm.notebook import tqdm
import langchain
import openai
from openai import OpenAI
import string
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
import os


def get_text_from_document(document):
    # 'document' is a list of Document objects, each with a 'page_content' attribute.
    # Concatenate the page_content of each Document into a single text string.
    text = "".join([doc.page_content for doc in document]).replace('\n\n', '\n')

    # 'text' now contains the actual text extracted from the PDF
    print(f"Total length of text: {len(text)} characters")

    # Inspect a portion of the extracted text
    print(text[:1000])  # Adjust the number as necessary to inspect more of the text
    return text


# Function to get the embedding of a text using the OpenAI text-embedding-ada-002 model
def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    # The OpenAI API key is read from the environment
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    return client.embeddings.create(input=[text], model=model).data[0].embedding


# Function to query the Pinecone vector store and return the top-k results
def query_pinecone_vector_store(query, top_k=5):
    # Generate an embedding for the query
    query_embedding = get_embedding(query)

    # The Pinecone API key is read from the environment
    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
    pinecone_index_name = "ee596llm-project2"
    index = pc.Index(pinecone_index_name)

    # Query the Pinecone index with the generated embedding
    query_results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )

    # Extract and return the most relevant documents along with their scores
    relevant_docs = [
        (result['id'], result['score'], result['metadata']['text'])
        for result in query_results['matches']
    ]
    return relevant_docs


def get_completion(prompt, model="gpt-3.5-turbo"):
    message = {"role": "user", "content": prompt}
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    response = client.chat.completions.create(
        model=model,
        messages=[message]
    )
    return response.choices[0].message.content


def generate_answer_with_context(query, results):
    # Construct the prompt with the top-k results as context.
    # Each result is an (id, score, text) tuple, so the document text is result[2].
    context_texts = "\n\n".join(
        [f"Context {idx + 1}: {result[2]}" for idx, result in enumerate(results)])

    print(f"context_texts is: {context_texts}\n\n\n")

    prompt = (
        f"Given the following contexts related to the query '{query}', "
        f"provide a detailed answer:\n\n{context_texts}\n\nAnswer the query:"
    )

    # Generate the answer using the GPT-3.5 Turbo model with the constructed prompt
    answer = get_completion(prompt, model="gpt-3.5-turbo")
    return answer


class Relevant_Documents_Agent:
    def __init__(self, openai_client) -> None:
        # Initialize the Relevant_Documents_Agent with an OpenAI client
        self.openai_client = openai_client

    def get_relevance(self, conversation) -> str:
        # Retrieve the top-k documents relevant to the conversation and
        # answer the query using that retrieved context
        top_k_results = query_pinecone_vector_store(conversation, top_k=4)
        answer = generate_answer_with_context(conversation, top_k_results)
        most_relevant_document = answer
        return most_relevant_document

    def compute_cosine_similarity(self, vec1, vec2):
        # Ensure the vectors are numpy arrays for mathematical operations
        vec1 = np.array(vec1)
        vec2 = np.array(vec2)

        # Compute the cosine similarity
        dot_product = np.dot(vec1, vec2)
        norm_vec1 = np.linalg.norm(vec1)
        norm_vec2 = np.linalg.norm(vec2)
        cosine_similarity = dot_product / (norm_vec1 * norm_vec2)
        return cosine_similarity
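

# Minimal usage sketch of the retrieval-and-answer flow defined above. It assumes
# the OPENAI_API_KEY and PINECONE_API_KEY environment variables are set and that
# the "ee596llm-project2" Pinecone index has already been populated with document
# chunks. The sample query string below is illustrative only.
if __name__ == "__main__":
    sample_query = "What topics does the uploaded PDF cover?"

    # Retrieve the top-k most similar chunks from Pinecone
    results = query_pinecone_vector_store(sample_query, top_k=3)
    for doc_id, score, text in results:
        print(f"{doc_id} (score={score:.3f}): {text[:80]}...")

    # Generate an answer grounded in the retrieved context
    print(generate_answer_with_context(sample_query, results))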