# Imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader, TextLoader, PyPDFLoader
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import Chroma
from googlesearch import search
import google.generativeai as genai
import warnings
import streamlit as st

warnings.filterwarnings("ignore")

# Gemini configuration
gemini_api_key = "AIzaSyCdMEDGRTlN7_camesAFg6z0ygRK5fCHvs"
genai.configure(api_key=gemini_api_key)

safety_settings = [
    {"category": "HARM_CATEGORY_DANGEROUS", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
]

# CRAG
Internal_knowledge_base = "lemh201 (2).pdf"


def get_doc(question):
    """
    Retrieves the relevant document page content based on the given question.

    Args:
        question (str): The question to search for in the document.

    Returns:
        str: The page content of the most relevant document.
    """
    pdf = Internal_knowledge_base
    loader = PyPDFLoader(pdf)
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=50)
    all_splits = text_splitter.split_documents(docs)
    embedding = GPT4AllEmbeddings()
    vectorstore = Chroma.from_documents(documents=all_splits, collection_name="rag-chroma", embedding=embedding)
    retriever = vectorstore.as_retriever()
    docs = retriever.get_relevant_documents(question, k=1)
    return docs[0].page_content


def get_prompt_retriever(context, question):
    """
    Returns a prompt for a grader assessing the relevance of a retrieved document to a user question.

    Parameters:
        context (str): The retrieved document.
        question (str): The user question.

    Returns:
        str: The prompt for the grader, including the retrieved document, the user question,
            and instructions for grading.
    """
    return f'''You are a grader assessing the relevance of a retrieved document to a user question. \n
    Here is the retrieved document:\n\n {context} \n
    Here is the user question:\n\n {question} \n
    If the document contains keywords related to the user question, grade it as relevant. \n
    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give a score between 0 and 1 to indicate how relevant the document is to the question. \n
    Provide the score without any preamble or explanation. \n'''


def get_score(docs, question):
    """
    Calculates the relevance score for a given question based on the provided document text.

    Parameters:
        docs (str): The retrieved document text to grade.
        question (str): The question for which the score needs to be calculated.

    Returns:
        float: The relevance score for the given question.
    """
    model_score_crag = genai.GenerativeModel('gemini-pro')
    response = model_score_crag.generate_content(get_prompt_retriever(docs, question), safety_settings=safety_settings)
    return float(response.text)
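# Illustrative usage sketch (an assumption, not part of the original pipeline): the
# retrieval and grading helpers above can be exercised on their own before wiring up
# the full CRAG flow. The sample question below is hypothetical.
#
#   question = "What is the derivative of sin(x)?"
#   context = get_doc(question)               # best matching chunk from the PDF
#   relevance = get_score(context, question)  # Gemini returns a score between 0 and 1
#   print(f"Relevance score: {relevance}")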
def get_prompt_rewriter(question):
    """
    Returns a prompt for rewriting the given question.

    Parameters:
        question (str): The original user question.

    Returns:
        str: The rewriting prompt.
    """
    return f'''You are a question rewriter. \n
    Here is the user question:\n\n {question} \n
    Rewrite the question to make it more clear and concise. \n
    At the same time, try to keep the meaning of the question the same. \n
    '''


def rewrite_question(question):
    """
    Rewrites the given question using the Gemini API.

    Args:
        question (str): The original question to be rewritten.

    Returns:
        str: The rewritten question generated by the Gemini model.
    """
    model_prompt_rewriter = genai.GenerativeModel('gemini-pro')
    response = model_prompt_rewriter.generate_content(get_prompt_rewriter(question), safety_settings=safety_settings)
    return response.text


def refine_doc(doc, question):
    """
    Refines the given document by splitting it into smaller chunks, embedding them,
    and retrieving the most relevant chunks for a given question.

    Args:
        doc (str): The document to be refined.
        question (str): The question to find relevant chunks for.

    Returns:
        tuple: A tuple containing the indices of the best chunks and the best chunks themselves.
    """
    with open('docs_to_refine.md', 'w', encoding="utf-8") as file:
        file.write(doc)
    loader = TextLoader('docs_to_refine.md', encoding='UTF-8')
    docs_to_refine = loader.load()
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=250, chunk_overlap=50)
    all_splits = text_splitter.split_documents(docs_to_refine)
    embedding = GPT4AllEmbeddings()
    vectorstore = Chroma.from_documents(documents=all_splits, collection_name="rag-chroma", embedding=embedding)
    retriever = vectorstore.as_retriever()
    docs_refined = retriever.get_relevant_documents(question, k=1)
    score = []
    for i in docs_refined:
        score.append(get_score(i.page_content, question))
    best_doc_index = sorted(range(len(score)), key=lambda i: score[i])[-2:]
    best_doc = [docs_refined[i] for i in best_doc_index]
    return best_doc_index, best_doc


def web_search(query, num_results=5):
    """
    Perform a web search using the specified query and return a list of results.

    Args:
        query (str): The search query.
        num_results (int, optional): The number of search results to retrieve. Defaults to 5.

    Returns:
        list: A list of search result URLs.
    """
    results = []
    for result in search(query, num_results=num_results):
        results.append(result)
    return results


def External_Knowledge(question):
    """
    Retrieves external knowledge related to the given question via a web search.

    Args:
        question (str): The question to search for external knowledge.

    Returns:
        str: The page content of the most relevant document found.
    """
    url = web_search(question)[0]
    loader = WebBaseLoader(url)
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap=50)
    all_splits_ = text_splitter.split_documents(docs)
    embedding = GPT4AllEmbeddings()
    vectorstore_ = Chroma.from_documents(documents=all_splits_, collection_name="rag-chroma", embedding=embedding)
    retriever_ = vectorstore_.as_retriever()
    docs = retriever_.get_relevant_documents(question, k=1)
    return docs[0].page_content


def CRAG(question):
    """
    Retrieves relevant documents based on the given question and returns their content.

    Args:
        question (str): The question to be answered.

    Returns:
        str: The content of the relevant documents.
    """
    docs = get_doc(question)
    score = get_score(docs, question)
    if score >= 0.7:
        score_refined_doc, refined_doc = refine_doc(docs, question)
        return refined_doc[0].page_content + '\n\n' + refined_doc[1].page_content
    elif 0.3