Rauhan committed on
Commit
6d8505d
1 Parent(s): e3475f1

DEBUG: retrieval

Browse files
Files changed (1) hide show
  1. functions.py +9 -24
functions.py CHANGED
@@ -1,18 +1,17 @@
1
  import pymupdf
2
  from concurrent.futures import ThreadPoolExecutor
3
- from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
4
  from langchain_core.runnables import RunnablePassthrough, RunnableLambda
5
  from langchain_text_splitters import RecursiveCharacterTextSplitter
6
  from langchain_qdrant import QdrantVectorStore
7
  from langchain_qdrant import RetrievalMode
8
  from langchain_core.prompts.chat import ChatPromptTemplate
 
9
  from langchain_core.output_parsers import StrOutputParser
10
  from langchain.retrievers import ParentDocumentRetriever
11
  from langchain_core.runnables.history import RunnableWithMessageHistory
12
  from langchain.memory import ChatMessageHistory
13
  from pandasai import SmartDataframe
14
  from langchain_core.chat_history import BaseChatMessageHistory
15
- from langchain.storage import InMemoryStore
16
  from langchain_community.document_loaders import YoutubeLoader
17
  from langchain.docstore.document import Document
18
  from langchain_huggingface import HuggingFaceEmbeddings
@@ -70,7 +69,6 @@ CHAT HISTORY:
70
  NOTE: Generate responses directly without using phrases like "Response:" or "Answer:". Do not mention the use of extracted context or provide unnecessary details.
71
  """
72
  prompt = ChatPromptTemplate.from_template(prompt)
73
- store = InMemoryStore()
74
  chatHistoryStore = dict()
75
 
76
 
@@ -115,16 +113,14 @@ def createTable(tablename: str):
115
  def addDocuments(text: str, source: str, vectorstore: str):
116
  global vectorEmbeddings
117
  global sparseEmbeddings
118
- global store
119
- parentSplitter = RecursiveCharacterTextSplitter(
120
- chunk_size=2000,
121
- add_start_index=True
122
- )
123
- childSplitter = RecursiveCharacterTextSplitter(
124
- chunk_size=400,
125
  add_start_index=True
126
  )
127
  texts = [Document(page_content=text, metadata={"source": source})]
 
 
128
  vectorstore = QdrantVectorStore.from_existing_collection(
129
  embedding=vectorEmbeddings,
130
  sparse_embedding=sparseEmbeddings,
@@ -133,13 +129,7 @@ def addDocuments(text: str, source: str, vectorstore: str):
133
  api_key=os.environ["QDRANT_API_KEY"],
134
  retrieval_mode=RetrievalMode.HYBRID
135
  )
136
- retriever = ParentDocumentRetriever(
137
- vectorstore=vectorstore,
138
- docstore=store,
139
- child_splitter=childSplitter,
140
- parent_splitter=parentSplitter
141
- )
142
- retriever.add_documents(documents=texts)
143
  return {
144
  "output": "SUCCESS"
145
  }
@@ -189,12 +179,7 @@ def answerQuery(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192")
189
  api_key=os.environ["QDRANT_API_KEY"],
190
  retrieval_mode=RetrievalMode.HYBRID
191
  )
192
- retriever = ParentDocumentRetriever(
193
- vectorstore=vectorstore,
194
- docstore=store,
195
- child_splitter=RecursiveCharacterTextSplitter(),
196
- search_kwargs={"k": 4, "score_threshold": 0}
197
- )
198
  baseChain = (
199
  {"context": RunnableLambda(lambda x: x["question"]) | retriever | RunnableLambda(format_docs),
200
  "question": RunnableLambda(lambda x: x["question"]), "chatHistory": RunnableLambda(lambda x: x["chatHistory"])}
@@ -280,7 +265,7 @@ def getLinks(url: str, timeout=30):
280
  def getTextFromImagePDF(pdfBytes):
281
  def getText(image):
282
  global reader
283
- return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True, x_ths = 0)])
284
  allImages = convert_from_bytes(pdfBytes)
285
  texts = [getText(image) for image in allImages]
286
  return "\n\n\n".join(texts)
 
1
  import pymupdf
2
  from concurrent.futures import ThreadPoolExecutor
 
3
  from langchain_core.runnables import RunnablePassthrough, RunnableLambda
4
  from langchain_text_splitters import RecursiveCharacterTextSplitter
5
  from langchain_qdrant import QdrantVectorStore
6
  from langchain_qdrant import RetrievalMode
7
  from langchain_core.prompts.chat import ChatPromptTemplate
8
+ from uuid import uuid4
9
  from langchain_core.output_parsers import StrOutputParser
10
  from langchain.retrievers import ParentDocumentRetriever
11
  from langchain_core.runnables.history import RunnableWithMessageHistory
12
  from langchain.memory import ChatMessageHistory
13
  from pandasai import SmartDataframe
14
  from langchain_core.chat_history import BaseChatMessageHistory
 
15
  from langchain_community.document_loaders import YoutubeLoader
16
  from langchain.docstore.document import Document
17
  from langchain_huggingface import HuggingFaceEmbeddings
 
69
  NOTE: Generate responses directly without using phrases like "Response:" or "Answer:". Do not mention the use of extracted context or provide unnecessary details.
70
  """
71
  prompt = ChatPromptTemplate.from_template(prompt)
 
72
  chatHistoryStore = dict()
73
 
74
 
 
113
  def addDocuments(text: str, source: str, vectorstore: str):
114
  global vectorEmbeddings
115
  global sparseEmbeddings
116
+ splitter = RecursiveCharacterTextSplitter(
117
+ chunk_size=1500,
118
+ chunk_overlap=250,
 
 
 
 
119
  add_start_index=True
120
  )
121
  texts = [Document(page_content=text, metadata={"source": source})]
122
+ texts = splitter.split_documents(texts)
123
+ ids = [str(uuid4()) for _ in range(len(texts))]
124
  vectorstore = QdrantVectorStore.from_existing_collection(
125
  embedding=vectorEmbeddings,
126
  sparse_embedding=sparseEmbeddings,
 
129
  api_key=os.environ["QDRANT_API_KEY"],
130
  retrieval_mode=RetrievalMode.HYBRID
131
  )
132
+ vectorstore.add_documents(documents=texts, ids=ids)
 
 
 
 
 
 
133
  return {
134
  "output": "SUCCESS"
135
  }
 
179
  api_key=os.environ["QDRANT_API_KEY"],
180
  retrieval_mode=RetrievalMode.HYBRID
181
  )
182
+ retriever = vectorstore.as_retriever(search_type = "mmr", search_kwargs={"k": 4, "score_threshold": None})
 
 
 
 
 
183
  baseChain = (
184
  {"context": RunnableLambda(lambda x: x["question"]) | retriever | RunnableLambda(format_docs),
185
  "question": RunnableLambda(lambda x: x["question"]), "chatHistory": RunnableLambda(lambda x: x["chatHistory"])}
 
265
  def getTextFromImagePDF(pdfBytes):
266
  def getText(image):
267
  global reader
268
+ return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
269
  allImages = convert_from_bytes(pdfBytes)
270
  texts = [getText(image) for image in allImages]
271
  return "\n\n\n".join(texts)