Rauhan committed on
Commit
6d8505d
1 Parent(s): e3475f1

DEBUG: retrieval

Browse files
Files changed (1) hide show
  1. functions.py +9 -24
functions.py CHANGED
@@ -1,18 +1,17 @@
1
  import pymupdf
2
  from concurrent.futures import ThreadPoolExecutor
3
- from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
4
  from langchain_core.runnables import RunnablePassthrough, RunnableLambda
5
  from langchain_text_splitters import RecursiveCharacterTextSplitter
6
  from langchain_qdrant import QdrantVectorStore
7
  from langchain_qdrant import RetrievalMode
8
  from langchain_core.prompts.chat import ChatPromptTemplate
 
9
  from langchain_core.output_parsers import StrOutputParser
10
  from langchain.retrievers import ParentDocumentRetriever
11
  from langchain_core.runnables.history import RunnableWithMessageHistory
12
  from langchain.memory import ChatMessageHistory
13
  from pandasai import SmartDataframe
14
  from langchain_core.chat_history import BaseChatMessageHistory
15
- from langchain.storage import InMemoryStore
16
  from langchain_community.document_loaders import YoutubeLoader
17
  from langchain.docstore.document import Document
18
  from langchain_huggingface import HuggingFaceEmbeddings
@@ -70,7 +69,6 @@ CHAT HISTORY:
70
  NOTE: Generate responses directly without using phrases like "Response:" or "Answer:". Do not mention the use of extracted context or provide unnecessary details.
71
  """
72
  prompt = ChatPromptTemplate.from_template(prompt)
73
- store = InMemoryStore()
74
  chatHistoryStore = dict()
75
 
76
 
@@ -115,16 +113,14 @@ def createTable(tablename: str):
115
  def addDocuments(text: str, source: str, vectorstore: str):
116
  global vectorEmbeddings
117
  global sparseEmbeddings
118
- global store
119
- parentSplitter = RecursiveCharacterTextSplitter(
120
- chunk_size=2000,
121
- add_start_index=True
122
- )
123
- childSplitter = RecursiveCharacterTextSplitter(
124
- chunk_size=400,
125
  add_start_index=True
126
  )
127
  texts = [Document(page_content=text, metadata={"source": source})]
 
 
128
  vectorstore = QdrantVectorStore.from_existing_collection(
129
  embedding=vectorEmbeddings,
130
  sparse_embedding=sparseEmbeddings,
@@ -133,13 +129,7 @@ def addDocuments(text: str, source: str, vectorstore: str):
133
  api_key=os.environ["QDRANT_API_KEY"],
134
  retrieval_mode=RetrievalMode.HYBRID
135
  )
136
- retriever = ParentDocumentRetriever(
137
- vectorstore=vectorstore,
138
- docstore=store,
139
- child_splitter=childSplitter,
140
- parent_splitter=parentSplitter
141
- )
142
- retriever.add_documents(documents=texts)
143
  return {
144
  "output": "SUCCESS"
145
  }
@@ -189,12 +179,7 @@ def answerQuery(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192")
189
  api_key=os.environ["QDRANT_API_KEY"],
190
  retrieval_mode=RetrievalMode.HYBRID
191
  )
192
- retriever = ParentDocumentRetriever(
193
- vectorstore=vectorstore,
194
- docstore=store,
195
- child_splitter=RecursiveCharacterTextSplitter(),
196
- search_kwargs={"k": 4, "score_threshold": 0}
197
- )
198
  baseChain = (
199
  {"context": RunnableLambda(lambda x: x["question"]) | retriever | RunnableLambda(format_docs),
200
  "question": RunnableLambda(lambda x: x["question"]), "chatHistory": RunnableLambda(lambda x: x["chatHistory"])}
@@ -280,7 +265,7 @@ def getLinks(url: str, timeout=30):
280
  def getTextFromImagePDF(pdfBytes):
281
  def getText(image):
282
  global reader
283
- return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True, x_ths = 0)])
284
  allImages = convert_from_bytes(pdfBytes)
285
  texts = [getText(image) for image in allImages]
286
  return "\n\n\n".join(texts)
 
1
  import pymupdf
2
  from concurrent.futures import ThreadPoolExecutor
 
3
  from langchain_core.runnables import RunnablePassthrough, RunnableLambda
4
  from langchain_text_splitters import RecursiveCharacterTextSplitter
5
  from langchain_qdrant import QdrantVectorStore
6
  from langchain_qdrant import RetrievalMode
7
  from langchain_core.prompts.chat import ChatPromptTemplate
8
+ from uuid import uuid4
9
  from langchain_core.output_parsers import StrOutputParser
10
  from langchain.retrievers import ParentDocumentRetriever
11
  from langchain_core.runnables.history import RunnableWithMessageHistory
12
  from langchain.memory import ChatMessageHistory
13
  from pandasai import SmartDataframe
14
  from langchain_core.chat_history import BaseChatMessageHistory
 
15
  from langchain_community.document_loaders import YoutubeLoader
16
  from langchain.docstore.document import Document
17
  from langchain_huggingface import HuggingFaceEmbeddings
 
69
  NOTE: Generate responses directly without using phrases like "Response:" or "Answer:". Do not mention the use of extracted context or provide unnecessary details.
70
  """
71
  prompt = ChatPromptTemplate.from_template(prompt)
 
72
  chatHistoryStore = dict()
73
 
74
 
 
113
  def addDocuments(text: str, source: str, vectorstore: str):
114
  global vectorEmbeddings
115
  global sparseEmbeddings
116
+ splitter = RecursiveCharacterTextSplitter(
117
+ chunk_size=1500,
118
+ chunk_overlap=250,
 
 
 
 
119
  add_start_index=True
120
  )
121
  texts = [Document(page_content=text, metadata={"source": source})]
122
+ texts = splitter.split_documents(texts)
123
+ ids = [str(uuid4()) for _ in range(len(texts))]
124
  vectorstore = QdrantVectorStore.from_existing_collection(
125
  embedding=vectorEmbeddings,
126
  sparse_embedding=sparseEmbeddings,
 
129
  api_key=os.environ["QDRANT_API_KEY"],
130
  retrieval_mode=RetrievalMode.HYBRID
131
  )
132
+ vectorstore.add_documents(documents=texts, ids=ids)
 
 
 
 
 
 
133
  return {
134
  "output": "SUCCESS"
135
  }
 
179
  api_key=os.environ["QDRANT_API_KEY"],
180
  retrieval_mode=RetrievalMode.HYBRID
181
  )
182
+ retriever = vectorstore.as_retriever(search_type = "mmr", search_kwargs={"k": 4, "score_threshold": None})
 
 
 
 
 
183
  baseChain = (
184
  {"context": RunnableLambda(lambda x: x["question"]) | retriever | RunnableLambda(format_docs),
185
  "question": RunnableLambda(lambda x: x["question"]), "chatHistory": RunnableLambda(lambda x: x["chatHistory"])}
 
265
  def getTextFromImagePDF(pdfBytes):
266
  def getText(image):
267
  global reader
268
+ return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
269
  allImages = convert_from_bytes(pdfBytes)
270
  texts = [getText(image) for image in allImages]
271
  return "\n\n\n".join(texts)