DEBUG: retrieval
functions.py CHANGED (+9 -24)
@@ -1,18 +1,17 @@
 import pymupdf
 from concurrent.futures import ThreadPoolExecutor
-from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
 from langchain_core.runnables import RunnablePassthrough, RunnableLambda
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_qdrant import QdrantVectorStore
 from langchain_qdrant import RetrievalMode
 from langchain_core.prompts.chat import ChatPromptTemplate
+from uuid import uuid4
 from langchain_core.output_parsers import StrOutputParser
 from langchain.retrievers import ParentDocumentRetriever
 from langchain_core.runnables.history import RunnableWithMessageHistory
 from langchain.memory import ChatMessageHistory
 from pandasai import SmartDataframe
 from langchain_core.chat_history import BaseChatMessageHistory
-from langchain.storage import InMemoryStore
 from langchain_community.document_loaders import YoutubeLoader
 from langchain.docstore.document import Document
 from langchain_huggingface import HuggingFaceEmbeddings
@@ -70,7 +69,6 @@ CHAT HISTORY:
 NOTE: Generate responses directly without using phrases like "Response:" or "Answer:". Do not mention the use of extracted context or provide unnecessary details.
 """
 prompt = ChatPromptTemplate.from_template(prompt)
-store = InMemoryStore()
 chatHistoryStore = dict()


@@ -115,16 +113,14 @@ def createTable(tablename: str):
 def addDocuments(text: str, source: str, vectorstore: str):
     global vectorEmbeddings
     global sparseEmbeddings
-    parentSplitter = RecursiveCharacterTextSplitter(
-
-
-        add_start_index=True
-    )
-    childSplitter = RecursiveCharacterTextSplitter(
-        chunk_size=400,
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1500,
+        chunk_overlap=250,
         add_start_index=True
     )
     texts = [Document(page_content=text, metadata={"source": source})]
+    texts = splitter.split_documents(texts)
+    ids = [str(uuid4()) for _ in range(len(texts))]
     vectorstore = QdrantVectorStore.from_existing_collection(
         embedding=vectorEmbeddings,
         sparse_embedding=sparseEmbeddings,
@@ -133,13 +129,7 @@ def addDocuments(text: str, source: str, vectorstore: str):
         api_key=os.environ["QDRANT_API_KEY"],
         retrieval_mode=RetrievalMode.HYBRID
     )
-    retriever = ParentDocumentRetriever(
-        vectorstore=vectorstore,
-        docstore=store,
-        child_splitter=childSplitter,
-        parent_splitter=parentSplitter
-    )
-    retriever.add_documents(documents=texts)
+    vectorstore.add_documents(documents=texts, ids=ids)
     return {
         "output": "SUCCESS"
     }
@@ -189,12 +179,7 @@ def answerQuery(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192")
|
|
189 |
api_key=os.environ["QDRANT_API_KEY"],
|
190 |
retrieval_mode=RetrievalMode.HYBRID
|
191 |
)
|
192 |
-
retriever =
|
193 |
-
vectorstore=vectorstore,
|
194 |
-
docstore=store,
|
195 |
-
child_splitter=RecursiveCharacterTextSplitter(),
|
196 |
-
search_kwargs={"k": 4, "score_threshold": 0}
|
197 |
-
)
|
198 |
baseChain = (
|
199 |
{"context": RunnableLambda(lambda x: x["question"]) | retriever | RunnableLambda(format_docs),
|
200 |
"question": RunnableLambda(lambda x: x["question"]), "chatHistory": RunnableLambda(lambda x: x["chatHistory"])}
|
@@ -280,7 +265,7 @@ def getLinks(url: str, timeout=30):
 def getTextFromImagePDF(pdfBytes):
     def getText(image):
         global reader
-        return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True
+        return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
     allImages = convert_from_bytes(pdfBytes)
     texts = [getText(image) for image in allImages]
     return "\n\n\n".join(texts)
|