Spaces:
Paused
Paused
Shreyas094
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -455,27 +455,18 @@ def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=
|
|
455 |
if os.path.exists("faiss_database"):
|
456 |
logging.info("Loading FAISS database")
|
457 |
database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
|
458 |
-
|
459 |
-
# Inspect FAISS database
|
460 |
-
logging.info(f"FAISS database size: {len(database.docstore._dict)}")
|
461 |
-
for doc_id, doc in database.docstore._dict.items():
|
462 |
-
logging.info(f"Document ID: {doc_id}, Source: {doc.metadata.get('source', 'Unknown')}")
|
463 |
else:
|
464 |
logging.warning("No FAISS database found")
|
465 |
yield "No documents available. Please upload PDF documents to answer questions."
|
466 |
return
|
467 |
|
468 |
-
retriever = database.as_retriever(search_kwargs={"k": 20}) # Increased k to
|
469 |
logging.info(f"Retrieving relevant documents for query: {query}")
|
470 |
relevant_docs = retriever.get_relevant_documents(query)
|
471 |
logging.info(f"Number of relevant documents retrieved: {len(relevant_docs)}")
|
472 |
|
473 |
-
# Log details of retrieved documents
|
474 |
-
for i, doc in enumerate(relevant_docs):
|
475 |
-
logging.info(f"Retrieved document {i+1}: Source: {doc.metadata.get('source', 'Unknown')}, Content preview: {doc.page_content[:100]}...")
|
476 |
-
|
477 |
# Filter relevant_docs based on selected documents
|
478 |
-
filtered_docs = [doc for doc in relevant_docs if doc.metadata
|
479 |
logging.info(f"Number of filtered documents: {len(filtered_docs)}")
|
480 |
|
481 |
if not filtered_docs:
|
@@ -483,28 +474,28 @@ def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=
|
|
483 |
yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
|
484 |
return
|
485 |
|
486 |
-
|
487 |
-
|
488 |
-
|
489 |
-
start = 0
|
490 |
-
while start < len(text):
|
491 |
-
end = start + chunk_size
|
492 |
-
chunk = text[start:end]
|
493 |
-
chunks.append(chunk)
|
494 |
-
start = end - overlap
|
495 |
-
return chunks
|
496 |
|
|
|
|
|
497 |
context_chunks = []
|
|
|
498 |
for doc in filtered_docs:
|
499 |
-
|
500 |
-
|
|
|
|
|
|
|
|
|
|
|
501 |
|
502 |
-
logging.info(f"Number of context chunks
|
503 |
|
504 |
for i, context_str in enumerate(context_chunks):
|
505 |
logging.info(f"Processing context chunk {i+1}/{len(context_chunks)}")
|
506 |
logging.info(f"Context chunk length: {len(context_str)}")
|
507 |
-
logging.info(f"Context chunk preview: {context_str[:100]}...")
|
508 |
|
509 |
if model == "@cf/meta/llama-3.1-8b-instruct":
|
510 |
logging.info("Using Cloudflare API")
|
@@ -523,7 +514,7 @@ Write a detailed and complete response that answers the following user question:
|
|
523 |
logging.info(f"API call {j+1}/{num_calls}")
|
524 |
for message in client.chat_completion(
|
525 |
messages=[{"role": "user", "content": prompt}],
|
526 |
-
max_tokens=
|
527 |
temperature=temperature,
|
528 |
stream=True,
|
529 |
):
|
|
|
455 |
if os.path.exists("faiss_database"):
|
456 |
logging.info("Loading FAISS database")
|
457 |
database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
|
|
|
|
|
|
|
|
|
|
|
458 |
else:
|
459 |
logging.warning("No FAISS database found")
|
460 |
yield "No documents available. Please upload PDF documents to answer questions."
|
461 |
return
|
462 |
|
463 |
+
retriever = database.as_retriever(search_kwargs={"k": 20}) # Increased k to 20
|
464 |
logging.info(f"Retrieving relevant documents for query: {query}")
|
465 |
relevant_docs = retriever.get_relevant_documents(query)
|
466 |
logging.info(f"Number of relevant documents retrieved: {len(relevant_docs)}")
|
467 |
|
|
|
|
|
|
|
|
|
468 |
# Filter relevant_docs based on selected documents
|
469 |
+
filtered_docs = [doc for doc in relevant_docs if doc.metadata["source"] in selected_docs]
|
470 |
logging.info(f"Number of filtered documents: {len(filtered_docs)}")
|
471 |
|
472 |
if not filtered_docs:
|
|
|
474 |
yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
|
475 |
return
|
476 |
|
477 |
+
for doc in filtered_docs:
|
478 |
+
logging.info(f"Document source: {doc.metadata['source']}")
|
479 |
+
logging.info(f"Document content preview: {doc.page_content[:100]}...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
480 |
|
481 |
+
# Implement a sliding window approach for context
|
482 |
+
max_context_length = 4000 # Adjust based on your model's capacity
|
483 |
context_chunks = []
|
484 |
+
current_chunk = ""
|
485 |
for doc in filtered_docs:
|
486 |
+
if len(current_chunk) + len(doc.page_content) > max_context_length:
|
487 |
+
context_chunks.append(current_chunk)
|
488 |
+
current_chunk = doc.page_content
|
489 |
+
else:
|
490 |
+
current_chunk += "\n" + doc.page_content
|
491 |
+
if current_chunk:
|
492 |
+
context_chunks.append(current_chunk)
|
493 |
|
494 |
+
logging.info(f"Number of context chunks: {len(context_chunks)}")
|
495 |
|
496 |
for i, context_str in enumerate(context_chunks):
|
497 |
logging.info(f"Processing context chunk {i+1}/{len(context_chunks)}")
|
498 |
logging.info(f"Context chunk length: {len(context_str)}")
|
|
|
499 |
|
500 |
if model == "@cf/meta/llama-3.1-8b-instruct":
|
501 |
logging.info("Using Cloudflare API")
|
|
|
514 |
logging.info(f"API call {j+1}/{num_calls}")
|
515 |
for message in client.chat_completion(
|
516 |
messages=[{"role": "user", "content": prompt}],
|
517 |
+
max_tokens=10000,
|
518 |
temperature=temperature,
|
519 |
stream=True,
|
520 |
):
|