Shreyas094 committed on
Commit
b45f3cf
·
verified ·
1 Parent(s): 0e2e9a3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -26
app.py CHANGED
@@ -455,27 +455,18 @@ def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=
455
  if os.path.exists("faiss_database"):
456
  logging.info("Loading FAISS database")
457
  database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
458
-
459
- # Inspect FAISS database
460
- logging.info(f"FAISS database size: {len(database.docstore._dict)}")
461
- for doc_id, doc in database.docstore._dict.items():
462
- logging.info(f"Document ID: {doc_id}, Source: {doc.metadata.get('source', 'Unknown')}")
463
  else:
464
  logging.warning("No FAISS database found")
465
  yield "No documents available. Please upload PDF documents to answer questions."
466
  return
467
 
468
- retriever = database.as_retriever(search_kwargs={"k": 20}) # Increased k to 20
469
  logging.info(f"Retrieving relevant documents for query: {query}")
470
  relevant_docs = retriever.get_relevant_documents(query)
471
  logging.info(f"Number of relevant documents retrieved: {len(relevant_docs)}")
472
 
473
- # Log details of retrieved documents
474
- for i, doc in enumerate(relevant_docs):
475
- logging.info(f"Retrieved document {i+1}: Source: {doc.metadata.get('source', 'Unknown')}, Content preview: {doc.page_content[:100]}...")
476
-
477
  # Filter relevant_docs based on selected documents
478
- filtered_docs = [doc for doc in relevant_docs if doc.metadata.get("source") in selected_docs]
479
  logging.info(f"Number of filtered documents: {len(filtered_docs)}")
480
 
481
  if not filtered_docs:
@@ -483,28 +474,28 @@ def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=
483
  yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
484
  return
485
 
486
- # Implement a custom chunking strategy
487
- def custom_chunk(text, chunk_size=1000, overlap=200):
488
- chunks = []
489
- start = 0
490
- while start < len(text):
491
- end = start + chunk_size
492
- chunk = text[start:end]
493
- chunks.append(chunk)
494
- start = end - overlap
495
- return chunks
496
 
 
 
497
  context_chunks = []
 
498
  for doc in filtered_docs:
499
- doc_chunks = custom_chunk(doc.page_content)
500
- context_chunks.extend(doc_chunks)
 
 
 
 
 
501
 
502
- logging.info(f"Number of context chunks after custom chunking: {len(context_chunks)}")
503
 
504
  for i, context_str in enumerate(context_chunks):
505
  logging.info(f"Processing context chunk {i+1}/{len(context_chunks)}")
506
  logging.info(f"Context chunk length: {len(context_str)}")
507
- logging.info(f"Context chunk preview: {context_str[:100]}...")
508
 
509
  if model == "@cf/meta/llama-3.1-8b-instruct":
510
  logging.info("Using Cloudflare API")
@@ -523,7 +514,7 @@ Write a detailed and complete response that answers the following user question:
523
  logging.info(f"API call {j+1}/{num_calls}")
524
  for message in client.chat_completion(
525
  messages=[{"role": "user", "content": prompt}],
526
- max_tokens=2000, # Reduced max_tokens to avoid potential errors
527
  temperature=temperature,
528
  stream=True,
529
  ):
 
455
  if os.path.exists("faiss_database"):
456
  logging.info("Loading FAISS database")
457
  database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
 
 
 
 
 
458
  else:
459
  logging.warning("No FAISS database found")
460
  yield "No documents available. Please upload PDF documents to answer questions."
461
  return
462
 
463
+ retriever = database.as_retriever(search_kwargs={"k": 20}) # Increased k to 20
464
  logging.info(f"Retrieving relevant documents for query: {query}")
465
  relevant_docs = retriever.get_relevant_documents(query)
466
  logging.info(f"Number of relevant documents retrieved: {len(relevant_docs)}")
467
 
 
 
 
 
468
  # Filter relevant_docs based on selected documents
469
+ filtered_docs = [doc for doc in relevant_docs if doc.metadata["source"] in selected_docs]
470
  logging.info(f"Number of filtered documents: {len(filtered_docs)}")
471
 
472
  if not filtered_docs:
 
474
  yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
475
  return
476
 
477
+ for doc in filtered_docs:
478
+ logging.info(f"Document source: {doc.metadata['source']}")
479
+ logging.info(f"Document content preview: {doc.page_content[:100]}...")
 
 
 
 
 
 
 
480
 
481
+ # Implement a sliding window approach for context
482
+ max_context_length = 4000 # Adjust based on your model's capacity
483
  context_chunks = []
484
+ current_chunk = ""
485
  for doc in filtered_docs:
486
+ if len(current_chunk) + len(doc.page_content) > max_context_length:
487
+ context_chunks.append(current_chunk)
488
+ current_chunk = doc.page_content
489
+ else:
490
+ current_chunk += "\n" + doc.page_content
491
+ if current_chunk:
492
+ context_chunks.append(current_chunk)
493
 
494
+ logging.info(f"Number of context chunks: {len(context_chunks)}")
495
 
496
  for i, context_str in enumerate(context_chunks):
497
  logging.info(f"Processing context chunk {i+1}/{len(context_chunks)}")
498
  logging.info(f"Context chunk length: {len(context_str)}")
 
499
 
500
  if model == "@cf/meta/llama-3.1-8b-instruct":
501
  logging.info("Using Cloudflare API")
 
514
  logging.info(f"API call {j+1}/{num_calls}")
515
  for message in client.chat_completion(
516
  messages=[{"role": "user", "content": prompt}],
517
+ max_tokens=10000,
518
  temperature=temperature,
519
  stream=True,
520
  ):