Shreyas094 committed on
Commit 263cb2e · verified · 1 Parent(s): 63416d1

Update app.py

Files changed (1)
  1. app.py +98 -137
app.py CHANGED
@@ -18,17 +18,8 @@ import logging
  import shutil


- logging.basicConfig(level=logging.DEBUG,
-                     format='%(asctime)s - %(levelname)s - %(message)s',
-                     filename='chatbot.log',
-                     filemode='w')
-
- # Also log to console
- console = logging.StreamHandler()
- console.setLevel(logging.INFO)
- formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
- console.setFormatter(formatter)
- logging.getLogger('').addHandler(console)

  # Environment variables and configurations
  huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
@@ -57,30 +48,24 @@ llama_parser = LlamaParse(
  )

  def load_document(file: NamedTemporaryFile, parser: str = "llamaparse") -> List[Document]:
-     logging.info(f"Loading document: {file.name} using parser: {parser}")
      if parser == "pypdf":
          loader = PyPDFLoader(file.name)
-         documents = loader.load_and_split()
      elif parser == "llamaparse":
          try:
              documents = llama_parser.load_data(file.name)
-             documents = [Document(page_content=doc.text, metadata={"source": file.name}) for doc in documents]
          except Exception as e:
-             logging.error(f"Error using Llama Parse: {str(e)}")
-             logging.info("Falling back to PyPDF parser")
              loader = PyPDFLoader(file.name)
-             documents = loader.load_and_split()
      else:
          raise ValueError("Invalid parser specified. Use 'pypdf' or 'llamaparse'.")
-
-     logging.info(f"Loaded {len(documents)} chunks from {file.name}")
-     for i, doc in enumerate(documents):
-         logging.debug(f"Chunk {i} content preview: {doc.page_content[:100]}...")
-
-     return documents

  def get_embeddings():
-     return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  # Add this at the beginning of your script, after imports
  DOCUMENTS_FILE = "uploaded_documents.json"
@@ -99,71 +84,61 @@ def save_documents(documents):
  uploaded_documents = load_documents()

  # Modify the update_vectors function
- from langchain.vectorstores import FAISS
- import faiss
-
- def add_documents_to_faiss(documents: List[Document], embeddings):
-     logging.info(f"Adding {len(documents)} documents to FAISS database")
-     if os.path.exists("faiss_database"):
-         db = FAISS.load_local("faiss_database", embeddings, allow_dangerous_deserialization=True)
-         logging.info(f"Loaded existing FAISS database with {db.index.ntotal} vectors")
-         initial_size = db.index.ntotal
-         db.add_documents(documents)
-         final_size = db.index.ntotal
-         logging.info(f"FAISS database updated. Initial size: {initial_size}, Final size: {final_size}")
-     else:
-         db = FAISS.from_documents(documents, embeddings)
-         logging.info(f"Created new FAISS database with {db.index.ntotal} vectors")

-     db.save_local("faiss_database")
-     logging.info("FAISS database saved")
-     return db
-
- def get_relevant_documents(query: str, selected_docs: List[str], embeddings) -> List[Document]:
-     if not os.path.exists("faiss_database"):
-         logging.warning("No FAISS database found")
-         return []
-
-     db = FAISS.load_local("faiss_database", embeddings, allow_dangerous_deserialization=True)
-     logging.info(f"Loaded FAISS database with {db.index.ntotal} vectors")
-
-     # Retrieve documents without filtering first
-     all_docs = db.similarity_search(query, k=20)  # Increase k to ensure we get enough documents
-     logging.info(f"Retrieved {len(all_docs)} documents from FAISS")
-
-     # Log all retrieved documents
-     for i, doc in enumerate(all_docs):
-         logging.info(f"Retrieved document {i+1} source: {doc.metadata['source']}")
-
-     # Filter documents based on selected_docs
-     filtered_docs = [doc for doc in all_docs if doc.metadata["source"] in selected_docs]
-     logging.info(f"Filtered to {len(filtered_docs)} documents based on selection")
-
-     return filtered_docs
-
- def update_vectors(files: List[NamedTemporaryFile], parser: str, embeddings) -> str:
-     all_documents = []
      for file in files:
          logging.info(f"Processing file: {file.name}")
          try:
-             documents = load_document(file, parser)
-             if not documents:
                  logging.warning(f"No chunks loaded from {file.name}")
                  continue
-             logging.info(f"Loaded {len(documents)} chunks from {file.name}")
-             all_documents.extend(documents)
          except Exception as e:
              logging.error(f"Error processing file {file.name}: {str(e)}")

-     if not all_documents:
-         return "No valid data could be extracted from the uploaded files."
-
      try:
-         db = add_documents_to_faiss(all_documents, embeddings)
-         return f"Vector store updated successfully. Added {len(all_documents)} chunks from {len(files)} files."
      except Exception as e:
          logging.error(f"Error updating FAISS database: {str(e)}")
-         return f"Error updating vector store: {str(e)}"

  def delete_documents(selected_docs):
      global uploaded_documents
@@ -334,7 +309,6 @@ def respond(message, history, model, temperature, num_calls, use_web_search, sel
      logging.info(f"User Query: {message}")
      logging.info(f"Model Used: {model}")
      logging.info(f"Search Type: {'Web Search' if use_web_search else 'PDF Search'}")
-     logging.info(f"Selected Documents: {selected_docs}")

      logging.info(f"Selected Documents: {selected_docs}")
@@ -480,75 +454,62 @@ def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=
      embed = get_embeddings()
      if os.path.exists("faiss_database"):
          logging.info("Loading FAISS database")
-         try:
-             database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
-             logging.info(f"FAISS database loaded with {database.index.ntotal} vectors")
-         except Exception as e:
-             logging.error(f"Error loading FAISS database: {str(e)}")
-             yield "Error loading the document database. Please try uploading the documents again."
-             return
      else:
          logging.warning("No FAISS database found")
          yield "No documents available. Please upload PDF documents to answer questions."
          return

-     try:
-         retriever = database.as_retriever(search_kwargs={"k": 20})  # Increase k to retrieve more documents initially
-         logging.info(f"Retrieving relevant documents for query: {query}")
-         all_relevant_docs = retriever.get_relevant_documents(query)
-         logging.info(f"Number of relevant documents retrieved: {len(all_relevant_docs)}")
-
-         # Log all retrieved documents before filtering
-         for i, doc in enumerate(all_relevant_docs):
-             logging.info(f"Retrieved document {i+1} source: {doc.metadata['source']}")
-
-         # Filter relevant_docs based on selected documents
-         filtered_docs = [doc for doc in all_relevant_docs if doc.metadata["source"] in selected_docs]
-         logging.info(f"Number of filtered documents: {len(filtered_docs)}")
-
-         if not filtered_docs:
-             logging.warning(f"No relevant information found in the selected documents: {selected_docs}")
-             yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
-             return

-         for i, doc in enumerate(filtered_docs):
-             logging.info(f"Document {i+1} source: {doc.metadata['source']}")
-             logging.info(f"Document {i+1} content preview: {doc.page_content[:100]}...")

-         context_str = "\n".join([doc.page_content for doc in filtered_docs])
-         logging.info(f"Total context length: {len(context_str)}")

-         if model == "@cf/meta/llama-3.1-8b-instruct":
-             logging.info("Using Cloudflare API")
-             for response in get_response_from_cloudflare(prompt="", context=context_str, query=query, num_calls=num_calls, temperature=temperature, search_type="pdf"):
-                 yield response
-         else:
-             logging.info("Using Hugging Face API")
-             prompt = f"""Using the following context from the PDF documents:
  {context_str}
  Write a detailed and complete response that answers the following user question: '{query}'"""
-
-             client = InferenceClient(model, token=huggingface_token)
-
-             response = ""
-             for i in range(num_calls):
-                 logging.info(f"API call {i+1}/{num_calls}")
-                 for message in client.chat_completion(
-                     messages=[{"role": "user", "content": prompt}],
-                     max_tokens=10000,
-                     temperature=temperature,
-                     stream=True,
-                 ):
-                     if message.choices and message.choices[0].delta and message.choices[0].delta.content:
-                         chunk = message.choices[0].delta.content
-                         response += chunk
-                         yield response  # Yield partial response
-
-             logging.info("Finished generating response")
-
-     except Exception as e:
-         logging.error(f"Error in get_response_from_pdf: {str(e)}")
-         yield f"An error occurred while processing your query: {str(e)}. Please try again or contact support."

  def vote(data: gr.LikeData):
      if data.liked:

  import shutil


+ # Set up basic configuration for logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

  # Environment variables and configurations
  huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")

  )

  def load_document(file: NamedTemporaryFile, parser: str = "llamaparse") -> List[Document]:
+     """Loads and splits the document into pages."""
      if parser == "pypdf":
          loader = PyPDFLoader(file.name)
+         return loader.load_and_split()
      elif parser == "llamaparse":
          try:
              documents = llama_parser.load_data(file.name)
+             return [Document(page_content=doc.text, metadata={"source": file.name}) for doc in documents]
          except Exception as e:
+             print(f"Error using Llama Parse: {str(e)}")
+             print("Falling back to PyPDF parser")
              loader = PyPDFLoader(file.name)
+             return loader.load_and_split()
      else:
          raise ValueError("Invalid parser specified. Use 'pypdf' or 'llamaparse'.")

  def get_embeddings():
+     return HuggingFaceEmbeddings(model_name="sentence-transformers/stsb-roberta-large")

  # Add this at the beginning of your script, after imports
  DOCUMENTS_FILE = "uploaded_documents.json"
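
For reference, a minimal sketch of how the updated load_document and get_embeddings could be exercised together. The temp-file handling and the "sample.pdf" path are hypothetical and not part of this commit:

    # Hypothetical smoke test for the updated helpers (not part of app.py).
    from tempfile import NamedTemporaryFile

    with NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(open("sample.pdf", "rb").read())  # "sample.pdf" is a placeholder path

    chunks = load_document(tmp, parser="pypdf")       # or "llamaparse", which falls back to PyPDF on errors
    print(f"Loaded {len(chunks)} chunks")

    embeddings = get_embeddings()                     # sentence-transformers/stsb-roberta-large
    print(len(embeddings.embed_query("test query")))  # prints the embedding dimensionality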
 
  uploaded_documents = load_documents()

  # Modify the update_vectors function
+ def update_vectors(files, parser):
+     global uploaded_documents
+     logging.info(f"Entering update_vectors with {len(files)} files and parser: {parser}")

+     if not files:
+         logging.warning("No files provided for update_vectors")
+         return "Please upload at least one PDF file.", display_documents()
+
+     embed = get_embeddings()
+     total_chunks = 0
+
+     all_data = []
      for file in files:
          logging.info(f"Processing file: {file.name}")
          try:
+             data = load_document(file, parser)
+             if not data:
                  logging.warning(f"No chunks loaded from {file.name}")
                  continue
+             logging.info(f"Loaded {len(data)} chunks from {file.name}")
+             all_data.extend(data)
+             total_chunks += len(data)
+             if not any(doc["name"] == file.name for doc in uploaded_documents):
+                 uploaded_documents.append({"name": file.name, "selected": True})
+                 logging.info(f"Added new document to uploaded_documents: {file.name}")
+             else:
+                 logging.info(f"Document already exists in uploaded_documents: {file.name}")
          except Exception as e:
              logging.error(f"Error processing file {file.name}: {str(e)}")

+     logging.info(f"Total chunks processed: {total_chunks}")
+
+     if not all_data:
+         logging.warning("No valid data extracted from uploaded files")
+         return "No valid data could be extracted from the uploaded files. Please check the file contents and try again.", display_documents()
+
      try:
+         if os.path.exists("faiss_database"):
+             logging.info("Updating existing FAISS database")
+             database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
+             database.add_documents(all_data)
+         else:
+             logging.info("Creating new FAISS database")
+             database = FAISS.from_documents(all_data, embed)
+
+         database.save_local("faiss_database")
+         logging.info("FAISS database saved")
      except Exception as e:
          logging.error(f"Error updating FAISS database: {str(e)}")
+         return f"Error updating vector store: {str(e)}", display_documents()
+
+     # Save the updated list of documents
+     save_documents(uploaded_documents)
+
+     return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}.", display_documents()

  def delete_documents(selected_docs):
      global uploaded_documents
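
As a quick sanity check, a hedged sketch of calling the rewritten update_vectors directly; the FakeUpload class below stands in for the file handles Gradio normally passes to this callback and is not part of the app:

    # Hypothetical direct call to update_vectors (not part of app.py).
    class FakeUpload:
        # Stand-in for a Gradio upload handle; only the .name attribute is used.
        def __init__(self, name):
            self.name = name

    files = [FakeUpload("report.pdf")]  # placeholder path; must point to a real PDF on disk
    status, doc_table = update_vectors(files, parser="pypdf")
    print(status)  # summary of processed chunks, or an error message from the except branches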
 
      logging.info(f"User Query: {message}")
      logging.info(f"Model Used: {model}")
      logging.info(f"Search Type: {'Web Search' if use_web_search else 'PDF Search'}")

      logging.info(f"Selected Documents: {selected_docs}")

      embed = get_embeddings()
      if os.path.exists("faiss_database"):
          logging.info("Loading FAISS database")
+         database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
      else:
          logging.warning("No FAISS database found")
          yield "No documents available. Please upload PDF documents to answer questions."
          return

+     retriever = database.as_retriever()
+     logging.info(f"Retrieving relevant documents for query: {query}")
+     relevant_docs = retriever.get_relevant_documents(query)
+     logging.info(f"Number of relevant documents retrieved: {len(relevant_docs)}")
+
+     # Filter relevant_docs based on selected documents
+     filtered_docs = [doc for doc in relevant_docs if doc.metadata["source"] in selected_docs]
+     logging.info(f"Number of filtered documents: {len(filtered_docs)}")
+
+     if not filtered_docs:
+         logging.warning(f"No relevant information found in the selected documents: {selected_docs}")
+         yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
+         return

+     for doc in filtered_docs:
+         logging.info(f"Document source: {doc.metadata['source']}")
+         logging.info(f"Document content preview: {doc.page_content[:100]}...")  # Log first 100 characters of each document

+     context_str = "\n".join([doc.page_content for doc in filtered_docs])
+     logging.info(f"Total context length: {len(context_str)}")

+     if model == "@cf/meta/llama-3.1-8b-instruct":
+         logging.info("Using Cloudflare API")
+         # Use Cloudflare API with the retrieved context
+         for response in get_response_from_cloudflare(prompt="", context=context_str, query=query, num_calls=num_calls, temperature=temperature, search_type="pdf"):
+             yield response
+     else:
+         logging.info("Using Hugging Face API")
+         # Use Hugging Face API
+         prompt = f"""Using the following context from the PDF documents:
  {context_str}
  Write a detailed and complete response that answers the following user question: '{query}'"""
+
+         client = InferenceClient(model, token=huggingface_token)
+
+         response = ""
+         for i in range(num_calls):
+             logging.info(f"API call {i+1}/{num_calls}")
+             for message in client.chat_completion(
+                 messages=[{"role": "user", "content": prompt}],
+                 max_tokens=10000,
+                 temperature=temperature,
+                 stream=True,
+             ):
+                 if message.choices and message.choices[0].delta and message.choices[0].delta.content:
+                     chunk = message.choices[0].delta.content
+                     response += chunk
+                     yield response  # Yield partial response
+
+         logging.info("Finished generating response")

  def vote(data: gr.LikeData):
      if data.liked:
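
Finally, a minimal sketch of how the streaming generator above might be consumed outside Gradio; the model id, query, and document sources are placeholders rather than values taken from this commit:

    # Hypothetical driver for the updated get_response_from_pdf (not part of app.py).
    query = "Summarize the key findings."  # placeholder query
    selected = ["report.pdf"]              # placeholder document source names

    final_answer = ""
    for partial in get_response_from_pdf(query,
                                         model="mistralai/Mistral-7B-Instruct-v0.3",  # placeholder HF model id
                                         selected_docs=selected,
                                         num_calls=1,
                                         temperature=0.2):
        final_answer = partial             # each yield is the accumulated response so far
    print(final_answer)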