Shreyas094 committed on
Commit 263cb2e · verified · 1 Parent(s): 63416d1

Update app.py

Files changed (1)
  1. app.py +98 -137
app.py CHANGED
@@ -18,17 +18,8 @@ import logging
  import shutil


- logging.basicConfig(level=logging.DEBUG,
-                     format='%(asctime)s - %(levelname)s - %(message)s',
-                     filename='chatbot.log',
-                     filemode='w')
-
- # Also log to console
- console = logging.StreamHandler()
- console.setLevel(logging.INFO)
- formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
- console.setFormatter(formatter)
- logging.getLogger('').addHandler(console)

  # Environment variables and configurations
  huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
@@ -57,30 +48,24 @@ llama_parser = LlamaParse(
  )

  def load_document(file: NamedTemporaryFile, parser: str = "llamaparse") -> List[Document]:
-     logging.info(f"Loading document: {file.name} using parser: {parser}")
      if parser == "pypdf":
          loader = PyPDFLoader(file.name)
-         documents = loader.load_and_split()
      elif parser == "llamaparse":
          try:
              documents = llama_parser.load_data(file.name)
-             documents = [Document(page_content=doc.text, metadata={"source": file.name}) for doc in documents]
          except Exception as e:
-             logging.error(f"Error using Llama Parse: {str(e)}")
-             logging.info("Falling back to PyPDF parser")
              loader = PyPDFLoader(file.name)
-             documents = loader.load_and_split()
      else:
          raise ValueError("Invalid parser specified. Use 'pypdf' or 'llamaparse'.")
-
-     logging.info(f"Loaded {len(documents)} chunks from {file.name}")
-     for i, doc in enumerate(documents):
-         logging.debug(f"Chunk {i} content preview: {doc.page_content[:100]}...")
-
-     return documents

  def get_embeddings():
-     return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  # Add this at the beginning of your script, after imports
  DOCUMENTS_FILE = "uploaded_documents.json"
@@ -99,71 +84,61 @@ def save_documents(documents):
  uploaded_documents = load_documents()

  # Modify the update_vectors function
- from langchain.vectorstores import FAISS
- import faiss
-
- def add_documents_to_faiss(documents: List[Document], embeddings):
-     logging.info(f"Adding {len(documents)} documents to FAISS database")
-     if os.path.exists("faiss_database"):
-         db = FAISS.load_local("faiss_database", embeddings, allow_dangerous_deserialization=True)
-         logging.info(f"Loaded existing FAISS database with {db.index.ntotal} vectors")
-         initial_size = db.index.ntotal
-         db.add_documents(documents)
-         final_size = db.index.ntotal
-         logging.info(f"FAISS database updated. Initial size: {initial_size}, Final size: {final_size}")
-     else:
-         db = FAISS.from_documents(documents, embeddings)
-         logging.info(f"Created new FAISS database with {db.index.ntotal} vectors")

-     db.save_local("faiss_database")
-     logging.info("FAISS database saved")
-     return db
-
- def get_relevant_documents(query: str, selected_docs: List[str], embeddings) -> List[Document]:
-     if not os.path.exists("faiss_database"):
-         logging.warning("No FAISS database found")
-         return []
-
-     db = FAISS.load_local("faiss_database", embeddings, allow_dangerous_deserialization=True)
-     logging.info(f"Loaded FAISS database with {db.index.ntotal} vectors")
-
-     # Retrieve documents without filtering first
-     all_docs = db.similarity_search(query, k=20)  # Increase k to ensure we get enough documents
-     logging.info(f"Retrieved {len(all_docs)} documents from FAISS")
-
-     # Log all retrieved documents
-     for i, doc in enumerate(all_docs):
-         logging.info(f"Retrieved document {i+1} source: {doc.metadata['source']}")
-
-     # Filter documents based on selected_docs
-     filtered_docs = [doc for doc in all_docs if doc.metadata["source"] in selected_docs]
-     logging.info(f"Filtered to {len(filtered_docs)} documents based on selection")
-
-     return filtered_docs
-
- def update_vectors(files: List[NamedTemporaryFile], parser: str, embeddings) -> str:
-     all_documents = []
      for file in files:
          logging.info(f"Processing file: {file.name}")
          try:
-             documents = load_document(file, parser)
-             if not documents:
                  logging.warning(f"No chunks loaded from {file.name}")
                  continue
-             logging.info(f"Loaded {len(documents)} chunks from {file.name}")
-             all_documents.extend(documents)
          except Exception as e:
              logging.error(f"Error processing file {file.name}: {str(e)}")

-     if not all_documents:
-         return "No valid data could be extracted from the uploaded files."
-
      try:
-         db = add_documents_to_faiss(all_documents, embeddings)
-         return f"Vector store updated successfully. Added {len(all_documents)} chunks from {len(files)} files."
      except Exception as e:
          logging.error(f"Error updating FAISS database: {str(e)}")
-         return f"Error updating vector store: {str(e)}"

  def delete_documents(selected_docs):
      global uploaded_documents
@@ -334,7 +309,6 @@ def respond(message, history, model, temperature, num_calls, use_web_search, sel
      logging.info(f"User Query: {message}")
      logging.info(f"Model Used: {model}")
      logging.info(f"Search Type: {'Web Search' if use_web_search else 'PDF Search'}")
-     logging.info(f"Selected Documents: {selected_docs}")

      logging.info(f"Selected Documents: {selected_docs}")
@@ -480,75 +454,62 @@ def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=
      embed = get_embeddings()
      if os.path.exists("faiss_database"):
          logging.info("Loading FAISS database")
-         try:
-             database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
-             logging.info(f"FAISS database loaded with {database.index.ntotal} vectors")
-         except Exception as e:
-             logging.error(f"Error loading FAISS database: {str(e)}")
-             yield "Error loading the document database. Please try uploading the documents again."
-             return
      else:
          logging.warning("No FAISS database found")
          yield "No documents available. Please upload PDF documents to answer questions."
          return

-     try:
-         retriever = database.as_retriever(search_kwargs={"k": 20})  # Increase k to retrieve more documents initially
-         logging.info(f"Retrieving relevant documents for query: {query}")
-         all_relevant_docs = retriever.get_relevant_documents(query)
-         logging.info(f"Number of relevant documents retrieved: {len(all_relevant_docs)}")
-
-         # Log all retrieved documents before filtering
-         for i, doc in enumerate(all_relevant_docs):
-             logging.info(f"Retrieved document {i+1} source: {doc.metadata['source']}")
-
-         # Filter relevant_docs based on selected documents
-         filtered_docs = [doc for doc in all_relevant_docs if doc.metadata["source"] in selected_docs]
-         logging.info(f"Number of filtered documents: {len(filtered_docs)}")
-
-         if not filtered_docs:
-             logging.warning(f"No relevant information found in the selected documents: {selected_docs}")
-             yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
-             return

-         for i, doc in enumerate(filtered_docs):
-             logging.info(f"Document {i+1} source: {doc.metadata['source']}")
-             logging.info(f"Document {i+1} content preview: {doc.page_content[:100]}...")

-         context_str = "\n".join([doc.page_content for doc in filtered_docs])
-         logging.info(f"Total context length: {len(context_str)}")

-         if model == "@cf/meta/llama-3.1-8b-instruct":
-             logging.info("Using Cloudflare API")
-             for response in get_response_from_cloudflare(prompt="", context=context_str, query=query, num_calls=num_calls, temperature=temperature, search_type="pdf"):
-                 yield response
-         else:
-             logging.info("Using Hugging Face API")
-             prompt = f"""Using the following context from the PDF documents:
  {context_str}
  Write a detailed and complete response that answers the following user question: '{query}'"""
-
-             client = InferenceClient(model, token=huggingface_token)
-
-             response = ""
-             for i in range(num_calls):
-                 logging.info(f"API call {i+1}/{num_calls}")
-                 for message in client.chat_completion(
-                     messages=[{"role": "user", "content": prompt}],
-                     max_tokens=10000,
-                     temperature=temperature,
-                     stream=True,
-                 ):
-                     if message.choices and message.choices[0].delta and message.choices[0].delta.content:
-                         chunk = message.choices[0].delta.content
-                         response += chunk
-                         yield response  # Yield partial response
-
-             logging.info("Finished generating response")
-
-     except Exception as e:
-         logging.error(f"Error in get_response_from_pdf: {str(e)}")
-         yield f"An error occurred while processing your query: {str(e)}. Please try again or contact support."

  def vote(data: gr.LikeData):
      if data.liked:

  import shutil


+ # Set up basic configuration for logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

  # Environment variables and configurations
  huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")

  )

  def load_document(file: NamedTemporaryFile, parser: str = "llamaparse") -> List[Document]:
+     """Loads and splits the document into pages."""
      if parser == "pypdf":
          loader = PyPDFLoader(file.name)
+         return loader.load_and_split()
      elif parser == "llamaparse":
          try:
              documents = llama_parser.load_data(file.name)
+             return [Document(page_content=doc.text, metadata={"source": file.name}) for doc in documents]
          except Exception as e:
+             print(f"Error using Llama Parse: {str(e)}")
+             print("Falling back to PyPDF parser")
              loader = PyPDFLoader(file.name)
+             return loader.load_and_split()
      else:
          raise ValueError("Invalid parser specified. Use 'pypdf' or 'llamaparse'.")

  def get_embeddings():
+     return HuggingFaceEmbeddings(model_name="sentence-transformers/stsb-roberta-large")

  # Add this at the beginning of your script, after imports
  DOCUMENTS_FILE = "uploaded_documents.json"
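
For reference, a minimal sketch of how the updated load_document and get_embeddings could be exercised together. The temp-file handling and the "sample.pdf" path are hypothetical and not part of this commit:

    # Hypothetical smoke test for the updated helpers (not part of app.py).
    from tempfile import NamedTemporaryFile

    with NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(open("sample.pdf", "rb").read())  # "sample.pdf" is a placeholder path

    chunks = load_document(tmp, parser="pypdf")       # or "llamaparse", which falls back to PyPDF on errors
    print(f"Loaded {len(chunks)} chunks")

    embeddings = get_embeddings()                     # sentence-transformers/stsb-roberta-large
    print(len(embeddings.embed_query("test query")))  # prints the embedding dimensionality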
 
  uploaded_documents = load_documents()

  # Modify the update_vectors function
+ def update_vectors(files, parser):
+     global uploaded_documents
+     logging.info(f"Entering update_vectors with {len(files)} files and parser: {parser}")

+     if not files:
+         logging.warning("No files provided for update_vectors")
+         return "Please upload at least one PDF file.", display_documents()
+
+     embed = get_embeddings()
+     total_chunks = 0
+
+     all_data = []
      for file in files:
          logging.info(f"Processing file: {file.name}")
          try:
+             data = load_document(file, parser)
+             if not data:
                  logging.warning(f"No chunks loaded from {file.name}")
                  continue
+             logging.info(f"Loaded {len(data)} chunks from {file.name}")
+             all_data.extend(data)
+             total_chunks += len(data)
+             if not any(doc["name"] == file.name for doc in uploaded_documents):
+                 uploaded_documents.append({"name": file.name, "selected": True})
+                 logging.info(f"Added new document to uploaded_documents: {file.name}")
+             else:
+                 logging.info(f"Document already exists in uploaded_documents: {file.name}")
          except Exception as e:
              logging.error(f"Error processing file {file.name}: {str(e)}")

+     logging.info(f"Total chunks processed: {total_chunks}")
+
+     if not all_data:
+         logging.warning("No valid data extracted from uploaded files")
+         return "No valid data could be extracted from the uploaded files. Please check the file contents and try again.", display_documents()
+
      try:
+         if os.path.exists("faiss_database"):
+             logging.info("Updating existing FAISS database")
+             database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
+             database.add_documents(all_data)
+         else:
+             logging.info("Creating new FAISS database")
+             database = FAISS.from_documents(all_data, embed)
+
+         database.save_local("faiss_database")
+         logging.info("FAISS database saved")
      except Exception as e:
          logging.error(f"Error updating FAISS database: {str(e)}")
+         return f"Error updating vector store: {str(e)}", display_documents()
+
+     # Save the updated list of documents
+     save_documents(uploaded_documents)
+
+     return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}.", display_documents()

  def delete_documents(selected_docs):
      global uploaded_documents
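
As a quick sanity check, a hedged sketch of calling the rewritten update_vectors directly; the FakeUpload class below stands in for the file handles Gradio normally passes to this callback and is not part of the app:

    # Hypothetical direct call to update_vectors (not part of app.py).
    class FakeUpload:
        # Stand-in for a Gradio upload handle; only the .name attribute is used.
        def __init__(self, name):
            self.name = name

    files = [FakeUpload("report.pdf")]  # placeholder path; must point to a real PDF on disk
    status, doc_table = update_vectors(files, parser="pypdf")
    print(status)  # summary of processed chunks, or an error message from the except branches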
 
      logging.info(f"User Query: {message}")
      logging.info(f"Model Used: {model}")
      logging.info(f"Search Type: {'Web Search' if use_web_search else 'PDF Search'}")

      logging.info(f"Selected Documents: {selected_docs}")

      embed = get_embeddings()
      if os.path.exists("faiss_database"):
          logging.info("Loading FAISS database")
+         database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
      else:
          logging.warning("No FAISS database found")
          yield "No documents available. Please upload PDF documents to answer questions."
          return

+     retriever = database.as_retriever()
+     logging.info(f"Retrieving relevant documents for query: {query}")
+     relevant_docs = retriever.get_relevant_documents(query)
+     logging.info(f"Number of relevant documents retrieved: {len(relevant_docs)}")
+
+     # Filter relevant_docs based on selected documents
+     filtered_docs = [doc for doc in relevant_docs if doc.metadata["source"] in selected_docs]
+     logging.info(f"Number of filtered documents: {len(filtered_docs)}")
+
+     if not filtered_docs:
+         logging.warning(f"No relevant information found in the selected documents: {selected_docs}")
+         yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
+         return

+     for doc in filtered_docs:
+         logging.info(f"Document source: {doc.metadata['source']}")
+         logging.info(f"Document content preview: {doc.page_content[:100]}...")  # Log first 100 characters of each document

+     context_str = "\n".join([doc.page_content for doc in filtered_docs])
+     logging.info(f"Total context length: {len(context_str)}")

+     if model == "@cf/meta/llama-3.1-8b-instruct":
+         logging.info("Using Cloudflare API")
+         # Use Cloudflare API with the retrieved context
+         for response in get_response_from_cloudflare(prompt="", context=context_str, query=query, num_calls=num_calls, temperature=temperature, search_type="pdf"):
+             yield response
+     else:
+         logging.info("Using Hugging Face API")
+         # Use Hugging Face API
+         prompt = f"""Using the following context from the PDF documents:
  {context_str}
  Write a detailed and complete response that answers the following user question: '{query}'"""
+
+         client = InferenceClient(model, token=huggingface_token)
+
+         response = ""
+         for i in range(num_calls):
+             logging.info(f"API call {i+1}/{num_calls}")
+             for message in client.chat_completion(
+                 messages=[{"role": "user", "content": prompt}],
+                 max_tokens=10000,
+                 temperature=temperature,
+                 stream=True,
+             ):
+                 if message.choices and message.choices[0].delta and message.choices[0].delta.content:
+                     chunk = message.choices[0].delta.content
+                     response += chunk
+                     yield response  # Yield partial response
+
+         logging.info("Finished generating response")

  def vote(data: gr.LikeData):
      if data.liked:
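
Finally, a minimal sketch of how the streaming generator above might be consumed outside Gradio; the model id, query, and document sources are placeholders rather than values taken from this commit:

    # Hypothetical driver for the updated get_response_from_pdf (not part of app.py).
    query = "Summarize the key findings."  # placeholder query
    selected = ["report.pdf"]              # placeholder document source names

    final_answer = ""
    for partial in get_response_from_pdf(query,
                                         model="mistralai/Mistral-7B-Instruct-v0.3",  # placeholder HF model id
                                         selected_docs=selected,
                                         num_calls=1,
                                         temperature=0.2):
        final_answer = partial             # each yield is the accumulated response so far
    print(final_answer)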