jessica45 commited on
Commit
8953dfc
·
verified ·
1 Parent(s): caab355

initial commit

Browse files
Files changed (6) hide show
  1. app.py +397 -0
  2. chroma_db_utils.py +249 -0
  3. gemini_embedding.py +19 -0
  4. pdf_utils.py +141 -0
  5. query_handler.py +72 -0
  6. requirement.txt +13 -0
app.py ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import streamlit as st
2
+ # from pdf_utils import extract_text_from_file, split_text
3
+ # from chroma_db_utils import create_chroma_db, load_chroma_collection
4
+ # from query_handler import handle_query
5
+ # import os
6
+ # import re
7
+ # import tempfile
8
+
9
+ # def generate_collection_name(file_path=None):
10
+ # """Generate a valid collection name from a file path."""
11
+ # base_name = os.path.basename(file_path) if file_path else "collection"
12
+ # # Remove file extension
13
+ # base_name = re.sub(r'\..*$', '', base_name)
14
+ # # Replace invalid characters and ensure it starts with a letter
15
+ # base_name = re.sub(r'\W+', '_', base_name)
16
+ # base_name = re.sub(r'^[^a-zA-Z]+', '', base_name)
17
+ # return base_name
18
+
19
+ # def process_uploaded_file(uploaded_file, chroma_db_path):
20
+ # """Process the uploaded file and create/load ChromaDB collection."""
21
+ # # Create a temporary file to store the uploaded content
22
+ # with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
23
+ # tmp_file.write(uploaded_file.getvalue())
24
+ # file_path = tmp_file.name
25
+
26
+ # try:
27
+ # # Generate collection name from original filename
28
+ # collection_name = generate_collection_name(uploaded_file.name)
29
+
30
+ # # Extract and process text
31
+ # file_text = extract_text_from_file(file_path)
32
+ # if file_text is None:
33
+ # return None, "Failed to extract text from the file."
34
+
35
+ # chunked_text = split_text(file_text)
36
+
37
+ # # Try to load existing collection or create new one
38
+ # try:
39
+ # db = load_chroma_collection(collection_name, chroma_db_path)
40
+ # st.success("Loaded existing ChromaDB collection.")
41
+ # except Exception:
42
+ # db = create_chroma_db(chunked_text, collection_name, chroma_db_path)
43
+ # st.success("Created new ChromaDB collection.")
44
+
45
+ # return db, None
46
+
47
+ # except Exception as e:
48
+ # return None, f"Error processing file: {str(e)}"
49
+ # finally:
50
+ # # Clean up temporary file
51
+ # os.unlink(file_path)
52
+
53
+ # def main():
54
+ # st.title("File Question Answering System")
55
+
56
+ # # Sidebar for configuration
57
+ # st.sidebar.header("Configuration")
58
+ # chroma_db_path = st.sidebar.text_input(
59
+ # "ChromaDB Path",
60
+ # value="./chroma_db",
61
+ # help="Directory where ChromaDB collections will be stored"
62
+ # )
63
+
64
+ # # Main content
65
+ # st.write("Upload a file and ask questions about its content!")
66
+
67
+ # # File uploader
68
+ # uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])
69
+
70
+ # # Session state initialization
71
+ # if 'db' not in st.session_state:
72
+ # st.session_state.db = None
73
+
74
+ # if uploaded_file is not None:
75
+ # # Process file if not already processed
76
+ # if st.session_state.db is None:
77
+ # with st.spinner("Processing PDF file..."):
78
+ # db, error = process_uploaded_file(uploaded_file, chroma_db_path)
79
+ # if error:
80
+ # st.error(error)
81
+ # else:
82
+ # st.session_state.db = db
83
+ # st.success("File processed successfully!")
84
+
85
+ # # Question answering interface
86
+ # st.subheader("Ask a Question")
87
+ # question = st.text_input("Enter your question:")
88
+
89
+ # if question:
90
+ # if st.session_state.db is not None:
91
+ # with st.spinner("Finding answer..."):
92
+ # answer = handle_query(question, st.session_state.db)
93
+ # st.subheader("Answer:")
94
+ # st.write(answer)
95
+ # else:
96
+ # st.error("Please wait for the file to be processed or try uploading again.")
97
+
98
+ # # Clear database button
99
+ # if st.button("Clear Database"):
100
+ # st.session_state.db = None
101
+ # st.success("Database cleared. You can upload a new file.")
102
+
103
+ # if __name__ == "__main__":
104
+ # main()
105
+ import streamlit as st
106
+ import os
107
+ from typing import List
108
+ import time
109
+ from pdf_utils import extract_text_from_file, split_text
110
+ from chroma_db_utils import create_chroma_db
111
+ from query_handler import handle_query
112
+
113
def initialize_session_state():
    """Ensure the chat history, vector-DB handle, and chunk cache exist in session state."""
    defaults = {"messages": [], "db": None, "chunks": []}
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value
121
+
122
def process_uploaded_file(uploaded_file) -> List[str]:
    """Persist the uploaded file to a real temporary file, extract its text,
    and return the text split into chunks (empty list on failure).

    Fix: the original wrote the upload into the current working directory
    under the client-supplied filename — despite the comment claiming a
    temporary file — risking collisions with project files. A tempfile with
    the original suffix (needed for extension-based dispatch inside
    extract_text_from_file) is used instead.
    """
    import tempfile

    # Keep the extension so extract_text_from_file dispatches correctly.
    suffix = os.path.splitext(uploaded_file.name)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(uploaded_file.getbuffer())
        tmp_path = tmp.name

    try:
        extracted_text = extract_text_from_file(tmp_path)
        if extracted_text:
            return split_text(extracted_text)
        st.error("No text could be extracted from the file.")
        return []
    finally:
        # Always remove the temporary copy, even when extraction fails.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
142
+
143
def main():
    """Streamlit entry point: sidebar document upload plus a chat Q&A interface."""
    st.title("📚 Document Q&A System")

    initialize_session_state()

    # --- Sidebar: document upload -------------------------------------
    with st.sidebar:
        st.header("Document Upload")
        uploaded_file = st.file_uploader(
            "Upload your document",
            type=['pdf', 'docx', 'txt'],
            help="Supported formats: PDF, DOCX, TXT"
        )

        if uploaded_file:
            with st.spinner("Processing document..."):
                chunks = process_uploaded_file(uploaded_file)

                if chunks:
                    # Rebuild the vector DB from the freshly chunked document.
                    st.session_state.chunks = chunks
                    st.session_state.db = create_chroma_db(chunks)
                    st.success(f"Document processed! Created {len(chunks)} chunks.")

                    # Seed the chat with a one-time system greeting.
                    if not st.session_state.messages:
                        st.session_state.messages.append({
                            "role": "system",
                            "content": "I've processed your document. You can now ask questions about it!"
                        })

    # --- Chat interface ------------------------------------------------
    st.header("💬 Chat")

    # Replay the stored conversation.
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.write(message["content"])

    if prompt := st.chat_input("Ask a question about your document"):
        # Guard: a question is only answerable once a document is indexed.
        if st.session_state.db is None:
            st.error("Please upload a document first!")
            return

        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.write(prompt)

        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                try:
                    response = handle_query(prompt, st.session_state.db)
                    st.write(response)
                    st.session_state.messages.append({
                        "role": "assistant",
                        "content": response
                    })
                except Exception as e:
                    st.error(f"Error generating response: {str(e)}")

    # Clear-chat control.
    # Fix: st.experimental_rerun() was deprecated and removed in newer
    # Streamlit releases; prefer st.rerun() when available and fall back
    # to the legacy name on older versions.
    if st.sidebar.button("Clear Chat"):
        st.session_state.messages = []
        rerun = getattr(st, "rerun", None) or st.experimental_rerun
        rerun()

if __name__ == "__main__":
    main()
220
+
221
+
222
+
223
+
224
+ # import streamlit as st
225
+ # from chromadb.config import Settings
226
+ # import os
227
+ # import chromadb
228
+ # from typing import List
229
+ # import time
230
+ # import google
231
+ # import datetime
232
+ # # from chroma_db_utils import create_chroma_db, get_relevant_passage
233
+ # from query_handler import generate_answer, handle_query
234
+ # from pdf_utils import extract_text_from_file, split_text
235
+ # import logging
236
+
237
+ # # Configure logging
238
+ # logging.basicConfig(level=logging.INFO)
239
+ # logger = logging.getLogger(__name__)
240
+
241
+ # def create_chroma_db(chunks: List[str]):
242
+ # """Create and return an ephemeral ChromaDB collection."""
243
+ # try:
244
+ # # Initialize ChromaDB with ephemeral storage
245
+ # client = chromadb.EphemeralClient()
246
+
247
+ # # Create collection
248
+ # collection_name = f"temp_collection_{int(time.time())}"
249
+ # collection = client.create_collection(name=collection_name)
250
+
251
+ # # Add documents
252
+ # collection.add(
253
+ # documents=chunks,
254
+ # ids=[f"doc_{i}" for i in range(len(chunks))]
255
+ # )
256
+
257
+ # # Verify the data was added
258
+ # verify_count = collection.count()
259
+ # print(f"Verified: Added {verify_count} documents to collection {collection_name}")
260
+
261
+ # # Store both client and collection in session state
262
+ # st.session_state.chroma_client = client
263
+ # return collection
264
+
265
+ # except Exception as e:
266
+ # print(f"Error creating ChromaDB: {str(e)}")
267
+ # return None
268
+
269
+ # def get_relevant_passage(query: str, collection):
270
+ # """Get relevant passages from the collection."""
271
+ # try:
272
+ # # Use the collection directly since it's ephemeral
273
+ # results = collection.query(
274
+ # query_texts=[query],
275
+ # n_results=2
276
+ # )
277
+
278
+ # if results and 'documents' in results:
279
+ # print(f"Found {len(results['documents'])} relevant passages")
280
+ # return results['documents']
281
+ # return None
282
+
283
+ # except Exception as e:
284
+ # print(f"Error in get_relevant_passage: {str(e)}")
285
+ # return None
286
+
287
+ # def initialize_session_state():
288
+ # """Initialize Streamlit session state variables."""
289
+ # if "chat_history" not in st.session_state:
290
+ # st.session_state.chat_history = []
291
+ # if "chroma_collection" not in st.session_state:
292
+ # st.session_state.chroma_collection = None
293
+ # if "chroma_client" not in st.session_state:
294
+ # st.session_state.chroma_client = None
295
+
296
+ # def process_uploaded_file(uploaded_file) -> List[str]:
297
+ # """Process the uploaded file and return text chunks."""
298
+ # temp_file_path = f"/tmp/{uploaded_file.name}"
299
+
300
+ # try:
301
+ # with open(temp_file_path, "wb") as f:
302
+ # f.write(uploaded_file.getbuffer())
303
+
304
+ # # Extract text from the file
305
+ # extracted_text = extract_text_from_file(temp_file_path)
306
+
307
+ # if extracted_text:
308
+ # # Split text into chunks
309
+ # chunks = split_text(extracted_text)
310
+ # return chunks
311
+ # else:
312
+ # st.error("No text could be extracted from the file.")
313
+ # return []
314
+ # finally:
315
+ # if os.path.exists(temp_file_path):
316
+ # os.remove(temp_file_path)
317
+
318
+ # def chat_interface():
319
+ # st.title("Chat with Your Documents 📄💬")
320
+
321
+ # # Debug: Print current state
322
+ # print(f"Current chroma_collection state: {st.session_state.chroma_collection}")
323
+
324
+ # uploaded_files = st.file_uploader(
325
+ # "Upload your files (TXT, PDF)",
326
+ # accept_multiple_files=True,
327
+ # type=['txt', 'pdf']
328
+ # )
329
+
330
+ # if uploaded_files and st.button("Process Files"):
331
+ # with st.spinner("Processing files..."):
332
+ # all_chunks = []
333
+ # for uploaded_file in uploaded_files:
334
+ # chunks = process_uploaded_file(uploaded_file)
335
+ # print(f"Processed {len(chunks)} chunks from {uploaded_file.name}")
336
+ # if chunks:
337
+ # all_chunks.extend(chunks)
338
+
339
+ # if all_chunks:
340
+ # print(f"Creating ChromaDB with {len(all_chunks)} total chunks")
341
+ # # Create ChromaDB collection with all documents
342
+ # db = create_chroma_db(all_chunks)
343
+ # if db:
344
+ # # Verify the collection immediately after creation
345
+ # try:
346
+ # verify_count = db.count()
347
+ # print(f"Verification - Collection size: {verify_count}")
348
+ # # Try a test query
349
+ # test_query = db.query(
350
+ # query_texts=["test verification query"],
351
+ # n_results=1
352
+ # )
353
+ # print("Verification - Query test successful")
354
+
355
+ # st.session_state.chroma_collection = db
356
+ # st.success(f"Files processed successfully! {verify_count} chunks loaded.")
357
+ # except Exception as e:
358
+ # print(f"Verification failed: {str(e)}")
359
+ # st.error("Database verification failed")
360
+ # else:
361
+ # st.error("Failed to create database")
362
+
363
+ # # Query interface
364
+ # if st.session_state.chroma_collection is not None:
365
+ # print("ChromaDB collection found in session state")
366
+ # query = st.text_input("Ask a question about your documents:")
367
+ # if st.button("Send") and query:
368
+ # print(f"Processing query: {query}")
369
+ # with st.spinner("Generating response..."):
370
+ # try:
371
+ # # Verify both client and collection exist
372
+ # if st.session_state.chroma_client is None or st.session_state.chroma_collection is None:
373
+ # st.error("Please upload documents first")
374
+ # return
375
+
376
+ # collection = st.session_state.chroma_collection
377
+ # print(f"Collection name: {collection.name}")
378
+ # print(f"Collection size: {collection.count()}")
379
+
380
+ # relevant_passages = get_relevant_passage(query, collection)
381
+
382
+ # if relevant_passages:
383
+ # response = handle_query(query, relevant_passages)
384
+ # st.session_state.chat_history.append((query, response))
385
+ # else:
386
+ # st.warning("No relevant information found in the documents.")
387
+
388
+ # except Exception as e:
389
+ # print(f"Full error during query processing: {str(e)}")
390
+ # logger.exception("Detailed error trace:") # This will log the full stack trace
391
+ # st.error("Failed to process your question. Please try again.")
392
+ # else:
393
+ # print("No ChromaDB collection in session state")
394
+
395
+ # if __name__ == "__main__":
396
+ # initialize_session_state()
397
+ # chat_interface()
chroma_db_utils.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import os
2
+ # import chromadb
3
+ # import numpy as np
4
+ # from typing import List, Tuple
5
+ # from gemini_embedding import GeminiEmbeddingFunction
6
+
7
+ # def create_chroma_db(documents: List[str], dataset_name: str, base_path: str = "chroma_db"):
8
+ # """
9
+ # Creates a Chroma database using the provided documents.
10
+ # Automatically generates path and collection name based on dataset_name.
11
+ # """
12
+ # path = os.path.join(base_path, dataset_name)
13
+ # name = f"{dataset_name}_collection"
14
+
15
+ # if not os.path.exists(path):
16
+ # os.makedirs(path)
17
+
18
+ # chroma_client = chromadb.PersistentClient(path=path)
19
+ # db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())
20
+
21
+ # for i, doc in enumerate(documents):
22
+ # db.add(documents=[doc], ids=[str(i)])
23
+
24
+ # return db
25
+
26
+ # def load_chroma_collection(dataset_name: str, base_path: str = "chroma_db"):
27
+ # """
28
+ # Loads an existing Chroma collection.
29
+ # """
30
+ # path = os.path.join(base_path, dataset_name)
31
+ # name = f"{dataset_name}_collection"
32
+
33
+ # chroma_client = chromadb.PersistentClient(path=path)
34
+ # return chroma_client.get_collection(name=name, embedding_function=GeminiEmbeddingFunction())
35
+
36
+ # def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
37
+ # """
38
+ # Calculate cosine similarity between two vectors.
39
+ # Returns a value between -1 and 1, where 1 means most similar.
40
+ # """
41
+ # dot_product = np.dot(vec1, vec2)
42
+ # norm1 = np.linalg.norm(vec1)
43
+ # norm2 = np.linalg.norm(vec2)
44
+ # return dot_product / (norm1 * norm2)
45
+
46
+ # def get_relevant_passage(query: str, db, n_results: int = 5) -> List[str]:
47
+ # """
48
+ # Retrieves relevant passages using explicit cosine similarity calculation.
49
+ # """
50
+ # # Get query embedding
51
+ # query_embedding = db._embedding_function([query])[0]
52
+
53
+ # # Get all document embeddings
54
+ # all_docs = db.get(include=['embeddings', 'documents'])
55
+ # doc_embeddings = all_docs['embeddings']
56
+ # documents = all_docs['documents']
57
+
58
+ # # Calculate cosine similarity for each document
59
+ # similarities = []
60
+ # for doc_embedding in doc_embeddings:
61
+ # similarity = cosine_similarity(query_embedding, doc_embedding)
62
+ # similarities.append(similarity)
63
+
64
+ # # Sort documents by similarity
65
+ # doc_similarities = list(zip(documents, similarities))
66
+ # doc_similarities.sort(key=lambda x: x[1], reverse=True)
67
+
68
+ # # Take top n results
69
+ # top_results = doc_similarities[:n_results]
70
+
71
+ # # Print results for debugging
72
+ # print(f"Number of relevant passages retrieved: {len(top_results)}")
73
+ # for i, (doc, similarity) in enumerate(top_results):
74
+ # print(f"Passage {i+1} (Cosine Similarity: {similarity:.4f}): {doc[:100]}...")
75
+
76
+ # # Return just the documents
77
+ # return [doc for doc, _ in top_results]
78
+
79
+
80
+
81
+
82
+
83
+
84
+
85
+ # in memory
86
+
87
+
88
+ # import chromadb
89
+ # from typing import List
90
+ # from gemini_embedding import GeminiEmbeddingFunction # Ensure this is correctly implemented
91
+ # import time
92
+ # from chromadb.config import Settings
93
+
94
+ # def create_chroma_db(chunks: List[str]):
95
+ # """Create and return an in-memory ChromaDB collection."""
96
+ # try:
97
+ # # Initialize in-memory ChromaDB with current recommended configuration
98
+ # client = chromadb.Client()
99
+
100
+ # # Create collection with unique name to avoid conflicts
101
+ # collection_name = f"temp_collection_{int(time.time())}"
102
+ # collection = client.create_collection(name=collection_name)
103
+
104
+ # # Add documents with unique IDs
105
+ # collection.add(
106
+ # documents=chunks,
107
+ # ids=[f"doc_{i}" for i in range(len(chunks))]
108
+ # )
109
+
110
+ # # Verify the data was added
111
+ # verify_count = collection.count()
112
+ # print(f"Verified: Added {verify_count} documents to collection {collection_name}")
113
+
114
+ # # Test query to ensure collection is working
115
+ # test_results = collection.query(
116
+ # query_texts=["test"],
117
+ # n_results=1
118
+ # )
119
+ # print("Verified: Collection is queryable")
120
+
121
+ # return collection
122
+
123
+ # except Exception as e:
124
+ # print(f"Error creating ChromaDB: {str(e)}")
125
+ # return None
126
+
127
+ # def get_relevant_passage(query: str, db, n_results: int = 5) -> List[str]:
128
+ # """
129
+ # Retrieves relevant passages using ChromaDB's similarity search.
130
+ # """
131
+ # try:
132
+ # if db is None:
133
+ # print("Database not initialized")
134
+ # return []
135
+
136
+ # # Verify collection has documents
137
+ # count = db.count()
138
+ # if count == 0:
139
+ # print("Collection is empty")
140
+ # return []
141
+
142
+ # # Query the database
143
+ # results = db.query(
144
+ # query_texts=[query],
145
+ # n_results=min(n_results, count) # Ensure we don't request more than we have
146
+ # )
147
+
148
+ # # Ensure results exist
149
+ # if not results["documents"]:
150
+ # print("No relevant passages found.")
151
+ # return []
152
+
153
+ # documents = results["documents"][0] # First result batch
154
+ # distances = results["distances"][0] # Corresponding distances
155
+
156
+ # # Debug output
157
+ # print(f"Number of relevant passages retrieved: {len(documents)}")
158
+ # for i, (doc, distance) in enumerate(zip(documents, distances)):
159
+ # similarity = 1 - distance # Convert distance to similarity
160
+ # print(f"Passage {i+1} (Similarity: {similarity:.4f}): {doc[:100]}...")
161
+
162
+ # return documents
163
+ # except Exception as e:
164
+ # print(f"Error in get_relevant_passage: {str(e)}")
165
+ # return []
166
+
167
+
168
import chromadb
from chromadb.config import Settings
from typing import List
import os
from gemini_embedding import GeminiEmbeddingFunction
import datetime

# Single shared embedding-function instance, reused by every collection
# created in this module so it is only constructed once per process.
embedding_function = GeminiEmbeddingFunction()
175
+
176
def create_chroma_db(documents: List[str]):
    """
    Create (or reset) the persistent Chroma collection holding *documents*.

    The collection lives under ./chroma_db and is always named
    "document_collection"; any documents left over from a previous run are
    removed before the new ones are inserted.

    Args:
        documents: text chunks to embed and store.

    Returns:
        The populated chromadb collection.
    """
    # Create a persistent directory for ChromaDB.
    persist_directory = "chroma_db"
    os.makedirs(persist_directory, exist_ok=True)

    # Initialize the client with persistence.
    chroma_client = chromadb.PersistentClient(
        path=persist_directory,
    )

    # Fix: get_or_create_collection replaces the try/bare-`except:` probe,
    # which silently swallowed every error (including KeyboardInterrupt).
    db = chroma_client.get_or_create_collection(
        name="document_collection",
        embedding_function=embedding_function,
    )

    # Clear any previously stored documents.
    # Fix: chromadb rejects an empty ids list, so only delete when the
    # collection actually holds something.
    existing_ids = db.get()["ids"]
    if existing_ids:
        db.delete(ids=existing_ids)

    # Add documents in batches to avoid memory issues with large uploads.
    batch_size = 20
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]
        db.add(
            documents=batch,
            ids=[f"doc_{j}" for j in range(i, i + len(batch))],
        )

    return db
215
+
216
def get_relevant_passage(query: str, db, n_results: int = 5) -> List[str]:
    """
    Return up to *n_results* stored passages most similar to *query*.

    Returns an empty list when the collection is empty, the query yields
    nothing, or any chromadb error occurs (logged, not raised).
    """
    start_time = datetime.datetime.now()
    print(f"{start_time}: Starting ChromaDB query for question: {query[:50]}...")  # Log query start

    try:
        # Fix: an empty collection would make n_results = min(n, 0) = 0,
        # which chromadb rejects — bail out early instead.
        count = db.count()
        if count == 0:
            print("Collection is empty.")
            return []

        results = db.query(
            query_texts=[query],
            n_results=min(n_results, count),
            include=['documents', 'distances']
        )
        end_time = datetime.datetime.now()
        print(f"{end_time}: ChromaDB query finished. Time taken: {end_time - start_time}")  # Log the time taken

        # Ensure results exist and contain at least one document.
        if not results or 'documents' not in results or not results['documents'] or not results['documents'][0]:
            print("No relevant passages found.")
            return []

        documents = results['documents'][0]  # List of retrieved documents
        distances = results['distances'][0]  # Corresponding distances

        # Debugging output.
        print(f"Number of relevant passages retrieved: {len(documents)}")
        for i, (doc, distance) in enumerate(zip(documents, distances)):
            similarity = 1 - distance  # Convert distance to similarity score
            print(f"Passage {i+1} (Similarity: {similarity:.4f}): {doc[:100]}...")

        return documents
    except Exception as e:
        print(f"Error in get_relevant_passage: {str(e)}")
        return []
249
+
gemini_embedding.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import google.generativeai as genai
from chromadb.api.types import Documents, Embeddings
from chromadb import EmbeddingFunction
from dotenv import load_dotenv

load_dotenv()
# Fix: os.environ["GEMINI_API_KEY"] raised KeyError at *import* time when the
# variable was unset, which made the friendlier runtime ValueError inside
# GeminiEmbeddingFunction unreachable. os.getenv returns None instead, so the
# check in __call__ can fire with a clear message.
gemini_api_key = os.getenv("GEMINI_API_KEY")
9
+
10
class GeminiEmbeddingFunction(EmbeddingFunction):
    """
    Chroma embedding function backed by the Gemini text-embedding API.
    """
    def __call__(self, input: Documents) -> Embeddings:
        # The key is resolved at call time so a missing key fails with a
        # clear error rather than at import.
        if not gemini_api_key:
            raise ValueError("Gemini API Key not provided. Please set GEMINI_API_KEY as an environment variable.")
        genai.configure(api_key=gemini_api_key)
        response = genai.embed_content(
            model="models/text-embedding-004",
            content=input,
            task_type="retrieval_document",
        )
        return response["embedding"]
pdf_utils.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
import pdfplumber
from typing import List, Optional
import textract
from docx import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import logging
import warnings

# Configure module-wide logging so extraction problems surface in the console.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
14
+
15
def clean_text(text: str) -> str:
    """Normalize extracted text: collapse runs of horizontal whitespace,
    drop non-printable characters, and squeeze blank-line runs down to one
    blank line.

    Fix: the original collapsed ALL whitespace (including newlines) first,
    which made the later newline-preserving character filter and the
    blank-line-collapsing pass dead code. Horizontal whitespace is now
    collapsed without touching newlines, so paragraph structure survives.
    """
    text = re.sub(r'[^\S\n]+', ' ', text)  # collapse spaces/tabs, keep newlines
    text = ''.join(char for char in text if char.isprintable() or char == '\n')  # drop non-printables
    text = re.sub(r'\n\s*\n', '\n\n', text)  # squeeze blank-line runs
    return text.strip()
21
+
22
def extract_text_from_pdf(pdf_path: str) -> Optional[str]:
    """
    Pull text out of every page of a PDF via pdfplumber.

    Pages that yield no text are skipped with a warning; returns None when
    nothing at all could be extracted or the file cannot be opened.
    """
    pages = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                try:
                    text = page.extract_text()
                except Exception as e:
                    logger.error(f"Error extracting text from page {page_num}: {e}")
                    continue
                if text:
                    pages.append(text)
                else:
                    logger.warning(f"No text extracted from page {page_num}")
    except Exception as e:
        logger.error(f"Failed to process PDF {pdf_path}: {e}")
        return None

    if not pages:
        logger.warning("No text was extracted from any page of the PDF")
        return None

    return clean_text('\n'.join(pages))
48
+
49
def extract_text_from_docx(docx_path: str) -> Optional[str]:
    """
    Read every non-empty paragraph of a DOCX file and return the cleaned
    text, or None when the file is unreadable or contains no text.
    """
    try:
        paragraphs = [p.text for p in Document(docx_path).paragraphs if p.text.strip()]
        joined = '\n'.join(paragraphs)
        return clean_text(joined) if joined else None
    except Exception as e:
        logger.error(f"Failed to process DOCX {docx_path}: {e}")
        return None
60
+
61
+ import tempfile
62
+
63
def extract_text_from_file(uploaded_file) -> Optional[str]:
    """
    Extract text from a path (str) or a file-like object.

    Dispatches on extension: PDF -> pdfplumber, DOCX -> python-docx,
    TXT -> direct read (UTF-8 with latin-1 fallback), anything else ->
    textract.

    Fixes two defects in the file-like branch: the temporary copy lost the
    original filename's extension (so PDFs/DOCX fell through to the generic
    textract branch), and the temporary file was never deleted.

    Returns the extracted text, or None on failure.
    """
    temp_path = None
    if isinstance(uploaded_file, str):  # a plain path was given
        file_path = uploaded_file
    else:  # file-like object (e.g. an upload): spill to a temp file
        # Preserve the extension (when the object exposes a name) so the
        # extension dispatch below still recognizes the file type.
        suffix = os.path.splitext(getattr(uploaded_file, 'name', ''))[1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            temp_file.write(uploaded_file.read())
            file_path = temp_path = temp_file.name

    try:
        if not os.path.exists(file_path):
            logger.error(f"File not found: {file_path}")
            return None

        _, file_extension = os.path.splitext(file_path)
        file_extension = file_extension.lower()

        try:
            if file_extension == ".pdf":
                text = extract_text_from_pdf(file_path)
            elif file_extension == ".docx":
                text = extract_text_from_docx(file_path)
            elif file_extension == ".txt":
                try:
                    with open(file_path, "r", encoding="utf-8") as file:
                        text = clean_text(file.read())
                except UnicodeDecodeError:
                    # Fall back for legacy single-byte encodings.
                    with open(file_path, "r", encoding="latin-1") as file:
                        text = clean_text(file.read())
            else:
                # Unknown extension: let textract take its best shot.
                text = clean_text(textract.process(file_path).decode("utf-8"))

            if not text:
                logger.warning(f"No text content extracted from {file_path}")
                return None

            return text

        except Exception as e:
            logger.error(f"Error extracting text from {file_path}: {e}")
            return None
    finally:
        # Clean up the temporary copy made for file-like inputs.
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
106
+
107
+
108
def split_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[str]:
    """
    Break *text* into overlapping chunks suitable for embedding.

    Returns an empty list for empty input or when the splitter fails
    (the error is logged rather than raised).
    """
    if not text:
        logger.warning("Empty text provided for splitting")
        return []

    try:
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False
        )
        chunks = splitter.split_text(text)
    except Exception as e:
        logger.error(f"Error splitting text: {e}")
        return []

    logger.info(f"Split text into {len(chunks)} chunks")
    return chunks
133
+
134
# Example usage / smoke test: pass a file path on the command line.
if __name__ == "__main__":
    import sys

    # Fix: the hard-coded personal path (a GitHub recovery-codes file) is
    # replaced by a command-line argument so the example is runnable for
    # anyone and leaks nothing.
    if len(sys.argv) > 1 and os.path.exists(sys.argv[1]):
        file_text = extract_text_from_file(sys.argv[1])
        if file_text:
            chunks = split_text(file_text)
            print(f"Successfully processed file into {len(chunks)} chunks")
    else:
        print("Usage: python pdf_utils.py <file>")
query_handler.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import google.generativeai as genai
from chroma_db_utils import get_relevant_passage
import time
import datetime
import google.api_core.exceptions

# Constants for Gemini API retry/rate-limit handling.
MAX_RETRIES = 3
RETRY_DELAY = 1  # Initial delay in seconds (doubled on each retry)
MODEL_NAME = "gemini-1.5-flash"
REQUESTS_PER_MINUTE = 3  # Free tier limit
REQUEST_INTERVAL = 60 / REQUESTS_PER_MINUTE  # Ensures we stay within the rate limit
+
15
def make_rag_prompt(query: str, relevant_passage: str) -> str:
    """
    Creates a prompt for the RAG model.
    """
    # Strip quotes and flatten newlines so the passage embeds safely into
    # the template below (lossy, but acceptable for plain reference text).
    escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
    prompt = f'''
    You are a helpful and informative bot that answers questions using the REFERENCE TEXT below.
    If the REFERENCE TEXT is irrelevant to the question, say "I cannot answer this question based on the provided information."

    QUESTION: {query}

    REFERENCE TEXT:
    {escaped}

    ANSWER:
    '''
    return prompt
32
+
33
def generate_answer(prompt: str) -> str:
    """
    Calls the Gemini API with retries and rate limiting.

    Retries up to MAX_RETRIES times with exponential backoff when the API
    reports resource exhaustion (HTTP 429).

    Raises:
        ValueError: when GEMINI_API_KEY is not set in the environment.
        Exception: when all retry attempts were rate-limited.
    """
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        raise ValueError("Gemini API Key not provided.")

    genai.configure(api_key=gemini_api_key)
    model = genai.GenerativeModel(MODEL_NAME)

    for attempt in range(MAX_RETRIES):
        start_time = datetime.datetime.now()
        print(f"{start_time}: Making Gemini API request (attempt {attempt + 1}/{MAX_RETRIES})...")
        try:
            response = model.generate_content(prompt)
            end_time = datetime.datetime.now()
            print(f"{end_time}: Gemini API request successful. Time taken: {end_time - start_time}")

            # Enforce rate limiting
            # NOTE(review): this sleeps REQUEST_INTERVAL (20s at 3 req/min)
            # on the success path BEFORE returning, delaying every answer —
            # tracking a last-request timestamp instead would avoid the wait.
            # Confirm intent before changing.
            time.sleep(REQUEST_INTERVAL)
            return response.text
        except google.api_core.exceptions.ResourceExhausted as e:
            if e.code == 429:  # Too Many Requests
                delay = RETRY_DELAY * (2 ** attempt)  # Exponential backoff
                print(f"Rate limit hit. Retrying in {delay} seconds (attempt {attempt + 1}/{MAX_RETRIES})...")
                time.sleep(delay)
            else:
                raise  # Re-raise other exceptions

    raise Exception("Max retries exceeded for Gemini API request.")
64
+
65
def handle_query(query: str, db, n_results: int = 5) -> str:
    """
    Answer *query* by retrieving the top-matching passages from *db* and
    feeding them to the Gemini model as reference text.
    """
    passages = get_relevant_passage(query, db, n_results)
    reference = " ".join(passages)
    return generate_answer(make_rag_prompt(query, relevant_passage=reference))
+ return generate_answer(prompt)
requirement.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
google-generativeai>=0.3.0
chromadb>=0.4.0
pdfplumber
python-docx>=0.8.11
textract>=1.6.5
langchain>=0.1.0
numpy>=1.21.0
python-dotenv>=0.19.0
streamlit>=1.18.0
# Removed duplicate `chromadb` entry (was listed twice with different pins).
# Removed `typing`, `warnings`, `logging`: these are Python standard-library
# modules, not pip packages — listing them breaks `pip install -r`.
+ logging>=0.5.0