Update app.py

app.py (CHANGED)
Old version of the changed sections (removed or replaced lines are prefixed with "-"; lines cut off by the diff viewer are left as extracted):

@@ -1,25 +1,26 @@

import streamlit as st
import faiss
import numpy as np
- from sentence_transformers import SentenceTransformer
import requests
import os
import torch
import pickle
- from tqdm import tqdm
- from googleapiclient.discovery import build
- from google_auth_oauthlib.flow import InstalledAppFlow
- from google.auth.transport.requests import Request
- from google.oauth2.credentials import Credentials
import base64
import re
from pyngrok import ngrok

# ===============================
# 1. Streamlit App Configuration
# ===============================
st.set_page_config(page_title="📥 Email Chat Application", layout="wide")
- st.title("

# ===============================
# 2. Gmail Authentication Configuration
@@ -42,6 +43,12 @@ if "embeddings" not in st.session_state:

if "vector_store" not in st.session_state:
    st.session_state.vector_store = None

def count_tokens(text):
    return len(text.split())
@@ -57,7 +64,9 @@ def reset_session_state():

    st.session_state.data_chunks = []
    st.session_state.embeddings = None
    st.session_state.vector_store = None
-
        if os.path.exists(filename):
            os.remove(filename)
@@ -65,6 +74,7 @@ def authenticate_gmail(credentials_file):

    creds = None
    if os.path.exists('token.json'):
        try:
            creds = Credentials.from_authorized_user_file('token.json', SCOPES)
            if creds and creds.valid:
                st.session_state.creds = creds
@@ -132,18 +142,19 @@ def extract_email_body(payload):

    return ""

def combine_email_text(email):
    parts = []
    if email.get("sender"):
-       parts.append(
    if email.get("to"):
-       parts.append(
    if email.get("date"):
-       parts.append(
    if email.get("subject"):
-       parts.append(
    if email.get("body"):
-       parts.append(
-   return "

def create_chunks_from_gmail(service, label):
    try:
@@ -152,8 +163,7 @@ def create_chunks_from_gmail(service, label):

        messages.extend(result.get('messages', []))
        while 'nextPageToken' in result:
            token = result["nextPageToken"]
-           result = service.users().messages().list(userId='me', labelIds=[label],
-                                                     maxResults=500, pageToken=token).execute()
            messages.extend(result.get('messages', []))

        data_chunks = []
@@ -175,22 +185,28 @@ def create_chunks_from_gmail(service, label):

            email_dict['date'] = header.get('value', '')
            email_dict['body'] = extract_email_body(msg_data.get('payload', {}))
            data_chunks.append(email_dict)
-           progress_bar.progress((idx + 1) / total)
-       st.session_state.data_chunks
-       st.success(f"✅ Data chunks created successfully from
-       # Save chunks locally for future use.
-       with open("data_chunks.pkl", "wb") as f:
-           pickle.dump(data_chunks, f)
    except Exception as e:
-       st.error(f"❌ Error creating chunks from Gmail: {e}")

def embed_emails(email_chunks):
    st.header("π Embedding Data and Creating Vector Store")
    with st.spinner('π Embedding data...'):
        try:
-           embed_model =
-           device = 'cuda' if torch.cuda.is_available() else 'cpu'
-           embed_model.to(device)
            combined_texts = [combine_email_text(email) for email in email_chunks]
            batch_size = 64
            embeddings = []
@@ -203,6 +219,8 @@ def embed_emails(email_chunks):

                    device=device
                )
                embeddings.append(batch_embeddings)
            embeddings = np.vstack(embeddings)
            faiss.normalize_L2(embeddings)
            st.session_state.embeddings = embeddings
@@ -211,218 +229,255 @@ def embed_emails(email_chunks):

            index.add(embeddings)
            st.session_state.vector_store = index
            st.success("✅ Data embedding and vector store created successfully!")
-           # Save embeddings and index to disk.
-           with open('embeddings.pkl', 'wb') as f:
-               pickle.dump(embeddings, f)
-           faiss.write_index(index, 'vector_store.index')
        except Exception as e:
            st.error(f"❌ Error during embedding: {e}")

-       with open('embeddings.pkl', 'wb') as f:
-           pickle.dump(st.session_state.embeddings, f)
-       faiss.write_index(st.session_state.vector_store, 'vector_store.index')
-       st.success("💾 Embeddings and vector store saved successfully!")
-   except Exception as e:
-       st.error(f"❌ Error saving embeddings and vector store: {e}")
-
-def load_embeddings_and_index():
    try:
-       st.session_state.
    except Exception as e:
-       st.error(f"❌ Error
-
-def load_chunks():
-   try:
-       with open("data_chunks.pkl", "rb") as f:
-           st.session_state.data_chunks = pickle.load(f)
-       st.success("π Email chunks loaded successfully!")
-   except Exception as e:
-       st.error(f"❌ Error loading email chunks: {e}")

# ===============================
- # 5. Handling User Queries
# ===============================
def preprocess_query(query):
    return query.lower().strip()

def handle_user_query():
-   st.header("💬 Let's
-   if
        return
            )
-   # Boost candidates if sender or "to" field contains query tokens (e.g., email addresses).
-   query_tokens = re.findall(r'\S+@\S+', user_query)
-   if query_tokens:
-       for i in range(len(candidates)):
-           candidate_email_str = (
-               (candidates[i][0].get("sender", "") + " " + candidates[i][0].get("to", "")).lower()
-           )
-           for token in query_tokens:
-               if token.lower() in candidate_email_str:
-                   candidates[i] = (candidates[i][0], max(candidates[i][1], 1.0))
-       filtered_candidates = []
-       for candidate, score in candidates:
-           candidate_text = combine_email_text(candidate).lower()
-           if any(token.lower() in candidate_text for token in query_tokens):
-               filtered_candidates.append((candidate, score))
-       if filtered_candidates:
-           candidates = filtered_candidates
-       else:
-           st.info("No candidate emails contain the query token(s) exactly. Proceeding with all candidates.")
-
-   candidates.sort(key=lambda x: x[1], reverse=True)
-   if not candidates:
-       st.subheader("π AI Response:")
-       st.write("⚠️ No documents found.")
-       return
-   if candidates[0][1] < SIMILARITY_THRESHOLD:
-       st.subheader("π AI Response:")
-       st.write("⚠️ No document strongly matches your query. Try refining your query.")
-       return
-
-   # Re-rank candidates using the cross-encoder.
-   cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2")
-   candidate_pairs = [(user_query, combine_email_text(candidate[0])) for candidate in candidates]
-   rerank_scores = cross_encoder.predict(candidate_pairs)
-   reranked_candidates = [(candidates[i][0], rerank_scores[i]) for i in range(len(candidates))]
-   reranked_candidates.sort(key=lambda x: x[1], reverse=True)
-   retrieved_emails = [email for email, score in reranked_candidates]
-   retrieved_scores = [score for email, score in reranked_candidates]
-   average_similarity = np.mean(retrieved_scores)
-
-   # Build the final context string.
-   context_str = "\n\n".join([combine_email_text(email) for email in retrieved_emails])
-   MAX_CONTEXT_TOKENS = 500
-   context_tokens = context_str.split()
-   if len(context_tokens) > MAX_CONTEXT_TOKENS:
-       context_str = " ".join(context_tokens[:MAX_CONTEXT_TOKENS])
-
-   payload = {
-       "model": "llama3-8b-8192",  # Adjust as needed.
-       "messages": [
-           {"role": "system", "content": f"Use the following context:\n{context_str}"},
-           {"role": "user", "content": user_query}
-       ]
-   }
-   api_key = "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"  # Replace with your API key.
-   url = "https://api.groq.com/openai/v1/chat/completions"
-   headers = {
-       "Authorization": f"Bearer {api_key}",
-       "Content-Type": "application/json"
-   }
-   response = requests.post(url, headers=headers, json=payload)
-   if response.status_code == 200:
-       response_json = response.json()
-       generated_text = response_json["choices"][0]["message"]["content"]
-       st.subheader("π AI Response:")
-       st.write(generated_text)
-       st.write(f"Average Re-Ranked Score: {average_similarity:.4f}")
-   else:
-       st.error(f"❌ Error from LLM API: {response.status_code} - {response.text}")
-   except Exception as e:
-       st.error(f"❌ An error occurred during processing: {e}")

# ===============================
# 6. Main Application Logic
# ===============================
def main():
    st.sidebar.header("π Gmail Authentication")
-   credentials_file = st.sidebar.file_uploader("π Upload
-   if credentials_file and st.sidebar.button("π Authenticate"):
-       reset_session_state()
-       with open("credentials.json", "wb") as f:
-           f.write(credentials_file.getbuffer())
-       authenticate_gmail("credentials.json")
-
-   # Option to load previously saved email chunks.
-   chunks_file = st.sidebar.file_uploader("π Upload saved email chunks (data_chunks.pkl)", type=["pkl"])
-   if chunks_file:
-       try:
-           st.session_state.data_chunks = pickle.load(chunks_file)
-           st.success("π Email chunks loaded successfully from upload!")
-       except Exception as e:
-           st.error(f"❌ Error loading uploaded email chunks: {e}")

-   if st.session_state.authenticated:
        st.sidebar.success("✅ You are authenticated!")
-       st.
-       if st.
            service = build('gmail', 'v1', credentials=st.session_state.creds)
            if st.session_state.data_chunks:
                embed_emails(st.session_state.data_chunks)
-   if
-       with st.expander("💾
-           st.success("💾 Email chunks saved to disk!")
-       except Exception as e:
-           st.error(f"❌ Error saving email chunks: {e}")
-   if st.button("💾 Save Embeddings & Vector Store"):
-       save_embeddings_and_index()
-   if (st.session_state.vector_store is not None and
-       st.session_state.embeddings is not None and
-       st.session_state.data_chunks is not None):
-       handle_user_query()
-   else:
-       st.warning("⚠️ You are not authenticated yet. Please authenticate to access your Gmail data.")

if __name__ == "__main__":
    main()
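Two asides on code removed above. First, the body of load_embeddings_and_index is cut off in this extract; judging from the save path it pairs with (embeddings.pkl written with pickle and vector_store.index written with faiss.write_index), the removed loader plausibly looked something like the sketch below, with the exact messages and attribute names being guesses:

def load_embeddings_and_index():
    try:
        # Assumed counterpart of the removed save logic above.
        with open('embeddings.pkl', 'rb') as f:
            st.session_state.embeddings = pickle.load(f)
        st.session_state.vector_store = faiss.read_index('vector_store.index')
        st.success("Embeddings and vector store loaded successfully!")
    except Exception as e:
        st.error(f"Error loading embeddings and vector store: {e}")

Second, the removed handle_user_query re-ranked the FAISS candidates with a sentence-transformers cross-encoder before calling the LLM. A minimal standalone sketch of that re-ranking step, assuming sentence-transformers is installed and reusing the model name from the removed code:

from sentence_transformers import CrossEncoder

def rerank(query, emails):
    # Score (query, email_text) pairs; higher scores indicate a better match.
    cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2")
    pairs = [(query, combine_email_text(email)) for email in emails]
    scores = cross_encoder.predict(pairs)
    return sorted(zip(emails, scores), key=lambda x: x[1], reverse=True)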
New version of the changed sections (added or changed lines are prefixed with "+"; "..." marks unchanged code that is not shown):

import streamlit as st
import faiss
import numpy as np
+ from sentence_transformers import SentenceTransformer
import requests
import os
import torch
import pickle
import base64
import re
from pyngrok import ngrok
+ from googleapiclient.discovery import build
+ from google_auth_oauthlib.flow import InstalledAppFlow
+ from google.auth.transport.requests import Request
+ import subprocess
+ import time
+ import sys

# ===============================
# 1. Streamlit App Configuration
# ===============================
st.set_page_config(page_title="📥 Email Chat Application", layout="wide")
+ st.title("💬 Turn Emails into Conversations—Effortless Chat with Your Inbox! 📩")

# ===============================
# 2. Gmail Authentication Configuration

...

if "vector_store" not in st.session_state:
    st.session_state.vector_store = None

+ # For storing candidate context details.
+ if "candidate_context" not in st.session_state:
+     st.session_state.candidate_context = None
+ if "raw_candidates" not in st.session_state:
+     st.session_state.raw_candidates = None
+
def count_tokens(text):
    return len(text.split())

...

    st.session_state.data_chunks = []
    st.session_state.embeddings = None
    st.session_state.vector_store = None
+   st.session_state.candidate_context = None
+   st.session_state.raw_candidates = None
+   for filename in ["token.json", "data_chunks.pkl", "embeddings.pkl", "vector_store.index", "vector_database.pkl"]:
        if os.path.exists(filename):
            os.remove(filename)

...

    creds = None
    if os.path.exists('token.json'):
        try:
+           from google.oauth2.credentials import Credentials
            creds = Credentials.from_authorized_user_file('token.json', SCOPES)
            if creds and creds.valid:
                st.session_state.creds = creds

...

    return ""

def combine_email_text(email):
+   # Build the complete email text by joining parts with HTML line breaks.
    parts = []
    if email.get("sender"):
+       parts.append("From: " + email["sender"])
    if email.get("to"):
+       parts.append("To: " + email["to"])
    if email.get("date"):
+       parts.append("Date: " + email["date"])
    if email.get("subject"):
+       parts.append("Subject: " + email["subject"])
    if email.get("body"):
+       parts.append("Body: " + email["body"])
+   return "<br>".join(parts)

def create_chunks_from_gmail(service, label):
    try:

...

        messages.extend(result.get('messages', []))
        while 'nextPageToken' in result:
            token = result["nextPageToken"]
+           result = service.users().messages().list(userId='me', labelIds=[label], maxResults=500, pageToken=token).execute()
            messages.extend(result.get('messages', []))

        data_chunks = []

...
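The paging loop above is the standard Gmail API pattern: messages.list returns at most one page of ids (capped at 500 here) plus a nextPageToken, which is passed back until it disappears. As an isolated sketch of the same pattern, assuming an already-authenticated service object:

def list_message_ids(service, label):
    # Collect message ids for one label, following nextPageToken until exhausted.
    result = service.users().messages().list(userId='me', labelIds=[label], maxResults=500).execute()
    ids = [m['id'] for m in result.get('messages', [])]
    while 'nextPageToken' in result:
        result = service.users().messages().list(
            userId='me', labelIds=[label], maxResults=500,
            pageToken=result['nextPageToken']
        ).execute()
        ids.extend(m['id'] for m in result.get('messages', []))
    return ids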
            email_dict['date'] = header.get('value', '')
            email_dict['body'] = extract_email_body(msg_data.get('payload', {}))
            data_chunks.append(email_dict)
+           progress_bar.progress(min((idx + 1) / total, 1.0))
+       st.session_state.data_chunks.extend(data_chunks)
+       st.success(f"✅ Data chunks created successfully from {label}! Total emails processed for this label: {len(data_chunks)}")
    except Exception as e:
+       st.error(f"❌ Error creating chunks from Gmail for label {label}: {e}")
+
+ # -------------------------------
+ # Cached model loaders for efficiency
+ # -------------------------------
+ @st.cache_resource
+ def get_embed_model():
+     model = SentenceTransformer("all-MiniLM-L6-v2")
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     model.to(device)
+     return model, device

def embed_emails(email_chunks):
    st.header("π Embedding Data and Creating Vector Store")
+   progress_bar = st.progress(0)
    with st.spinner('π Embedding data...'):
        try:
+           embed_model, device = get_embed_model()
            combined_texts = [combine_email_text(email) for email in email_chunks]
            batch_size = 64
            embeddings = []

...

                    device=device
                )
                embeddings.append(batch_embeddings)
+               progress_value = min((i + batch_size) / len(combined_texts), 1.0)
+               progress_bar.progress(progress_value)
            embeddings = np.vstack(embeddings)
            faiss.normalize_L2(embeddings)
            st.session_state.embeddings = embeddings

...

            index.add(embeddings)
            st.session_state.vector_store = index
            st.success("✅ Data embedding and vector store created successfully!")
        except Exception as e:
            st.error(f"❌ Error during embedding: {e}")

+ # New function to save the entire vector database as a single pickle file.
+ def save_vector_database():
    try:
+       vector_db = {
+           "vector_store": st.session_state.vector_store,
+           "embeddings": st.session_state.embeddings,
+           "data_chunks": st.session_state.data_chunks
+       }
+       db_data = pickle.dumps(vector_db)
+       st.download_button(
+           label="Download Vector Database",
+           data=db_data,
+           file_name="vector_database.pkl",
+           mime="application/octet-stream"
+       )
    except Exception as e:
+       st.error(f"❌ Error saving vector database: {e}")
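One caveat on save_vector_database above: a FAISS index is a thin wrapper around a C++ object, and pickling it directly may fail or may not restore cleanly depending on the installed FAISS build. If that turns out to be a problem, a common workaround is to serialize the index into a byte buffer first; a sketch under the same session-state layout:

# Serialize the index into a plain numpy buffer that pickles safely.
index_bytes = faiss.serialize_index(st.session_state.vector_store)
vector_db = {
    "vector_store": index_bytes,
    "embeddings": st.session_state.embeddings,
    "data_chunks": st.session_state.data_chunks,
}
db_data = pickle.dumps(vector_db)

# On load, rebuild the index from the buffer.
restored = pickle.loads(db_data)
index = faiss.deserialize_index(restored["vector_store"])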
# ===============================
+ # 5. Handling User Queries (User-Controlled Threshold)
# ===============================
def preprocess_query(query):
    return query.lower().strip()

+ def process_candidate_emails(query, similarity_threshold):
+     """
+     Process the query by computing its embedding, searching the vector store,
+     filtering candidates based on a similarity threshold, and building a context string.
+     """
+     TOP_K = 20  # Increased to allow for threshold filtering
+
+     # Reset candidate context for each query
+     st.session_state.candidate_context = None
+     st.session_state.raw_candidates = None
+
+     if st.session_state.vector_store is None:
+         st.error("❌ Please process your email data or load a saved vector database first.")
+         return
+
+     with st.spinner('π Processing your query...'):
+         try:
+             embed_model, device = get_embed_model()
+             processed_query = preprocess_query(query)
+             query_embedding = embed_model.encode(
+                 [processed_query],
+                 convert_to_numpy=True,
+                 show_progress_bar=False,
+                 device=device
+             )
+             faiss.normalize_L2(query_embedding)
+
+             # Debug: Verify the type of vector_store
+             st.write(f"Vector Store Type: {type(st.session_state.vector_store)}")
+
+             # Perform search
+             distances, indices = st.session_state.vector_store.search(query_embedding, TOP_K)
+             candidates = []
+             for idx, sim in zip(indices[0], distances[0]):
+                 # Include candidate only if similarity meets the threshold
+                 if sim >= similarity_threshold:
+                     candidates.append((st.session_state.data_chunks[idx], sim))
+             if not candidates:
+                 st.write("⚠️ No matching embeddings found for your query with the selected threshold.")
+                 return
+
+             # Build the context string by concatenating all matching email texts using HTML breaks.
+             context_str = ""
+             for candidate, sim in candidates:
+                 context_str += combine_email_text(candidate) + "<br><br>"
+
+             # Optionally limit context size.
+             MAX_CONTEXT_TOKENS = 500
+             context_tokens = context_str.split()
+             if len(context_tokens) > MAX_CONTEXT_TOKENS:
+                 context_str = " ".join(context_tokens[:MAX_CONTEXT_TOKENS])
+
+             st.session_state.candidate_context = context_str
+             st.session_state.raw_candidates = candidates
+             st.success("✅ Candidates retrieved and context built!")
+         except Exception as e:
+             st.error(f"❌ An error occurred during processing: {e}")
+
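A note on the scores used above: embed_emails L2-normalizes the embeddings and stores them in an IndexFlatIP, so the distances returned by vector_store.search are inner products of unit vectors, i.e. cosine similarities roughly in [-1, 1]. That is what makes a fixed cutoff such as the default 0.3 threshold meaningful. A small self-contained illustration of the same scoring (names and sizes here are only for demonstration):

import numpy as np
import faiss

vecs = np.random.rand(10, 384).astype("float32")   # 384 = all-MiniLM-L6-v2 output size
faiss.normalize_L2(vecs)
index = faiss.IndexFlatIP(vecs.shape[1])
index.add(vecs)

query = vecs[:1].copy()
scores, ids = index.search(query, 5)
# scores[0][0] is ~1.0 because the query matches itself; hits scoring
# below the chosen threshold (0.3 in the app) would be filtered out.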
|
+ def call_llm_api(query):
+     """
+     Send the user's query along with the concatenated matching email texts (context)
+     to the LLM API and display the AI response.
+     """
+     if not st.session_state.candidate_context:
+         st.error("❌ No candidate context available. Please try again.")
+         return
+
+     # Retrieve the API key from the environment variable 'GroqAPI'
+     api_key = os.getenv("GroqAPI")
+     if not api_key:
+         st.error("❌ API key not found. Please ensure 'GroqAPI' is set in Hugging Face Secrets.")
+         return
+
+     payload = {
+         "model": "llama-3.3-70b-versatile",  # Adjust model as needed.
+         "messages": [
+             {"role": "system", "content": f"Use the following context:<br>{st.session_state.candidate_context}"},
+             {"role": "user", "content": query}
+         ]
+     }
+     url = "https://api.groq.com/openai/v1/chat/completions"  # Verify this endpoint
+
+     headers = {
+         "Authorization": f"Bearer {api_key}",
+         "Content-Type": "application/json"
+     }
+
+     with st.spinner("π Fetching AI response..."):
+         try:
+             response = requests.post(url, headers=headers, json=payload)
+             response.raise_for_status()  # Raises stored HTTPError, if one occurred.
+             response_json = response.json()
+             generated_text = response_json["choices"][0]["message"]["content"]
+             st.subheader("π AI Response:")
+             st.write(generated_text)
+         except requests.exceptions.HTTPError as http_err:
+             try:
+                 error_info = response.json().get("error", {})
+                 error_message = error_info.get("message", "An unknown error occurred.")
+                 st.error(f"❌ HTTP error occurred: {error_message}")
+             except ValueError:
+                 st.error(f"❌ HTTP error occurred: {response.status_code} - {response.text}")
+         except Exception as err:
+             st.error(f"❌ An unexpected error occurred: {err}")
+
def handle_user_query():
+   st.header("💬 Let's Chat with Your Emails")
+
+   # Checkbox to show/hide the threshold slider
+   show_threshold = st.checkbox("Adjust Similarity Threshold")
+
+   # Slider, shown only if 'show_threshold' is True
+   if show_threshold:
+       similarity_threshold = st.slider(
+           "Select Similarity Threshold",
+           min_value=0.0,
+           max_value=1.0,
+           value=0.3,
+           step=0.05,
+           help="Adjust the similarity threshold to control the relevance of retrieved emails. Higher values yield more relevant results.",
+           key='similarity_threshold'
+       )
+   else:
+       # Set a default threshold if the slider is not shown
+       if 'similarity_threshold' not in st.session_state:
+           st.session_state.similarity_threshold = 0.3
+       similarity_threshold = st.session_state.similarity_threshold
+
+   # Callback function to process the query
+   def query_callback():
+       query = st.session_state.query_input
+       if not query.strip():
            return
+       process_candidate_emails(query, similarity_threshold)
+       if st.session_state.raw_candidates:
+           st.subheader("π Matching Email Chunks:")
+           for candidate, sim in st.session_state.raw_candidates:
+               # Get a snippet (first 150 characters) of the body instead of full body content.
+               body = candidate.get('body', 'No Content')
+               snippet = (body[:150] + "...") if len(body) > 150 else body
+               st.markdown(
+                   f"**From:** {candidate.get('sender','Unknown')} <br>"
+                   f"**To:** {candidate.get('to','Unknown')} <br>"
+                   f"**Date:** {candidate.get('date','Unknown')} <br>"
+                   f"**Subject:** {candidate.get('subject','No Subject')} <br>"
+                   f"**Body Snippet:** {snippet} <br>"
+                   f"**Similarity:** {sim:.4f}",
+                   unsafe_allow_html=True
                )
+       # Then send the query along with the context to the LLM API.
+       call_llm_api(query)
+
+   # Text input with callback on change (when Enter is pressed)
+   st.text_input("Enter your query:", key="query_input", on_change=query_callback)
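handle_user_query above leans on Streamlit's callback pattern: the text input writes its value into st.session_state under the "query_input" key, and on_change fires query_callback when the user presses Enter, so the query is processed before the script reruns. A stripped-down version of the same pattern, with a placeholder in place of the real processing:

import streamlit as st

def on_query_change():
    # Inside the callback, read the widget value back out of session state.
    query = st.session_state.query_input
    if query.strip():
        st.session_state.last_query = query  # stand-in for real processing

st.text_input("Enter your query:", key="query_input", on_change=on_query_change)
st.write(st.session_state.get("last_query", ""))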
# ===============================
# 6. Main Application Logic
# ===============================
def main():
    st.sidebar.header("π Gmail Authentication")
+   credentials_file = st.sidebar.file_uploader("π Upload credentials.json", type=["json"])

+   data_management_option = st.sidebar.selectbox(
+       "Choose an option",
+       ["Upload Pre-existing Data", "Authenticate and Create New Data"],
+       index=1  # Default to "Authenticate and Create New Data"
+   )
+
+   if data_management_option == "Upload Pre-existing Data":
+       uploaded_db = st.sidebar.file_uploader("π Upload vector database (vector_database.pkl)", type=["pkl"])
+       if uploaded_db:
+           # Check file size; if larger than 200MB, show a warning and then continue.
+           file_size_mb = uploaded_db.size / (1024 * 1024)
+           if file_size_mb > 200:
+               st.warning("The uploaded file is larger than 200MB. It may take longer to load, but processing will continue.")
+           try:
+               vector_db = pickle.load(uploaded_db)
+               st.session_state.vector_store = vector_db.get("vector_store")
+               st.session_state.embeddings = vector_db.get("embeddings")
+               st.session_state.data_chunks = vector_db.get("data_chunks")
+               st.success("π Vector database loaded successfully from upload!")
+           except Exception as e:
+               st.error(f"❌ Error loading vector database: {e}")
+   elif data_management_option == "Authenticate and Create New Data":
+       if credentials_file and st.sidebar.button("π Authenticate"):
+           reset_session_state()
+           with open("credentials.json", "wb") as f:
+               f.write(credentials_file.getbuffer())
+           authenticate_gmail("credentials.json")

+       if st.session_state.auth_url:
+           st.sidebar.markdown("### π **Authorization URL:**")
+           st.sidebar.markdown(f"[Authorize]({st.session_state.auth_url})")
+           st.sidebar.text_input("π Enter the authorization code:", key="auth_code")
+           if st.sidebar.button("✅ Submit Authentication Code"):
+               submit_auth_code()

+   if data_management_option == "Authenticate and Create New Data" and st.session_state.authenticated:
        st.sidebar.success("✅ You are authenticated!")
+       st.header("π Data Management")
+       # Multi-select widget for folders (labels)
+       folders = st.multiselect("Select Labels (Folders) to Process Emails From:",
+                                ["INBOX", "SENT", "DRAFTS", "TRASH", "SPAM"], default=["INBOX"])
+       if st.button("📥 Create Chunks and Embed Data"):
            service = build('gmail', 'v1', credentials=st.session_state.creds)
+           all_chunks = []
+           # Process each selected folder
+           for folder in folders:
+               # Clear temporary data_chunks so that each folder's data is separate
+               st.session_state.data_chunks = []
+               create_chunks_from_gmail(service, folder)
+               if st.session_state.data_chunks:
+                   all_chunks.extend(st.session_state.data_chunks)
+           st.session_state.data_chunks = all_chunks
            if st.session_state.data_chunks:
                embed_emails(st.session_state.data_chunks)
+       if st.session_state.vector_store is not None:
+           with st.expander("💾 Download Data", expanded=True):
+               save_vector_database()
+
+   if st.session_state.vector_store is not None:
+       handle_user_query()

if __name__ == "__main__":
    main()
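For completeness, the vector_database.pkl produced by the download button is exactly what the "Upload Pre-existing Data" path expects back. A small script along these lines could inspect such a file outside the app, assuming the pickle really contains the dict written by save_vector_database and that the FAISS index survived pickling:

import pickle

with open("vector_database.pkl", "rb") as f:
    vector_db = pickle.load(f)

index = vector_db.get("vector_store")      # FAISS index, if it unpickled cleanly
embeddings = vector_db.get("embeddings")   # numpy array of normalized embeddings
chunks = vector_db.get("data_chunks")      # list of email dicts

print(type(index), getattr(embeddings, "shape", None), len(chunks or []))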