Update app.py

app.py CHANGED
@@ -8,8 +8,6 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModel
 import google.generativeai as genai
 import logging
-from PyPDF2 import PdfReader
-import io
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -66,8 +64,8 @@ except Exception as e:
     logger.error(f"Model loading failed: {e}")
     raise
 
-# Generate SciBERT embeddings
-def generate_embeddings_sci_bert(texts, batch_size=
+# Generate SciBERT embeddings
+def generate_embeddings_sci_bert(texts, batch_size=32):
     try:
         all_embeddings = []
         for i in range(0, len(texts), batch_size):
@@ -96,7 +94,7 @@ except Exception as e:
     logger.error(f"FAISS index creation failed: {e}")
     raise
 
-# Hybrid search function (
+# Hybrid search function (return indices instead of truncated strings)
 def get_relevant_papers(query):
     if not query.strip():
         return [], "Please enter a search query."
@@ -108,237 +106,147 @@ def get_relevant_papers(query):
         bm25_top_indices = np.argsort(bm25_scores)[::-1][:5]
         combined_indices = list(set(indices[0]) | set(bm25_top_indices))
         ranked_results = sorted(combined_indices, key=lambda idx: -bm25_scores[idx])
+        # Return formatted strings for dropdown and indices for full data
         papers = [f"{i+1}. {df.iloc[idx]['title']} - Abstract: {df.iloc[idx]['abstract'][:200]}..." for i, idx in enumerate(ranked_results[:5])]
         return papers, ranked_results[:5], "Search completed."
     except Exception as e:
         logger.error(f"Search failed: {e}")
         return [], [], "Search failed. Please try again."
 
-#
-def
-        text = ""
-        for page in pdf_reader.pages:
-            text += page.extract_text() or ""
-        cleaned_text = clean_text(text)
-        chunks = [cleaned_text[i:i+1000] for i in range(0, len(cleaned_text), 1000)]
-        embeddings = generate_embeddings_sci_bert(chunks)
-        faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
-        faiss_index.add(embeddings.astype(np.float32))
-        tokenized_chunks = [chunk.split() for chunk in chunks]
-        bm25_rag = BM25Okapi(tokenized_chunks)
-        return {"chunks": chunks, "embeddings": embeddings, "faiss_index": faiss_index, "bm25": bm25_rag}, "Document processed successfully"
-    except Exception as e:
-        logger.error(f"PDF processing failed: {e}")
-        return None, "Failed to process document"
-
-# Hybrid search for RAG
-def get_relevant_chunks(query, uploaded_doc):
-    if not query.strip():
-        return [], "Please enter a question."
-    try:
-        query_embedding = generate_embeddings_sci_bert([query])
-        distances, indices = uploaded_doc["faiss_index"].search(query_embedding.astype(np.float32), 3)
-        bm25_scores = uploaded_doc["bm25"].get_scores(query.split())
-        combined_indices = list(set(indices[0]) | set(np.argsort(bm25_scores)[::-1][:3]))
-        ranked_results = sorted(combined_indices, key=lambda idx: -bm25_scores[idx])
-        return [uploaded_doc["chunks"][idx] for idx in ranked_results[:3]], "Retrieval completed."
-    except Exception as e:
-        logger.error(f"RAG retrieval failed: {e}")
-        return [], "Retrieval failed."
-
-# Unified QA function (updated for messages format)
-def answer_question(mode, selected_index, question, history, uploaded_doc=None):
+# Gemini API QA function with full context
+def answer_question(selected_index, question, history):
+    if selected_index is None:
+        return [(question, "Please select a paper first!")], history
     if not question.strip():
-        return
+        return [(question, "Please ask a question!")], history
     if question.lower() in ["exit", "done"]:
-        return
+        return [("Conversation ended.", "Select a new paper or search again!")], []
 
     try:
-            "
-            "Provide concise, accurate answers based on the following document content:\n"
-            f"Content: {context}\n\n"
+        # Get full paper data from DataFrame using index
+        paper_data = df.iloc[selected_index]
+        title = paper_data["title"]
+        abstract = paper_data["abstract"] # Full abstract, not truncated
+        authors = ", ".join(paper_data["authors"])
+        doi = paper_data["doi"]
+
+        # Build prompt with all fields
+        prompt = (
+            "You are Dr. Sage, the world's most brilliant and reliable research assistant, specializing in machine learning, deep learning, and agriculture. "
+            "Your goal is to provide concise, accurate, and well-structured answers based on the given paper's details. "
+            "When asked about tech stacks or methods, follow these guidelines:\n"
+            "1. If the abstract explicitly mentions technologies (e.g., Python, TensorFlow), list them precisely with brief explanations.\n"
+            "2. If the abstract is vague (e.g., 'machine learning techniques'), infer the most likely tech stacks based on the context of crop prediction and modern research practices, and explain your reasoning.\n"
+            "3. Always respond in a clear, concise format—use bullet points for lists (e.g., tech stacks) and short paragraphs for explanations.\n"
+            "4. If the question requires prior conversation context, refer to it naturally to maintain coherence.\n"
+            "5. If the abstract lacks enough detail, supplement with plausible, domain-specific suggestions and note they are inferred.\n"
+            "6. Avoid speculation or fluff—stick to facts or educated guesses grounded in the field.\n\n"
+            "Here’s the paper:\n"
+            f"Title: {title}\n"
+            f"Authors: {authors}\n"
+            f"Abstract: {abstract}\n"
+            f"DOI: {doi}\n\n"
         )
-
-
-            for msg in history[-2:]:
-                prompt += f"User: {msg['content']}\n" if msg["role"] == "user" else f"Assistant: {msg['content']}\n"
-            prompt += f"Now, answer this question: {question}"
-            model = genai.GenerativeModel("gemini-1.5-flash")
-            response = model.generate_content(prompt)
-            answer = response.text.strip()
-
-        else: # general mode
-            prompt = (
-                "You are a highly knowledgeable AI assistant. Answer the following question concisely and accurately:\n"
-            )
-            if history:
-                prompt += "Previous conversation (use for context):\n"
-                for msg in history[-2:]:
-                    prompt += f"User: {msg['content']}\n" if msg["role"] == "user" else f"Assistant: {msg['content']}\n"
-            prompt += f"Question: {question}"
-            model = genai.GenerativeModel("gemini-1.5-flash")
-            response = model.generate_content(prompt)
-            answer = response.text.strip()
-
-        history.append({"role": "user", "content": question})
-        history.append({"role": "assistant", "content": answer})
+
+        # Add history if present
+        if history:
+            prompt += "Previous conversation (use for context):\n"
+            for user_q, bot_a in history[-2:]:
+                prompt += f"User: {user_q}\nAssistant: {bot_a}\n"
+
+        prompt += f"Now, answer this question: {question}"
+
+        logger.info(f"Prompt sent to Gemini API: {prompt[:200]}...")
+
+        # Call Gemini API (Gemini 1.5 Flash)
+        model = genai.GenerativeModel("gemini-1.5-flash")
+        response = model.generate_content(prompt)
+        answer = response.text.strip()
+
+        # Fallback for poor responses
+        if not answer or len(answer) < 15:
+            answer = (
+                "The abstract doesn’t provide specific technologies, but based on crop prediction with machine learning and deep learning, likely tech stacks include:\n"
+                "- Python: Core language for ML/DL.\n"
+                "- TensorFlow or PyTorch: Frameworks for deep learning models.\n"
+                "- Scikit-learn: For traditional ML algorithms.\n"
+                "- Pandas/NumPy: For data handling and preprocessing."
+            )
+
+        history.append((question, answer))
         return history, history
     except Exception as e:
         logger.error(f"QA failed: {e}")
-        history.append(
-        history.append({"role": "assistant", "content": "Sorry, I couldn’t process that. Try again!"})
+        history.append((question, "Sorry, I couldn’t process that. Try again!"))
         return history, history
 
 # Gradio UI
 with gr.Blocks(
     css="""
-    .chatbot {height:
-    .sidebar {width:
-    #main {display: flex; flex-direction: row;
-    .tab-content {padding: 20px; background: #ffffff; border-radius: 10px; box-shadow: 0 2px 5px rgba(0,0,0,0.1);}
-    .gr-button {background: #007bff; color: white; border-radius: 5px; transition: background 0.3s;}
-    .gr-button:hover {background: #0056b3;}
-    h1 {color: #007bff; text-align: center; margin-bottom: 20px;}
+    .chatbot {height: 600px; overflow-y: auto;}
+    .sidebar {width: 300px;}
+    #main {display: flex; flex-direction: row;}
     """,
-    theme=gr.themes.
+    theme=gr.themes.Default(primary_hue="blue")
 ) as demo:
-    gr.Markdown("#
-
+    gr.Markdown("# ResearchGPT - Paper Search & Chat")
     with gr.Row(elem_id="main"):
-        # Sidebar
-        with gr.Column(scale=1, min_width=
-            paper_choices_state = gr.State([])
-            paper_indices_state = gr.State([])
-
-            search_btn.click(
-                fn=get_relevant_papers,
-                inputs=query_input,
-                outputs=[paper_choices_state, paper_indices_state, search_status]
-            ).then(
-                fn=lambda choices: gr.update(choices=choices, value=None),
-                inputs=paper_choices_state,
-                outputs=paper_dropdown
-            )
-
-            # RAG Mode
-            with gr.TabItem("RAG Mode"):
-                gr.Markdown("### Upload Document")
-                file_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
-                upload_status = gr.Textbox(label="Upload Status", interactive=False)
-                uploaded_doc_state = gr.State(None)
-                file_upload.change(
-                    fn=process_uploaded_pdf,
-                    inputs=file_upload,
-                    outputs=[uploaded_doc_state, upload_status]
-                )
-
-            # General Mode
-            with gr.TabItem("General Chat"):
-                gr.Markdown("Ask anything, powered by Gemini!")
+        # Sidebar for search
+        with gr.Column(scale=1, min_width=300, elem_classes="sidebar"):
+            gr.Markdown("### Search Papers")
+            query_input = gr.Textbox(label="Enter your search query", placeholder="e.g., machine learning in healthcare")
+            search_btn = gr.Button("Search")
+            paper_dropdown = gr.Dropdown(label="Select a Paper", choices=[], interactive=True)
+            search_status = gr.Textbox(label="Search Status", interactive=False)
+
+            # States to store paper choices and indices
+            paper_choices_state = gr.State([])
+            paper_indices_state = gr.State([])
 
+            search_btn.click(
+                fn=get_relevant_papers,
+                inputs=query_input,
+                outputs=[paper_choices_state, paper_indices_state, search_status]
+            ).then(
+                fn=lambda choices: gr.update(choices=choices, value=None),
+                inputs=paper_choices_state,
+                outputs=paper_dropdown
+            )
+
         # Main chat area
-        with gr.Column(scale=3
-            gr.Markdown("### Chat
-
-            chatbot = gr.Chatbot(label="Conversation", elem_classes="chatbot"
+        with gr.Column(scale=3):
+            gr.Markdown("### Chat with Selected Paper")
+            selected_paper = gr.Textbox(label="Selected Paper", interactive=False)
+            chatbot = gr.Chatbot(label="Conversation", elem_classes="chatbot")
             question_input = gr.Textbox(label="Ask a question", placeholder="e.g., What methods are used?")
             chat_btn = gr.Button("Send")
 
+            # State to store conversation history and selected index
             history_state = gr.State([])
             selected_index_state = gr.State(None)
 
-                    return "Uploaded Document Ready", None
-                elif selected_tab == "General Chat":
-                    return "General Chat Mode", None
-                return "Select a mode to begin!", None
+            # Update selected paper and index
+            def update_selected_paper(choice, indices):
+                if choice is None:
+                    return "", None
+                index = int(choice.split(".")[0]) - 1 # Extract rank (e.g., "1." -> 0)
+                selected_idx = indices[index]
+                return choice, selected_idx
 
-
-                fn=
-                inputs=[
-                outputs=[
+            paper_dropdown.change(
+                fn=update_selected_paper,
+                inputs=[paper_dropdown, paper_indices_state],
+                outputs=[selected_paper, selected_index_state]
             ).then(
                 fn=lambda: [],
                 inputs=None,
-                outputs=
-            )
-
-            paper_dropdown.change(
-                fn=update_display,
-                inputs=[mode_tabs, paper_dropdown, paper_indices_state, uploaded_doc_state],
-                outputs=[selected_display, selected_index_state]
+                outputs=chatbot
             )
-
+
+            # Handle chat
             chat_btn.click(
-                fn=
-                    idx, q, hist, doc
-                ),
-                inputs=[mode_tabs, selected_index_state, question_input, history_state, uploaded_doc_state],
+                fn=answer_question,
+                inputs=[selected_index_state, question_input, history_state],
                 outputs=[chatbot, history_state]
             ).then(
                 fn=lambda: "",