Update app.py
Made a Gradio app and extended the project with a paper question-answering feature using a GPT-2 model, along with a few enhancements to improve the model's performance.
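Judging from the imports in the updated app.py, the Space needs roughly the following dependencies. This is a minimal sketch of a requirements.txt: the package set is read off the imports, but the version pin is an assumption, not taken from the repo:

# requirements.txt (hypothetical; the pin is an assumption)
gradio<4          # the code uses the Gradio 3.x-style `_js` click parameter
transformers
torch
pandas
numpy
faiss-cpu         # provides the `faiss` module
rank-bm25         # provides `rank_bm25`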
app.py
CHANGED
@@ -4,130 +4,162 @@ import numpy as np

Removed (old version; lines truncated or lost in the page capture are marked with …):

from rank_bm25 import BM25Okapi
import torch
import pandas as pd
-from …
-from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, AutoTokenizer, AutoModel

-#
os.environ["HF_HOME"] = "/tmp/huggingface"
-os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
-os.environ["HUGGINGFACE_HUB_CACHE"] = "/tmp/huggingface"

-# Ensure the correct file path
DATASET_PATH = os.path.join(os.getcwd(), "springer_papers_DL.json")
if not os.path.exists(DATASET_PATH):
    raise FileNotFoundError(f"Dataset file not found at {DATASET_PATH}")

-# Load dataset
df = pd.read_json(DATASET_PATH)

-#
def clean_text(text):
    return text.strip().lower()

df["cleaned_abstract"] = df["abstract"].apply(clean_text)

-#
tokenized_corpus = [paper.split() for paper in df["cleaned_abstract"]]
bm25 = BM25Okapi(tokenized_corpus)

-#
-…
-# ✅ Generate embeddings using SciBERT
-def generate_embeddings_sci_bert(texts, batch_size=BATCH_SIZE):
-    model_name = "allenai/scibert_scivocab_uncased"
-    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="/tmp/huggingface")
-    model = AutoModel.from_pretrained(model_name, cache_dir="/tmp/huggingface")
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model.to(device)
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
-        batch = texts[i…
-        inputs = …
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
-            outputs = …
        embeddings = outputs.last_hidden_state.mean(dim=1)
        all_embeddings.append(embeddings.cpu().numpy())
-        clear_gpu_memory()
    return np.concatenate(all_embeddings, axis=0)

-#
abstracts = df["cleaned_abstract"].tolist()
-embeddings = generate_embeddings_sci_bert(abstracts…

-# ✅ Initialize FAISS index
dimension = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embeddings.astype(np.float32))

-#
-class InputText(BaseModel):
-    query: str
-    top_k: int = 5

-# ✅ Hybrid Search Function
def get_relevant_papers(query, top_k=5):
    if not query.strip():
-        return …

-    # 1️⃣ Generate query embedding
-    query_embedding = generate_embeddings_sci_bert([query], batch_size=1)

-    # 2️⃣ Perform FAISS similarity search
    distances, indices = faiss_index.search(query_embedding.astype(np.float32), top_k)

-    # 3️⃣ Perform BM25 keyword search
    tokenized_query = query.split()
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]

-    # 4️⃣ Combine FAISS and BM25 results
    combined_indices = list(set(indices[0]) | set(bm25_top_indices))
    ranked_results = sorted(combined_indices, key=lambda idx: -bm25_scores[idx])

-    # 5️⃣ Retrieve relevant papers
-    relevant_papers = []
    for i, index in enumerate(ranked_results[:top_k]):
        paper = df.iloc[index]
-        …

-#
-…
Added (new version):

from rank_bm25 import BM25Okapi
import torch
import pandas as pd
+import gradio as gr
+from transformers import AutoTokenizer, AutoModel, GPT2LMHeadModel, GPT2Tokenizer

+# Set cache directory for Hugging Face models
os.environ["HF_HOME"] = "/tmp/huggingface"

+# Load dataset
DATASET_PATH = os.path.join(os.getcwd(), "springer_papers_DL.json")
if not os.path.exists(DATASET_PATH):
    raise FileNotFoundError(f"Dataset file not found at {DATASET_PATH}")
df = pd.read_json(DATASET_PATH)

+# Clean text
def clean_text(text):
    return text.strip().lower()

df["cleaned_abstract"] = df["abstract"].apply(clean_text)

+# Precompute the BM25 index over tokenized abstracts
tokenized_corpus = [paper.split() for paper in df["cleaned_abstract"]]
bm25 = BM25Okapi(tokenized_corpus)

+# Load SciBERT for embeddings (preloaded globally)
+sci_bert_tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", cache_dir="/tmp/huggingface")
+sci_bert_model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased", cache_dir="/tmp/huggingface")
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+sci_bert_model.to(device)
+sci_bert_model.eval()
+
+# Load GPT-2 for QA (using distilgpt2 for efficiency)
+gpt2_tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2", cache_dir="/tmp/huggingface")
+gpt2_model = GPT2LMHeadModel.from_pretrained("distilgpt2", cache_dir="/tmp/huggingface")
+gpt2_model.to(device)
+gpt2_model.eval()
+
+# Generate SciBERT embeddings in batches, mean-pooling token states per text
+def generate_embeddings_sci_bert(texts, batch_size=32):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
+        batch = texts[i:i + batch_size]
+        inputs = sci_bert_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
+            outputs = sci_bert_model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        all_embeddings.append(embeddings.cpu().numpy())
+        torch.cuda.empty_cache()  # no-op on CPU; frees cached GPU memory between batches
    return np.concatenate(all_embeddings, axis=0)

+# Precompute embeddings and the FAISS index
abstracts = df["cleaned_abstract"].tolist()
+embeddings = generate_embeddings_sci_bert(abstracts)
dimension = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embeddings.astype(np.float32))

+# Hybrid search: union of FAISS semantic neighbors and BM25 keyword hits,
+# ranked by BM25 score
def get_relevant_papers(query, top_k=5):
    if not query.strip():
+        return []
+    query_embedding = generate_embeddings_sci_bert([query])
    distances, indices = faiss_index.search(query_embedding.astype(np.float32), top_k)
    tokenized_query = query.split()
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]
    combined_indices = list(set(indices[0]) | set(bm25_top_indices))
    ranked_results = sorted(combined_indices, key=lambda idx: -bm25_scores[idx])
+    papers = []
    for i, index in enumerate(ranked_results[:top_k]):
        paper = df.iloc[index]
+        papers.append(f"{i+1}. {paper['title']} - Abstract: {paper['cleaned_abstract'][:200]}...")
+    return papers
+
+# GPT-2 QA function; returns (chat history, state) so the Chatbot component
+# can render the full conversation as (user, assistant) pairs
+def answer_question(paper, question, history):
+    if not question.strip():
+        history = history + [(question, "Please ask a question!")]
+        return history, history
+    if question.lower() in ["exit", "done"]:
+        return [(question, "Conversation ended. Select a new paper or search again!")], []
+
+    # Extract title and abstract from the formatted paper string
+    title = paper.split(" - Abstract: ")[0].split(". ", 1)[1]
+    abstract = paper.split(" - Abstract: ")[1].rstrip("...")
+
+    # Build context with history
+    context = f"Title: {title}\nAbstract: {abstract}\n\nPrevious conversation:\n"
+    for user_q, bot_a in history:
+        context += f"User: {user_q}\nAssistant: {bot_a}\n"
+    context += f"User: {question}\nAssistant: "
+
+    # Generate response (GPT-2 has no pad token, so EOS is reused for padding)
+    inputs = gpt2_tokenizer(context, return_tensors="pt", truncation=True, max_length=512)
+    inputs = {key: val.to(device) for key, val in inputs.items()}
+    with torch.no_grad():
+        outputs = gpt2_model.generate(
+            inputs["input_ids"],
+            max_new_tokens=100,
+            do_sample=True,
+            temperature=0.7,
+            top_k=50,
+            pad_token_id=gpt2_tokenizer.eos_token_id
+        )
+    response = gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # Strip the prompt so only the newly generated answer remains
+    response = response[len(context):].strip()
+
+    history.append((question, response))
+    return history, history
+
+# Gradio UI
+with gr.Blocks(
+    css="""
+    .chatbot {height: 600px; overflow-y: auto;}
+    .sidebar {width: 300px;}
+    #main {display: flex; flex-direction: row;}
+    """,
+    theme=gr.themes.Default(primary_hue="blue")
+) as demo:
+    gr.Markdown("# ResearchGPT - Paper Search & Chat")
+    with gr.Row(elem_id="main"):
+        # Sidebar for search
+        with gr.Column(scale=1, min_width=300, elem_classes="sidebar"):
+            gr.Markdown("### Search Papers")
+            query_input = gr.Textbox(label="Enter your search query", placeholder="e.g., machine learning in healthcare")
+            search_btn = gr.Button("Search")
+            paper_dropdown = gr.Dropdown(label="Select a Paper", choices=[], interactive=True)
+            # Return a gr.update so the results repopulate the dropdown's
+            # choices (returning a bare list would only set its value)
+            search_btn.click(
+                fn=lambda q: gr.update(choices=get_relevant_papers(q), value=None),
+                inputs=query_input,
+                outputs=paper_dropdown
+            )
+
+        # Main chat area
+        with gr.Column(scale=3):
+            gr.Markdown("### Chat with Selected Paper")
+            selected_paper = gr.Textbox(label="Selected Paper", interactive=False)
+            chatbot = gr.Chatbot(label="Conversation", elem_classes="chatbot")
+            question_input = gr.Textbox(label="Ask a question", placeholder="e.g., What methods are used?")
+            chat_btn = gr.Button("Send")
+
+            # State to store conversation history
+            history_state = gr.State([])
+
+            # Update selected paper
+            paper_dropdown.change(
+                fn=lambda x: x,
+                inputs=paper_dropdown,
+                outputs=selected_paper
+            )
+
+            # Handle chat; `_js` runs a client-side scroll (Gradio 3.x parameter name)
+            chat_btn.click(
+                fn=answer_question,
+                inputs=[selected_paper, question_input, history_state],
+                outputs=[chatbot, history_state],
+                _js="() => {document.querySelector('.chatbot').scrollTop = document.querySelector('.chatbot').scrollHeight;}"
+            )
+
+# Launch the app
+demo.launch()
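For a quick sanity check of the retrieval and QA functions outside the UI, something like the following works. A minimal sketch, assuming springer_papers_DL.json is in the working directory and that demo.launch() is moved under an if __name__ == "__main__": guard so importing app.py doesn't start the server; the module name `app` is otherwise an assumption:

# Hypothetical smoke test for the hybrid search + GPT-2 QA pipeline.
import app  # assumes demo.launch() is guarded so import doesn't block

# Hybrid BM25 + FAISS retrieval over the paper abstracts
papers = app.get_relevant_papers("deep learning for medical imaging", top_k=3)
for p in papers:
    print(p)

# Ask a question about the top-ranked paper; history carries prior turns
history = []
history, _ = app.answer_question(papers[0], "What methods are used?", history)
print(history[-1][1])  # the generated answer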