ADKU committed
Commit 12e1b40 · verified · 1 Parent(s): 3bee96e

Update app.py

Files changed (1)
  1. app.py +127 -80
app.py CHANGED
@@ -6,111 +6,155 @@ import torch
  import pandas as pd
  import gradio as gr
  from transformers import AutoTokenizer, AutoModel, GPT2LMHeadModel, GPT2Tokenizer
+ import logging
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)

  # Set cache directory for Hugging Face models
  os.environ["HF_HOME"] = "/tmp/huggingface"

- # Load dataset
+ # Load dataset with error handling
  DATASET_PATH = os.path.join(os.getcwd(), "springer_papers_DL.json")
- if not os.path.exists(DATASET_PATH):
-     raise FileNotFoundError(f"Dataset file not found at {DATASET_PATH}")
- df = pd.read_json(DATASET_PATH)
+ try:
+     if not os.path.exists(DATASET_PATH):
+         raise FileNotFoundError(f"Dataset file not found at {DATASET_PATH}")
+     df = pd.read_json(DATASET_PATH)
+     logger.info("Dataset loaded successfully")
+ except Exception as e:
+     logger.error(f"Failed to load dataset: {e}")
+     raise

  # Clean text
  def clean_text(text):
-     return text.strip().lower()
+     return text.strip().lower() if isinstance(text, str) else ""

  df["cleaned_abstract"] = df["abstract"].apply(clean_text)

  # Precompute BM25 Index
- tokenized_corpus = [paper.split() for paper in df["cleaned_abstract"]]
- bm25 = BM25Okapi(tokenized_corpus)
+ try:
+     tokenized_corpus = [paper.split() for paper in df["cleaned_abstract"]]
+     bm25 = BM25Okapi(tokenized_corpus)
+     logger.info("BM25 index created")
+ except Exception as e:
+     logger.error(f"BM25 index creation failed: {e}")
+     raise

- # Load SciBERT for embeddings (preloaded globally)
- sci_bert_tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", cache_dir="/tmp/huggingface")
- sci_bert_model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased", cache_dir="/tmp/huggingface")
+ # Load models with error handling
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- sci_bert_model.to(device)
- sci_bert_model.eval()
+ logger.info(f"Using device: {device}")

- # Load GPT-2 for QA (using distilgpt2 for efficiency)
- gpt2_tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2", cache_dir="/tmp/huggingface")
- gpt2_model = GPT2LMHeadModel.from_pretrained("distilgpt2", cache_dir="/tmp/huggingface")
- gpt2_model.to(device)
- gpt2_model.eval()
+ try:
+     # SciBERT for embeddings
+     sci_bert_tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", cache_dir="/tmp/huggingface")
+     sci_bert_model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased", cache_dir="/tmp/huggingface")
+     sci_bert_model.to(device)
+     sci_bert_model.eval()
+     logger.info("SciBERT loaded")
+
+     # DistilGPT-2 for QA
+     gpt2_tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2", cache_dir="/tmp/huggingface")
+     gpt2_model = GPT2LMHeadModel.from_pretrained("distilgpt2", cache_dir="/tmp/huggingface")
+     gpt2_model.to(device)
+     gpt2_model.eval()
+     logger.info("DistilGPT-2 loaded")
+ except Exception as e:
+     logger.error(f"Model loading failed: {e}")
+     raise

  # Generate SciBERT embeddings
  def generate_embeddings_sci_bert(texts, batch_size=32):
-     all_embeddings = []
-     for i in range(0, len(texts), batch_size):
-         batch = texts[i:i + batch_size]
-         inputs = sci_bert_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
-         inputs = {key: val.to(device) for key, val in inputs.items()}
-         with torch.no_grad():
-             outputs = sci_bert_model(**inputs)
-         embeddings = outputs.last_hidden_state.mean(dim=1)
-         all_embeddings.append(embeddings.cpu().numpy())
-         torch.cuda.empty_cache()
-     return np.concatenate(all_embeddings, axis=0)
+     try:
+         all_embeddings = []
+         for i in range(0, len(texts), batch_size):
+             batch = texts[i:i + batch_size]
+             inputs = sci_bert_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
+             inputs = {key: val.to(device) for key, val in inputs.items()}
+             with torch.no_grad():
+                 outputs = sci_bert_model(**inputs)
+             embeddings = outputs.last_hidden_state.mean(dim=1)
+             all_embeddings.append(embeddings.cpu().numpy())
+             torch.cuda.empty_cache()
+         return np.concatenate(all_embeddings, axis=0)
+     except Exception as e:
+         logger.error(f"Embedding generation failed: {e}")
+         return np.zeros((len(texts), 768))  # Fallback to zero embeddings

  # Precompute embeddings and FAISS index
- abstracts = df["cleaned_abstract"].tolist()
- embeddings = generate_embeddings_sci_bert(abstracts)
- dimension = embeddings.shape[1]
- faiss_index = faiss.IndexFlatL2(dimension)
- faiss_index.add(embeddings.astype(np.float32))
+ try:
+     abstracts = df["cleaned_abstract"].tolist()
+     embeddings = generate_embeddings_sci_bert(abstracts)
+     dimension = embeddings.shape[1]
+     faiss_index = faiss.IndexFlatL2(dimension)
+     faiss_index.add(embeddings.astype(np.float32))
+     logger.info("FAISS index created")
+ except Exception as e:
+     logger.error(f"FAISS index creation failed: {e}")
+     raise

  # Hybrid search function
  def get_relevant_papers(query, top_k=5):
      if not query.strip():
          return []
-     query_embedding = generate_embeddings_sci_bert([query])
-     distances, indices = faiss_index.search(query_embedding.astype(np.float32), top_k)
-     tokenized_query = query.split()
-     bm25_scores = bm25.get_scores(tokenized_query)
-     bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]
-     combined_indices = list(set(indices[0]) | set(bm25_top_indices))
-     ranked_results = sorted(combined_indices, key=lambda idx: -bm25_scores[idx])
-     papers = []
-     for i, index in enumerate(ranked_results[:top_k]):
-         paper = df.iloc[index]
-         papers.append(f"{i+1}. {paper['title']} - Abstract: {paper['cleaned_abstract'][:200]}...")
-     return papers
+     try:
+         query_embedding = generate_embeddings_sci_bert([query])
+         distances, indices = faiss_index.search(query_embedding.astype(np.float32), top_k)
+         tokenized_query = query.split()
+         bm25_scores = bm25.get_scores(tokenized_query)
+         bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]
+         combined_indices = list(set(indices[0]) | set(bm25_top_indices))
+         ranked_results = sorted(combined_indices, key=lambda idx: -bm25_scores[idx])
+         papers = []
+         for i, index in enumerate(ranked_results[:top_k]):
+             paper = df.iloc[index]
+             papers.append(f"{i+1}. {paper['title']} - Abstract: {paper['cleaned_abstract'][:200]}...")
+         return papers
+     except Exception as e:
+         logger.error(f"Search failed: {e}")
+         return ["Search failed. Please try again."]

  # GPT-2 QA function
  def answer_question(paper, question, history):
+     if not paper:
+         return [("Please select a paper first!", "")], history
      if not question.strip():
-         return "Please ask a question!", history
+         return [(question, "Please ask a question!")], history
      if question.lower() in ["exit", "done"]:
-         return "Conversation ended. Select a new paper or search again!", []
-
-     # Extract title and abstract from paper string
-     title = paper.split(" - Abstract: ")[0].split(". ", 1)[1]
-     abstract = paper.split(" - Abstract: ")[1].rstrip("...")
-
-     # Build context with history
-     context = f"Title: {title}\nAbstract: {abstract}\n\nPrevious conversation:\n"
-     for user_q, bot_a in history:
-         context += f"User: {user_q}\nAssistant: {bot_a}\n"
-     context += f"User: {question}\nAssistant: "
-
-     # Generate response
-     inputs = gpt2_tokenizer(context, return_tensors="pt", truncation=True, max_length=512)
-     inputs = {key: val.to(device) for key, val in inputs.items()}
-     with torch.no_grad():
-         outputs = gpt2_model.generate(
-             inputs["input_ids"],
-             max_new_tokens=100,
-             do_sample=True,
-             temperature=0.7,
-             top_k=50,
-             pad_token_id=gpt2_tokenizer.eos_token_id
-         )
-     response = gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True)
-     response = response[len(context):].strip()
-
-     history.append((question, response))
-     return response, history
+         return [("Conversation ended. Select a new paper or search again!", "")], []
+
+     try:
+         # Extract title and abstract
+         title = paper.split(" - Abstract: ")[0].split(". ", 1)[1]
+         abstract = paper.split(" - Abstract: ")[1].rstrip("...")
+
+         # Build context with history
+         context = f"Title: {title}\nAbstract: {abstract}\n\nPrevious conversation:\n"
+         for user_q, bot_a in history:
+             context += f"User: {user_q}\nAssistant: {bot_a}\n"
+         context += f"User: {question}\nAssistant: "
+
+         # Generate response
+         inputs = gpt2_tokenizer(context, return_tensors="pt", truncation=True, max_length=512)
+         inputs = {key: val.to(device) for key, val in inputs.items()}
+         with torch.no_grad():
+             outputs = gpt2_model.generate(
+                 inputs["input_ids"],
+                 max_new_tokens=100,
+                 do_sample=True,
+                 temperature=0.7,
+                 top_k=50,
+                 pad_token_id=gpt2_tokenizer.eos_token_id
+             )
+         response = gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True)
+         response = response[len(context):].strip()
+
+         history.append((question, response))
+         return history, history  # Return updated history for Chatbot
+     except Exception as e:
+         logger.error(f"QA failed: {e}")
+         history.append((question, "Sorry, I couldn’t process that. Try again!"))
+         return history, history

  # Gradio UI
  with gr.Blocks(
@@ -148,18 +192,21 @@ with gr.Blocks(

      # Update selected paper
      paper_dropdown.change(
-         fn=lambda x: x,
+         fn=lambda x: (x, []),  # Reset history when new paper selected
          inputs=paper_dropdown,
-         outputs=selected_paper
+         outputs=[selected_paper, history_state]
      )

      # Handle chat
      chat_btn.click(
          fn=answer_question,
          inputs=[selected_paper, question_input, history_state],
-         outputs=[chatbot, history_state],
-         _js="() => {document.querySelector('.chatbot').scrollTop = document.querySelector('.chatbot').scrollHeight;}"
+         outputs=[chatbot, history_state]
+     ).then(
+         fn=lambda: "",
+         inputs=None,
+         outputs=question_input  # Clear question input after sending
      )

  # Launch the app
- demo.launch()
+ demo.launch(server_name="0.0.0.0", server_port=7860)