Spaces:

ADKU
/

ResearchGPT_space

Running

App Files Files Community

ADKU committed on Feb 27

Commit

9699ac9

verified ·

1 Parent(s): a91f0db

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -27

app.py CHANGED Viewed

@@ -8,7 +8,9 @@ from fastapi import FastAPI
 from pydantic import BaseModel
 from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, AutoTokenizer, AutoModel
-# Set Hugging Face cache directory
 os.environ["HUGGINGFACE_HUB_CACHE"] = "/tmp/huggingface"
 app = FastAPI()
@@ -22,63 +24,77 @@ if not os.path.exists(DATASET_PATH):
 # Load dataset
 df = pd.read_json(DATASET_PATH)
# Clean text function
def clean_text(text):
    """Normalize a piece of text by trimming whitespace and lowercasing it.

    Args:
        text: The raw text. Non-string values (for example ``None`` or the
            float ``NaN`` that pandas uses for missing JSON fields) are
            accepted and treated as empty.

    Returns:
        The stripped, lowercased string, or ``""`` when *text* is not a string.
    """
    # A missing abstract must not crash the whole-column ``apply``.
    if not isinstance(text, str):
        return ""
    return text.strip().lower()
-df['cleaned_abstract'] = df['abstract'].apply(clean_text)
-# Precompute BM25 Index
 tokenized_corpus = [paper.split() for paper in df["cleaned_abstract"]]
 bm25 = BM25Okapi(tokenized_corpus)
-# Load FAISS model
-embedding_model = "allenai/scibert_scivocab_uncased"
-tokenizer = AutoTokenizer.from_pretrained(embedding_model)
-model = AutoModel.from_pretrained(embedding_model)
# Generate embeddings using SciBERT
def generate_embeddings_sci_bert(texts, batch_size=32):
    """Embed *texts* with the module-level SciBERT ``model`` and ``tokenizer``.

    Each text is represented by the mean of its token vectors from the last
    hidden layer. Padding positions are excluded from the mean, so a text gets
    the same vector regardless of which other texts share its batch.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts tokenized and encoded per forward pass.

    Returns:
        A 2-D NumPy array with one row per input text. For an empty *texts*
        the result has zero rows and ``model.config.hidden_size`` columns.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # np.concatenate raises on an empty list, so handle "nothing to embed" up front.
    if len(texts) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Weight each token by its attention mask so padding does not dilute the mean.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        all_embeddings.append(embeddings.cpu().numpy())
    return np.concatenate(all_embeddings, axis=0)
-# Compute document embeddings
 abstracts = df["cleaned_abstract"].tolist()
-embeddings = generate_embeddings_sci_bert(abstracts, batch_size=32)
-# Initialize FAISS index
 dimension = embeddings.shape[1]
 faiss_index = faiss.IndexFlatL2(dimension)
 faiss_index.add(embeddings.astype(np.float32))
# API Request Model
class InputText(BaseModel):
    """Request payload accepted by the search endpoint."""

    # Free-text search query supplied by the client.
    query: str
    # How many ranked papers to return; 5 when the client omits it.
    top_k: int = 5
-@app.post("/predict/")
-async def predict(data: InputText):
-    query = data.query
-    top_k = data.top_k
     if not query.strip():
         return {"error": "Query is empty. Please enter a valid search query."}
-    # 1️⃣ Generate embedding for query
     query_embedding = generate_embeddings_sci_bert([query], batch_size=1)
     # 2️⃣ Perform FAISS similarity search
@@ -93,7 +109,7 @@ async def predict(data: InputText):
     combined_indices = list(set(indices[0]) | set(bm25_top_indices))
     ranked_results = sorted(combined_indices, key=lambda idx: -bm25_scores[idx])
-    # 5️⃣ Retrieve research papers
     relevant_papers = []
     for i, index in enumerate(ranked_results[:top_k]):
         paper = df.iloc[index]
@@ -101,11 +117,16 @@ async def predict(data: InputText):
             "rank": i + 1,
             "title": paper["title"],
             "authors": paper["authors"],
-            "abstract": paper["cleaned_abstract"]
         })
     return {"results": relevant_papers}
 # Run FastAPI
 if __name__ == "__main__":
     import uvicorn

 from pydantic import BaseModel

 # ✅ Set cache directory to /tmp/huggingface (fixes permission error).
 # The variables are exported BEFORE transformers is imported: the Hugging Face
 # libraries resolve their cache locations at import time, so assigning them
 # after the import leaves the default cache path in effect.
 # NOTE(review): any earlier import in this file that pulls in huggingface_hub
 # would also have to come after these assignments; confirm against the imports above.
 os.environ["HF_HOME"] = "/tmp/huggingface"
 os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
 os.environ["HUGGINGFACE_HUB_CACHE"] = "/tmp/huggingface"

 from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, AutoTokenizer, AutoModel  # noqa: E402

 app = FastAPI()
 # Load dataset
 df = pd.read_json(DATASET_PATH)
# ✅ Clean text function
def clean_text(text):
    """Normalize a piece of text by trimming whitespace and lowercasing it.

    Args:
        text: The raw text. Non-string values (for example ``None`` or the
            float ``NaN`` that pandas uses for missing JSON fields) are
            accepted and treated as empty.

    Returns:
        The stripped, lowercased string, or ``""`` when *text* is not a string.
    """
    # A missing abstract must not crash the whole-column ``apply``.
    if not isinstance(text, str):
        return ""
    return text.strip().lower()
+df["cleaned_abstract"] = df["abstract"].apply(clean_text)
+# ✅ Precompute BM25 Index
 tokenized_corpus = [paper.split() for paper in df["cleaned_abstract"]]
 bm25 = BM25Okapi(tokenized_corpus)
# ✅ Embedding model registry
# Maps a display name to the Hugging Face Hub repository ID of the checkpoint.
embedding_models = {
    "BERT": "bert-base-uncased",
    "DistilBERT": "distilbert-base-uncased",
    # Fully-qualified ID, matching the MiniLM entry below: the bare
    # "all-MiniLM-L6-v2" short name is only expanded by the sentence-transformers
    # library and does not resolve through transformers' from_pretrained.
    "Sentence-BERT": "sentence-transformers/all-MiniLM-L6-v2",
    "MiniLM": "sentence-transformers/all-MiniLM-L12-v2",
    "SciBERT": "allenai/scibert_scivocab_uncased",
}

BATCH_SIZE = 32  # Batch size for processing
# ✅ Function to clear GPU memory
def clear_gpu_memory():
    """Ask PyTorch to empty its CUDA cache.

    Delegates to ``torch.cuda.empty_cache`` and returns nothing.
    """
    release_cache = torch.cuda.empty_cache
    release_cache()
    return None
# ✅ Generate embeddings using SciBERT
# (tokenizer, model) pairs keyed by (model name, device), so the checkpoint is
# loaded once per process instead of on every call (including every search request).
_SCIBERT_CACHE = {}


def _load_scibert(model_name, cache_dir, device):
    """Return the cached ``(tokenizer, model)`` for *model_name*, loading it on first use."""
    cache_key = (model_name, str(device))
    if cache_key not in _SCIBERT_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
        model = AutoModel.from_pretrained(model_name, cache_dir=cache_dir)
        model.to(device)
        _SCIBERT_CACHE[cache_key] = (tokenizer, model)
    return _SCIBERT_CACHE[cache_key]


def generate_embeddings_sci_bert(texts, batch_size=BATCH_SIZE):
    """Embed *texts* with SciBERT (``allenai/scibert_scivocab_uncased``).

    Each text is represented by the mean of its token vectors from the last
    hidden layer. Padding positions are excluded from the mean, so a text gets
    the same vector whether it is encoded alone (a query) or inside a padded
    batch (the corpus).

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts tokenized and encoded per forward pass.

    Returns:
        A 2-D NumPy array with one row per input text. For an empty *texts*
        the result has zero rows and ``model.config.hidden_size`` columns.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer, model = _load_scibert(
        "allenai/scibert_scivocab_uncased", "/tmp/huggingface", device
    )

    # np.concatenate raises on an empty list, so handle "nothing to embed" up front.
    if len(texts) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Weight each token by its attention mask so padding does not dilute the mean.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        all_embeddings.append(embeddings.cpu().numpy())
        clear_gpu_memory()
    return np.concatenate(all_embeddings, axis=0)
+# ✅ Compute embeddings
 abstracts = df["cleaned_abstract"].tolist()
+embeddings = generate_embeddings_sci_bert(abstracts, batch_size=BATCH_SIZE)
+# ✅ Initialize FAISS index
 dimension = embeddings.shape[1]
 faiss_index = faiss.IndexFlatL2(dimension)
 faiss_index.add(embeddings.astype(np.float32))
# ✅ API Request Model
class InputText(BaseModel):
    """Request payload accepted by the ``/predict/`` endpoint."""

    # Free-text search query supplied by the client.
    query: str
    # How many ranked papers to return; 5 when the client omits it.
    top_k: int = 5
+# ✅ Hybrid Search Function
+def get_relevant_papers(query, top_k=5):
     if not query.strip():
         return {"error": "Query is empty. Please enter a valid search query."}
+    # 1️⃣ Generate query embedding
     query_embedding = generate_embeddings_sci_bert([query], batch_size=1)
     # 2️⃣ Perform FAISS similarity search
     combined_indices = list(set(indices[0]) | set(bm25_top_indices))
     ranked_results = sorted(combined_indices, key=lambda idx: -bm25_scores[idx])
+    # 5️⃣ Retrieve relevant papers
     relevant_papers = []
     for i, index in enumerate(ranked_results[:top_k]):
         paper = df.iloc[index]
             "rank": i + 1,
             "title": paper["title"],
             "authors": paper["authors"],
+            "abstract": paper["cleaned_abstract"],
         })
     return {"results": relevant_papers}
# ✅ FastAPI Endpoint
@app.post("/predict/")
async def predict(data: InputText):
    """Serve POST /predict/ by passing the request's query and top_k to the hybrid search."""
    search_query, result_limit = data.query, data.top_k
    return get_relevant_papers(search_query, result_limit)
 # Run FastAPI
 if __name__ == "__main__":
     import uvicorn