jiviteshjain committed
Commit 8042e59 · Parent: 7e24f8a

Track files with lfs.

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/** filter=lfs diff=lfs merge=lfs -text
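For reference, this is exactly the line that running git lfs track "data/**" appends: everything under data/ (the JSONL corpus and the FAISS index added below) is stored as a Git LFS pointer rather than committed directly.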
app.py ADDED
@@ -0,0 +1,80 @@
+import gc
+
+import streamlit as st
+import torch
+from rag import load_all, run_query
+
+
+@st.cache_resource(
+    show_spinner="Loading models and indices. This might take a while..."
+)
+def get_rag_qa() -> dict:
+    gc.collect()
+    torch.cuda.empty_cache()
+    return load_all(
+        embedder_path="Snowflake/snowflake-arctic-embed-l",
+        embedder_device="cpu",
+        context_file="data/bioasq_contexts.jsonl",
+        index_file="data/bioasq_contexts__snowflake-arctic-embed-l__float32_hnsw.index",
+        reader_path="meta-llama/Llama-3.2-1B-Instruct",
+        reader_device="mps",
+    )
+
+
+left_column, cent_column, last_column = st.columns(3)
+with cent_column:
+    st.image("cover.webp", width=400)
+st.title("Ask the BioASQ Database Anything!")
+
+# Initialize the RagQA model, which might already be cached.
+_ = get_rag_qa()
+
+# Run QA
+st.subheader("Ask away:")
+question = st.text_input("Ask away:", "", label_visibility="collapsed")
+submit = st.button("Submit")
+
+st.markdown(
+    """
+> **For example, ask things like:**
+>
+> What is the Bartter syndrome?
+> Which genes have been found to be associated with restless leg syndrome?
+> Which diseases can be treated with Afamelanotide?
+---
+""",
+    unsafe_allow_html=False,
+)
+
+if submit:
+    if not question.strip():
+        st.error("Machine Learning still can't read minds. Please enter a question.")
+    else:
+        try:
+            with st.spinner(
+                "Combing through 3000+ documents from the BioASQ database..."
+            ):
+                rag_qa = get_rag_qa()
+                retrieved_context_ids, sources, answer = run_query(question, **rag_qa)
+                print(answer)
+                print(retrieved_context_ids)
+                print(sources)
+
+            st.subheader("Answer:")
+            st.write(answer)
+
+            st.write("")
+
+            with st.expander("Show Sources"):
+                st.subheader("Sources:")
+                for i, (context_id, source) in enumerate(
+                    zip(retrieved_context_ids, sources)
+                ):
+                    st.markdown(f"**BioASQ Document ID:** {context_id}")
+                    st.markdown("**Text:**")
+                    st.write(source)
+                    if i < len(sources) - 1:
+                        st.markdown("---")
+
+        except Exception as e:
+            st.error(f"An error occurred: {e}")
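One portability note on app.py: reader_device is hardcoded to "mps", which assumes Apple Silicon. A minimal sketch of choosing the device at runtime instead; pick_device is a hypothetical helper, not part of this commit:

import torch

def pick_device() -> str:
    # Prefer CUDA, fall back to Apple's MPS backend, then CPU.
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"

# Usage (hypothetical): load_all(..., reader_device=pick_device())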
cover.webp ADDED
data/bioasq_contexts.jsonl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1bb0fb8e100386c48d37f3a489593c326a474ed8bde13b834c929637a0c0bbc7
+size 4753372
data/bioasq_contexts__snowflake-arctic-embed-l__float32_hnsw.index ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f4fe738c0ca9c5846dacb07d932360fa9d41d967f0028fcb329fc55958f0834
+size 15377790
rag.py ADDED
@@ -0,0 +1,160 @@
+# %%
+import os
+import json
+
+import torch
+import faiss
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from transformers import (
+    pipeline,
+    TextGenerationPipeline,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+)
+
+HF_TOKEN = os.environ["hf_token"]
+
+SYSTEM_PROMPT = """You are a helpful question answering assistant. You will be given a context and a question. You need to provide the answer to the question based on the context. Answer briefly, based on the context. Only output the answer, and nothing else. Here is an example:
+
+>> Context
+Fascin is an actin-bundling protein that induces membrane protrusions and cell motility after the formation of lamellipodia or filopodia. Fascin expression has been associated with progression or prognosis in various neoplasms; however, its role in intrahepatic cholangiocarcinoma is unknown.
+
+>> Question
+What type of protein is fascin?
+
+>> Answer
+Actin-bundling protein
+
+Now answer the user's question based on the user's given context.
+"""
+
+USER_PROMPT = """
+>> Context
+{context}
+
+>> Question
+{question}
+
+>> Answer
+"""
+
+
+def load_embedder(model_path: str, device: str) -> SentenceTransformer:
+    embedder = SentenceTransformer(model_path)
+    embedder.to(device)
+    return embedder
+
+
+def load_contexts(context_file: str) -> list[str]:
+    contexts = []
+    with open(context_file, "r") as f_in:
+        for line in f_in:
+            context = json.loads(line)
+            contexts.append(context["context"])
+
+    return contexts
+
+
+def load_index(index_file: str) -> faiss.Index:
+    return faiss.read_index(index_file)
+
+
+def load_reader(model_path: str, device: str) -> TextGenerationPipeline:
+    model = AutoModelForCausalLM.from_pretrained(model_path)
+
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    reader = pipeline(
+        "text-generation",
+        model=model,
+        tokenizer=tokenizer,
+        torch_dtype=torch.bfloat16,
+        token=HF_TOKEN,
+        device=device,
+    )
+
+    return reader
+
+
+def construct_prompt(contexts: list[str], question: str) -> list[dict]:
+    return [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {
+            "role": "user",
+            "content": USER_PROMPT.format(
+                context="\n".join(contexts), question=question
+            ),
+        },
+    ]
+
+
+def load_all(
+    embedder_path: str,
+    embedder_device: str,
+    context_file: str,
+    index_file: str,
+    reader_path: str,
+    reader_device: str,
+) -> dict:
+    embedder = load_embedder(embedder_path, embedder_device)
+    contexts = load_contexts(context_file)
+    index = load_index(index_file)
+    reader = load_reader(reader_path, reader_device)
+
+    return {
+        "embedder": embedder,
+        "contexts": contexts,
+        "index": index,
+        "reader": reader,
+    }
+
+
+def run_query(
+    question: str,
+    embedder: SentenceTransformer,
+    index: faiss.Index,
+    contexts: list[str],
+    reader: TextGenerationPipeline,
+    top_k: int = 3,
+) -> tuple[list[int], list[str], str]:
+    query_embedding = embedder.encode([question], normalize_embeddings=True)
+    _, retrieved_context_ids = index.search(query_embedding, top_k)
+    retrieved_context_ids = np.array(retrieved_context_ids)  # shape: (1, top_k)
+
+    retrieved_contexts = []
+    for row in retrieved_context_ids:
+        retrieved_contexts.append(
+            [contexts[i] if contexts[i] is not None else "" for i in row]
+        )
+
+    # The code below handles a single question at a time.
+    prompt = construct_prompt(retrieved_contexts[0], question)
+    answer = reader(prompt, max_new_tokens=128, return_full_text=False)
+    print(answer)
+    answer_text = answer[0]["generated_text"]
+    if ">> Answer" in answer_text:
+        answer_text = answer_text.split(">> Answer")[1].strip()
+
+    return retrieved_context_ids[0].tolist(), retrieved_contexts[0], answer_text
+
+
+# %%
+# embedder_path = "Snowflake/snowflake-arctic-embed-l"
+# reader_path = "meta-llama/Llama-3.2-1B-Instruct"
+# context_file = "../data/bioasq_contexts.jsonl"
+# index_file = "../data/bioasq_contexts__snowflake-arctic-embed-l__float32_hnsw.index"

+# rag_qa = load_all(
+#     embedder_path, "cpu", context_file, index_file, reader_path, "mps"
+# )
+
+# query = "What cellular structures does fascin induce?"
+
+# retrieved_context_ids, retrieved_contexts, answer_text = run_query(
+#     query, **rag_qa
+# )
+
+
+# %%
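The HNSW index itself ships as a prebuilt LFS blob, so the build step is not part of this commit. A sketch of how an equivalent index could be rebuilt from the contexts file, assuming the same embedder; the HNSW parameter M=32 is a guess, as is the script itself:

import json

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Read the same JSONL corpus that load_contexts() parses.
contexts = []
with open("data/bioasq_contexts.jsonl", "r") as f_in:
    for line in f_in:
        contexts.append(json.loads(line)["context"])

# Embed with the model the app queries with; unit-normalized vectors make
# L2 distance rank results the same way as cosine similarity.
embedder = SentenceTransformer("Snowflake/snowflake-arctic-embed-l")
embeddings = embedder.encode(
    contexts, normalize_embeddings=True, convert_to_numpy=True
).astype(np.float32)

# Build a float32 HNSW index (M=32 neighbors per node is an assumption).
index = faiss.IndexHNSWFlat(embeddings.shape[1], 32)
index.add(embeddings)
faiss.write_index(
    index, "data/bioasq_contexts__snowflake-arctic-embed-l__float32_hnsw.index"
)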