Spaces:

droushb
/

NLP_RAG

Sleeping

App Files Files Community

droushb committed on Dec 8, 2024

Commit

dc75be1

1 Parent(s): a95bcbf

Initial commit for RAG Question Answering system

Browse files

Files changed (5) hide show

app.py +71 -0
config.py +14 -0
model/main.py +44 -0
model/questionAnsweringBot.py +28 -0
model/retriever.py +42 -0

app.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import streamlit as st
+from config import CONFIG
+from model.main import process_query
+st.title("RAG Question Answering System")
+# Instructions shown above the inputs. The corpus size is interpolated from
+# CONFIG['MAX_NUM_OF_RECORDS'] (the same value the retriever uses to cut the
+# dataset) so this text cannot drift from what is actually indexed.
+st.write(f"""
+Welcome to the Retrieval-Augmented Generation (RAG) Question Answering System.
+### What does this system do?
+- Searches through a collection of the first {CONFIG['MAX_NUM_OF_RECORDS']:,} documents of the dataset to find the most relevant information based on your question using **BM25** and **Semantic Search**.
+- Generates accurate answers using the retrieved documents with the power of **OpenAI API GPT-4o-mini**.
+- Provides citations for every piece of information to ensure transparency and trustworthiness.
+### Instructions
+1. **Enter your OpenAI API Key**: You can use your own key.
+2. **Ask Your Question**: Type your question in the input box.
+3. **Choose a Retrieval Method**:
+   - **BM25**: A keyword-based retrieval method.
+   - **Semantic Search**: A context-based retrieval method powered by embeddings.
+4. **Generate the Answer**: Click the "Generate Answer" button to retrieve relevant documents and generate a detailed answer.
+Feel free to experiment with different questions and retrieval methods to explore how the system performs!
+""")
+# The API key is typed in by the user on every session and handed to
+# process_query together with the question.
+llm_key = st.text_input("Enter your LLM API Key", type="password")
+if not llm_key:
+    st.warning("Please provide your LLM API Key to proceed.")
+    # Halt this script run here so none of the widgets below render without a key.
+    st.stop()
+query = st.text_input("Enter your question")
+retrieval_method = st.radio(
+    "Select Retrieval Method",
+    ("BM25", "Semantic Search")
+)
+# The label matches the "Generate Answer" button named in the instructions text.
+if st.button("Generate Answer"):
+    if not query.strip():
+        st.warning("Please enter a question to process.")
+    else:
+        with st.spinner("Processing your query..."):
+            # Top-level UI boundary: any failure is shown to the user instead of
+            # crashing the page.
+            try:
+                retrieved_docs, answer = process_query(llm_key, query, retrieval_method)
+                st.subheader("Retrieved Documents")
+                for doc in retrieved_docs:
+                    st.write(f"- {doc}")
+                st.subheader("Generated Answer")
+                st.text_area("Generated Answer", value=answer, height=CONFIG['TEXTAREA_HEIGHT'], disabled=True)
+            except Exception as e:
+                st.error(f"An error occurred: {e}")
+# Cosmetic styling applied to elements carrying the `.stTextArea` class:
+# green border, rounded corners, padding, font and a soft shadow.
+_TEXT_AREA_CSS = """
+    <style>
+    .stTextArea {
+        border: 2px solid #4CAF50;
+        border-radius: 8px;
+        padding: 10px;
+        font-family: Arial, sans-serif;
+        font-size: 14px;
+        box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.1);
+    }
+    </style>
+    """
+# unsafe_allow_html is required so the <style> tag is emitted as HTML rather
+# than rendered as literal text.
+st.markdown(_TEXT_AREA_CSS, unsafe_allow_html=True)

config.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import os
+from dotenv import load_dotenv
+load_dotenv()
+# Central settings read by the Streamlit app, the retriever and the
+# question-answering bot.
+CONFIG = {
+    # Dataset identifier passed to `datasets.load_dataset`.
+    "DATASET": "aalksii/ml-arxiv-papers",
+    # Number of leading records of the 'train' split that are chunked and indexed.
+    "MAX_NUM_OF_RECORDS": 1000,
+    # Height passed to the Streamlit text area that shows the generated answer.
+    "TEXTAREA_HEIGHT": 200,
+    # Default number of whitespace-separated words per chunk of an abstract.
+    "CHUNK_SIZE": 200,
+    # Model name sent with each chat completion request.
+    "OPENAI_ENGINE": "gpt-4o-mini",
+    # `max_tokens` value sent with each chat completion request.
+    "MAX_TOKENS": 500,
+    # Number of top-scoring chunks returned by either retrieval method.
+    "TOP_DOCS": 3
+}

model/main.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import streamlit as st
+from model.questionAnsweringBot import QuestionAnsweringBot
+from model.retriever import Retriever
+def process_query(llm_key, query, retrieval_method):
+    """Retrieve supporting chunks for *query* and generate an answer from them.
+    Args:
+        llm_key: API key handed to QuestionAnsweringBot.
+        query: The user's question.
+        retrieval_method: "BM25" selects BM25 retrieval; any other value selects
+            semantic (embedding) search.
+    Returns:
+        A tuple ``(retrieved_docs, answer)``: the list of retrieved text chunks
+        and the string returned by the bot.
+    """
+    if "retriever" not in st.session_state:
+        # Build and fully prepare the retriever BEFORE caching it. Storing it
+        # first would leave a half-initialised object in the session if loading
+        # failed, and every later call would skip this branch and break on its
+        # unset index/model.
+        retriever = Retriever()
+        print("Loading and preparing dataset...")
+        retriever.load_and_prepare_dataset()
+        retriever.prepare_bm25()
+        retriever.compute_embeddings()
+        st.session_state.retriever = retriever
+    retriever = st.session_state.retriever
+    if retrieval_method == "BM25":
+        print("Retrieving documents using BM25...")
+        retrieved_docs = retriever.retrieve_documents_bm25(query)
+    else:
+        print("Retrieving documents using Semantic Search...")
+        retrieved_docs = retriever.retrieve_documents_semantic(query)
+    bot = QuestionAnsweringBot(llm_key)
+    prompt = getPrompt(retrieved_docs, query)
+    answer = bot.generate_answer(prompt)
+    return retrieved_docs, answer
+def getPrompt(retrieved_docs, query):
+    """Assemble the LLM prompt: fixed rules, the numbered documents, then the query.
+    Documents are labelled "Document 1", "Document 2", ... in the order given,
+    one per line, followed by a blank line and the "Query:" line.
+    """
+    instructions = (
+        "You are an LM integrated into an RAG system that answers questions based on provided documents.\n"
+        "Rules:\n"
+        "- Reply with the answer only and nothing but the answer.\n"
+        "- Say 'I don't know' if you don't know the answer.\n"
+        "- Use only the provided documents.\n"
+        "- Citations are required. Include the document and chunk number in square brackets after the information (e.g., [Document 1, Chunk 2]).\n\n"
+        "Documents:\n"
+    )
+    numbered_docs = "".join(
+        f"Document {position}: {text}\n"
+        for position, text in enumerate(retrieved_docs, start=1)
+    )
+    return instructions + numbered_docs + f"\nQuery: {query}\n"

model/questionAnsweringBot.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import openai
+from config import CONFIG
+# Generic system-prompt text. No active code in this module sends it with a
+# request; it is kept as a ready-made system message.
+PROMPT = """
+You are a helpful assistant that can answer questions.
+Rules:
+- Reply with the answer only and nothing but the answer.
+- Say "I don't know" if you don't know the answer.
+- Use the provided context.
+"""
+class QuestionAnsweringBot:
+    """Sends a single user prompt to the OpenAI chat completions API.
+    Works with both generations of the `openai` package: releases from 1.0 on
+    expose the `openai.OpenAI` client class (and no longer support
+    `openai.ChatCompletion`), older releases only offer the module-level call.
+    """
+    def __init__(self, llm_key):
+        """Store the API key to use for this bot's requests.
+        Args:
+            llm_key: OpenAI API key.
+        """
+        self._llm_key = llm_key
+        if not hasattr(openai, "OpenAI"):
+            # Legacy package: the key can only be configured module-wide.
+            openai.api_key = llm_key
+    def generate_answer(self, prompt):
+        """Return the model's reply to *prompt* as a stripped string.
+        The prompt is sent as the only message, with role "user"; the
+        module-level PROMPT is not sent. Any exception raised while calling the
+        API is caught and returned as an error message string instead of being
+        raised.
+        """
+        request = {
+            "model": CONFIG['OPENAI_ENGINE'],
+            "messages": [{"role": "user", "content": prompt}],
+            "max_tokens": CONFIG['MAX_TOKENS'],
+        }
+        try:
+            if hasattr(openai, "OpenAI"):
+                # A per-call client keeps the key local to this bot instead of
+                # sharing it through process-wide module state.
+                client = openai.OpenAI(api_key=self._llm_key)
+                completion = client.chat.completions.create(**request)
+                content = completion.choices[0].message.content
+            else:
+                completion = openai.ChatCompletion.create(**request)
+                content = completion['choices'][0]['message']['content']
+            # The API may return no text content; treat that as an empty answer.
+            return (content or "").strip()
+        except Exception as e:
+            return f"An error occurred while generating the answer: {e}"

model/retriever.py ADDED Viewed

	@@ -0,0 +1,42 @@

+from datasets import load_dataset
+from config import CONFIG
+from rank_bm25 import BM25Okapi
+from sentence_transformers import SentenceTransformer, util
+class Retriever:
+    """Corpus of abstract chunks searchable by BM25 or by embedding similarity.
+    Usage order: `load_and_prepare_dataset()` first, then `prepare_bm25()`
+    and/or `compute_embeddings()`, and only then the matching
+    `retrieve_documents_*` method.
+    """
+    def __init__(self):
+        self.corpus = None  # flat list of text chunks
+        self.bm25 = None  # BM25Okapi index built over the tokenized corpus
+        self.model = None  # SentenceTransformer used for corpus and queries
+        self.chunk_embeddings = None  # embeddings of every corpus chunk
+    @staticmethod
+    def _tokenize(text):
+        """Lower-case *text* and split it on any whitespace.
+        Used for both corpus and queries so BM25 matching is case-insensitive
+        and never sees empty tokens from repeated spaces.
+        """
+        return text.lower().split()
+    def load_and_prepare_dataset(self):
+        """Load the configured dataset and flatten its abstracts into chunks."""
+        train = load_dataset(CONFIG['DATASET'])['train']
+        # Never ask for more rows than the split actually has.
+        limit = min(CONFIG['MAX_NUM_OF_RECORDS'], len(train))
+        subset = train.select(range(limit))
+        self.corpus = [
+            chunk
+            for abstract in subset['abstract']
+            for chunk in self.chunk_text(abstract)
+        ]
+    def prepare_bm25(self):
+        """Build the BM25 index over the loaded corpus."""
+        tokenized_corpus = [self._tokenize(doc) for doc in self.corpus]
+        self.bm25 = BM25Okapi(tokenized_corpus)
+    def compute_embeddings(self):
+        """Load the sentence-embedding model and embed every corpus chunk."""
+        self.model = SentenceTransformer('all-MiniLM-L6-v2')
+        self.chunk_embeddings = self.model.encode(self.corpus, convert_to_tensor=True)
+    def chunk_text(self, text, chunk_size=CONFIG['CHUNK_SIZE']):
+        """Split *text* into space-joined chunks of at most *chunk_size* words.
+        A missing (None) or empty text yields an empty list.
+        """
+        words = (text or "").split()
+        return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
+    def retrieve_documents_bm25(self, query):
+        """Return up to CONFIG['TOP_DOCS'] chunks with the highest BM25 score.
+        Raises:
+            RuntimeError: if `prepare_bm25()` has not been called yet.
+        """
+        if self.bm25 is None:
+            raise RuntimeError("BM25 index is not built; call prepare_bm25() first.")
+        scores = self.bm25.get_scores(self._tokenize(query))
+        # Rank chunk indices by score, best first, and keep the leading ones.
+        top_docs = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:CONFIG['TOP_DOCS']]
+        return [self.corpus[i] for i in top_docs]
+    def retrieve_documents_semantic(self, query):
+        """Return up to CONFIG['TOP_DOCS'] chunks most cosine-similar to *query*.
+        Raises:
+            RuntimeError: if `compute_embeddings()` has not been called yet.
+        """
+        if self.model is None or self.chunk_embeddings is None:
+            raise RuntimeError("Embeddings are not computed; call compute_embeddings() first.")
+        query_embedding = self.model.encode(query, convert_to_tensor=True)
+        scores = util.pytorch_cos_sim(query_embedding, self.chunk_embeddings)[0]
+        # topk raises when asked for more items than exist, so cap k at the corpus size.
+        k = min(CONFIG['TOP_DOCS'], len(self.corpus))
+        top_chunks = scores.topk(k).indices
+        return [self.corpus[int(i)] for i in top_chunks]