Spaces:

Hyma7
/

Rag_Implementation

Sleeping

App Files Files Community

Hyma7 commited on Jan 10

Commit

e354c74

verified ·

1 Parent(s): f3b7c20

Create app.py

Browse files

Files changed (1) hide show

app.py +121 -0

app.py ADDED Viewed

	@@ -0,0 +1,121 @@

+# Import necessary libraries
+import streamlit as st
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+from langchain.vectorstores import FAISS
+from langchain.embeddings.base import Embeddings
+from transformers import pipeline
+from langchain.llms.huggingface_pipeline import HuggingFacePipeline
+from langchain.chains import RetrievalQA
+# Define a LangChain-compatible wrapper for SentenceTransformer
+class SentenceTransformerEmbeddings(Embeddings):
+    """
+    Wrapper for SentenceTransformer to integrate with LangChain.
+    """
+    def __init__(self, model_name: str):
+        self.model = SentenceTransformer(model_name)
+    def embed_documents(self, texts):
+        """
+        Generates embeddings for a list of documents.
+        Args:
+        texts (list): List of strings to embed.
+        Returns:
+        np.ndarray: Embedding vectors.
+        """
+        return self.model.encode(texts, show_progress_bar=False)
+    def embed_query(self, text):
+        """
+        Generates an embedding for a single query.
+        Args:
+        text (str): Query string to embed.
+        Returns:
+        np.ndarray: Embedding vector.
+        """
+        return self.model.encode([text], show_progress_bar=False)[0]
+# Initialize the embedding model
+embedding_model = SentenceTransformerEmbeddings('all-MiniLM-L6-v2')
+# Preprocess data into descriptive text entries
+def preprocess_data(data):
+    """
+    Combines multiple dataset columns into descriptive text entries for embedding.
+    Args:
+    data (pd.DataFrame): The input dataset containing participant details.
+    Returns:
+    list: A list of combined textual descriptions for each row in the dataset.
+    """
+    combined_entries = []
+    for _, row in data.iterrows():
+        entry = f"Participant {row['ID']}:\n"
+        entry += f"- AI Knowledge Level: {row['Q1.AI_knowledge']}\n"
+        entry += f"- Sources of AI Knowledge: {row['Q2.AI_sources']}\n"
+        entry += f"- Perspectives on AI: Dehumanization ({row['Q3#1.AI_dehumanization']}), "
+        entry += f"Job Replacement ({row['Q3#2.Job_replacement']})\n"
+        entry += f"- Domains Impacted by AI: {row['Q6.Domains']}\n"
+        entry += f"- Utility Grade for AI: {row['Q7.Utility_grade']}\n"
+        entry += f"- GPA: {row['Q16.GPA']}\n"
+        combined_entries.append(entry)
+    return combined_entries
+# App logic
+def main():
+    # Set up the Streamlit UI
+    st.title("RAG Chatbot")
+    st.write("This chatbot answers questions based on the dataset.")
+    # Load the dataset directly from the space directory
+    dataset_path = "Survey_AI.csv"
+    try:
+        data = pd.read_csv(dataset_path)
+        st.write("Dataset successfully loaded!")
+        # Preprocess data and create vector store
+        combined_texts = preprocess_data(data)
+        vector_store = FAISS.from_texts(combined_texts, embedding_model)
+        retriever = vector_store.as_retriever()
+        # Set up QA chain
+        flan_t5 = pipeline("text2text-generation", model="google/flan-t5-base", device=-1)  # CPU mode
+        llm = HuggingFacePipeline(pipeline=flan_t5)
+        qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
+        # Default sample questions
+        sample_questions = [
+            "What are the sources of AI knowledge for participants?",
+            "Which domains are impacted by AI?",
+            "What are participants' perspectives on job replacement due to AI?",
+            "What is the average GPA of participants?",
+            "What is the utility grade for AI?",
+            "Which participants view AI as highly beneficial in their domain?"
+        ]
+        st.subheader("Sample Questions")
+        selected_question = st.selectbox("Select a question to see the response:", [""] + sample_questions)
+        if selected_question:
+            response = qa_chain.run(selected_question)
+            st.write("Question:", selected_question)
+            st.write("Answer:", response)
+        # Custom user query
+        st.subheader("Custom Query")
+        query = st.text_input("Or, enter your own question:")
+        if query:
+            response = qa_chain.run(query)
+            st.write("Question:", query)
+            st.write("Answer:", response)
+    except FileNotFoundError:
+        st.error("Dataset file not found. Please ensure the file is named 'dataset.csv' and uploaded to the root directory.")
+# Run the app
+if __name__ == "__main__":
+    main()