Spaces: Runtime error
Upload 6 files
Browse files
- app.py +42 -0
- chatbot.py +145 -0
- data_ingester.py +82 -0
- data_loader.py +53 -0
- data_query.py +97 -0
- requirements.txt +193 -0
app.py
ADDED
@@ -0,0 +1,42 @@
import os

import streamlit as st
from dotenv import load_dotenv

from chatbot import RAGChatbot

# Load environment variables (PINECONE_API_KEY, OPENAI_API_KEY) from a .env file
load_dotenv()


@st.cache_resource
def initialize_chatbot():
    # Build the chatbot once per session with the API key and index name
    return RAGChatbot(
        pinecone_api_key=os.getenv('PINECONE_API_KEY'),
        index_name='test',
    )


chatbot = initialize_chatbot()

# Streamlit app layout
st.title("RAG Chatbot")
st.write("Ask the chatbot anything and get real-time responses.")

# Input prompt from the user
prompt = st.text_input("Enter your prompt:", "")

if prompt:
    # Query the chatbot and get the response
    response = chatbot.query_chatbot(prompt, k=15, rerank=True)

    # query_chatbot returns a dict on success, a bare string when the
    # retriever finds nothing, and None on failure -- handle all three
    if isinstance(response, dict):
        st.subheader("LLM Response")
        st.write(response['response'])

        # Display the retrieved (optionally reranked) documents with metadata
        st.subheader("Relevant Documents")
        for i, doc in enumerate(response.get('context_docs', [])):
            st.write(f"**Document {i + 1} Metadata:**")
            st.json(doc.metadata)  # Metadata as JSON for better structure
    elif isinstance(response, str):
        st.subheader("LLM Response")
        st.write(response)
    else:
        st.error("The chatbot failed to produce a response.")
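A quick way to exercise the same pipeline outside Streamlit is sketched below; it assumes the same .env file (with PINECONE_API_KEY and OPENAI_API_KEY) that app.py loads, and an existing 'test' index.

# Sketch: smoke-test the chatbot from a plain Python shell.
from dotenv import load_dotenv
from chatbot import RAGChatbot

load_dotenv()
bot = RAGChatbot(index_name='test')  # API key falls back to the environment
result = bot.query_chatbot("Hello, what do you know about?", k=5)
print(result['response'] if isinstance(result, dict) else result)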
chatbot.py
ADDED
@@ -0,0 +1,145 @@
import os
import time

import torch
import torch.nn.functional as F
import yaml
from getpass import getpass
from langchain_pinecone import PineconeVectorStore
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from pinecone import Pinecone, ServerlessSpec
from ragatouille import RAGPretrainedModel
from transformers import AutoModel

from data_ingester import ChatbotDataIngester
from data_query import ChatbotDataQuery


class CustomReranker:
    def __init__(self, model_name="nvidia/NV-Embed-v2", max_length=32768):
        """
        Initialize the reranker with the embedding model.
        """
        self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True, device_map="auto")
        self.max_length = max_length

    def _encode(self, texts, instruction=""):
        """
        Helper to encode the input texts with the model.
        """
        return self.model.encode(texts, instruction=instruction, max_length=self.max_length)

    def rerank(self, query, passages, k=1):
        """
        Rerank the passages by their similarity to the query.

        Args:
        - query (str): The query text.
        - passages (list of str): Passages to rerank.
        - k (int): Number of top documents to return after reranking.

        Returns:
        - The top-k passages with their similarity scores and original indices.
        """
        query_prefix = "Instruct: Given a question, retrieve passages that answer the question\nQuery: "
        passage_prefix = ""

        # Embed the query and the passages
        query_embeddings = self._encode([query], instruction=query_prefix)
        passage_embeddings = self._encode(passages, instruction=passage_prefix)

        # L2-normalize so the dot product below equals cosine similarity
        query_embeddings = F.normalize(query_embeddings, p=2, dim=1)
        passage_embeddings = F.normalize(passage_embeddings, p=2, dim=1)

        # Similarity scores, scaled to [0, 100]
        scores = (query_embeddings @ passage_embeddings.T) * 100
        scores = scores.tolist()[0]

        # Sort passages by score, keeping each one's index into the input list
        sorted_passages = sorted(
            [{"content": passage, "score": score, "result_index": idx}
             for idx, (passage, score) in enumerate(zip(passages, scores))],
            key=lambda x: x['score'], reverse=True
        )

        return sorted_passages[:k]  # Return top-k reranked passages
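A standalone check of the reranker is sketched below; the passages are made up, and loading nvidia/NV-Embed-v2 requires a GPU with substantial memory.

# Sketch: rerank two dummy passages against a query.
reranker = CustomReranker()
top = reranker.rerank(
    query="What is retrieval-augmented generation?",
    passages=[
        "RAG combines a retriever with a generator to ground answers in documents.",
        "The Eiffel Tower is located in Paris.",
    ],
    k=1,
)
print(top[0]['score'], top[0]['content'])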
class RAGChatbot:
    def __init__(self, pinecone_api_key=None, index_name="test-index", config_path="../config.yml"):
        """
        Initialize the RAGChatbot: embeddings, vector store, data ingestion, and query.
        """
        self.pinecone_api_key = pinecone_api_key or os.getenv("PINECONE_API_KEY")  # or getpass("Enter your Pinecone API key: ")
        self.index_name = index_name
        self.embeddings = self.initialize_embeddings()
        self.dimensions = len(self.embeddings.embed_query("Hello World!"))
        self.vector_store = self.initialize_vector_store()
        self.data_ingester = ChatbotDataIngester(vector_store=self.vector_store, embeddings=self.embeddings)
        self.data_query = ChatbotDataQuery(vector_store=self.vector_store)
        # self.reranker = self.initialize_reranker()
        # self.reranker = CustomReranker()

    def load_config(self, config_path):
        """
        Load the configuration file (config.yml). Currently unused.
        """
        with open(config_path, 'r') as file:
            return yaml.safe_load(file)

    def initialize_embeddings(self):
        """
        Initialize the embedding model (hardcoded to BGE rather than read
        from the config file).
        """
        model_name = "BAAI/bge-large-en-v1.5"
        model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
        encode_kwargs = {"normalize_embeddings": True}
        return HuggingFaceBgeEmbeddings(
            model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

    def initialize_reranker(self):
        """
        Initialize the ColBERT reranker.
        """
        return RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

    def initialize_vector_store(self):
        """
        Initialize the Pinecone vector store, creating the serverless index
        if it does not exist yet.
        """
        pc = Pinecone(api_key=self.pinecone_api_key)
        existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

        if self.index_name not in existing_indexes:
            pc.create_index(
                name=self.index_name,
                dimension=self.dimensions,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1"),
            )
            # Wait until the new index is ready to accept requests
            while not pc.describe_index(self.index_name).status["ready"]:
                time.sleep(1)

        return PineconeVectorStore(index=pc.Index(self.index_name), embedding=self.embeddings)

    def ingest_data(self, dir_path, empty=False):
        """
        Ingest data from a directory using the ChatbotDataIngester.
        """
        self.data_ingester.load_and_ingest(dir_path, empty_db=empty)

    def query_chatbot(self, query_text, k=1, rerank=False):  # other search params (fetch_k, lambda_mult) unused
        """
        Query the chatbot with the given text and optional search parameters.
        Note: both branches are currently identical because the reranker is
        commented out; rerank has no effect until it is re-enabled.
        """
        if rerank:
            response = self.data_query.query(
                query_text=query_text,
                k=k,
                # reranker=self.reranker
            )
        else:
            response = self.data_query.query(
                query_text=query_text,
                k=k,
            )
        return response
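The intended end-to-end flow, sketched under the assumption of a ./docs folder of .docx/.pdf files (the path is illustrative) and API keys in the environment:

# Sketch: build the chatbot, (re)ingest a document folder, then query it.
bot = RAGChatbot(index_name="test")
bot.ingest_data("./docs", empty=True)  # empty=True clears the index first
answer = bot.query_chatbot("What is the refund policy?", k=10)
if isinstance(answer, dict):
    print(answer["response"])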
data_ingester.py
ADDED
@@ -0,0 +1,82 @@
import os
from uuid import uuid4

from langchain_core.documents import Document
from langchain_text_splitters import SpacyTextSplitter

from data_loader import ChatbotDataLoader


class ChatbotDataIngester:
    def __init__(self, vector_store, embeddings):
        """
        Initialize the ChatbotDataIngester with an external vector store and
        embeddings model. Raise an exception if either of them is missing.
        """
        if vector_store in [None, '']:
            raise ValueError("Vector store cannot be None/empty")
        if embeddings in [None, '']:
            raise ValueError("Embeddings model cannot be None/empty")

        self.loader = ChatbotDataLoader()
        self.vector_store = vector_store
        self.embeddings = embeddings
        # SpacyTextSplitter takes a single separator string, not the list of
        # separators that RecursiveCharacterTextSplitter accepts
        self.text_splitter = SpacyTextSplitter(
            separator="\n\n",
            chunk_size=1000,
            chunk_overlap=200,
        )

    def embed_content(self, content):
        """
        Embed the text content using the provided embedding model.
        """
        return self.embeddings.embed_query(content)

    def load_and_ingest(self, dir_path, empty_db=False):
        """
        Load documents from the directory, split them into chunks, and ingest
        them into the vector store.

        :param dir_path: Directory path to load the documents from.
        :param empty_db: If True, the vector store is emptied before adding new documents.
        """
        # Optionally clear the vector store
        if empty_db:
            self.clear_vector_store()

        # Load files from the directory
        file_contents = self.loader.load_directory(dir_path)

        # Create documents from the file contents
        documents = [
            Document(page_content=content, metadata={"source": file_path})
            for file_path, content in file_contents.items()
        ]

        split_docs = self.text_splitter.split_documents(documents)

        # Generate UUIDs for the chunks
        uuids = [str(uuid4()) for _ in range(len(split_docs))]

        print(f'{len(documents)} documents split into {len(split_docs)} chunks')

        # Ingest the chunks into the vector store
        self.ingest_to_vector_store(split_docs, uuids)

    def clear_vector_store(self):
        """
        Delete all documents in the vector store.
        """
        try:
            self.vector_store.delete(delete_all=True)
            print("Cleared the vector store.")
        except Exception as e:
            print(f"Failed to clear the vector store: {str(e)}")

    def ingest_to_vector_store(self, documents, uuids):
        """
        Add the document chunks to the vector store.
        """
        try:
            self.vector_store.add_documents(documents, ids=uuids)
            print(f'Ingested {len(documents)} chunks to the vector store')
        except Exception as e:
            print(f'Failed to ingest documents: {str(e)}')
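The chunking settings can be sanity-checked in isolation, as sketched below; the sample text is synthetic, and the splitter needs the en_core_web_sm spaCy model pinned in requirements.txt.

# Sketch: confirm chunk_size/chunk_overlap behave as expected on dummy text.
from langchain_text_splitters import SpacyTextSplitter

splitter = SpacyTextSplitter(separator="\n\n", chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_text("This is a filler sentence. " * 150)
print(f"{len(chunks)} chunks; first chunk has {len(chunks[0])} characters")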
data_loader.py
ADDED
@@ -0,0 +1,53 @@
import os

import PyPDF2
from docx import Document


class ChatbotDataLoader:
    def __init__(self):
        pass

    def read_docx(self, file_path):
        """
        Reads content from a .docx file.
        """
        doc = Document(file_path)
        return "\n".join(para.text for para in doc.paragraphs)

    def read_pdf(self, file_path):
        """
        Reads content from a .pdf file.
        """
        with open(file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            content = ""
            for page in reader.pages:
                # Guard against pages with no extractable text
                content += page.extract_text() or ""
        return content

    def load_file(self, file_path):
        """
        Reads content from a .docx or .pdf file based on the file extension.
        """
        if file_path.endswith(".docx"):
            return self.read_docx(file_path)
        elif file_path.endswith(".pdf"):
            return self.read_pdf(file_path)
        else:
            raise ValueError(f"Unsupported file type: {file_path}")

    def load_directory(self, dir_path):
        """
        Walks the directory, loads all .docx and .pdf files, and returns a
        mapping of file path to content.
        """
        file_contents = {}
        for root, _, files in os.walk(dir_path):
            for file in files:
                file_path = os.path.join(root, file)
                if file.endswith((".docx", ".pdf")):
                    try:
                        file_contents[file_path] = self.load_file(file_path)
                    except Exception as e:
                        print(f"Failed to load {file_path}: {str(e)}")
        return file_contents
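Typical usage of the loader on its own, sketched with an illustrative path:

# Sketch: load every .docx/.pdf under a folder and report sizes.
loader = ChatbotDataLoader()
contents = loader.load_directory("./docs")
for path, text in contents.items():
    print(f"{path}: {len(text)} characters")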
data_query.py
ADDED
@@ -0,0 +1,97 @@
import os

from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain


def generate_openai_response(input_prompt):
    """
    Fallback used when the retriever returns zero documents: answer from the
    LLM's internal knowledge instead of retrieved context.
    """
    print('In generate_openai_response')
    system_prompt = """You are an assistant designed to provide answers when no (0) relevant documents are retrieved from the vector database. When this happens, you should follow these steps:
1) First, determine if you can answer the user's query using general knowledge or internal information. If so, generate a confident, helpful response in a straightforward narrative style. Do not use phrases such as 'According to me,' 'As of my knowledge,' 'I don't know but,' or mention knowledge cutoffs or lack of information. Simply provide the answer as if you are certain of the facts.
2) If the question is domain-specific, too specific (e.g., about a particular person or object that could mislead), or outside your knowledge, do not attempt to answer. Politely respond with: 'I'm sorry, I currently do not have enough information to answer your question.'"""
    llm = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY"))
    return ("The number of retrieved documents from the RAG pipeline was 0, "
            "so the answer is based on the LLM's internal knowledge.\n"
            + llm.invoke(system_prompt + input_prompt).content)

+
class ChatbotDataQuery:
|
20 |
+
def __init__(self, vector_store):
|
21 |
+
self.llm = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY"))
|
22 |
+
|
23 |
+
self.system_prompt = '''You are Wagner, a highly intelligent and friendly AI assistant...'''
|
24 |
+
|
25 |
+
if vector_store is None:
|
26 |
+
raise ValueError("Vector store cannot be None")
|
27 |
+
else:
|
28 |
+
self.vector_store = vector_store
|
29 |
+
|
30 |
+
def initialize_reranker(self):
|
31 |
+
"""
|
32 |
+
Initialize the custom reranker.
|
33 |
+
"""
|
34 |
+
return CustomReranker()
|
35 |
+
|
36 |
+
def __generate_response(self, query_text, retriever, reranker=None, reranker_docs=0):
|
37 |
+
context_docs = retriever.invoke(query_text)
|
38 |
+
if len(context_docs) == 0:
|
39 |
+
response = genetare_openai_response(input_prompt=query_text)
|
40 |
+
return response
|
41 |
+
|
42 |
+
context_docs_texts = [doc.page_content for doc in context_docs]
|
43 |
+
|
44 |
+
if reranker is not None and reranker_docs > 0:
|
45 |
+
# Use the custom reranker to rerank the context_docs
|
46 |
+
relevant_docs = reranker.rerank(query_text, context_docs_texts, k=reranker_docs)
|
47 |
+
|
48 |
+
final_reranked_docs = []
|
49 |
+
for reranked_doc in relevant_docs:
|
50 |
+
idx_of_content_in_context_doc = reranked_doc['result_index']
|
51 |
+
meta_data = context_docs[idx_of_content_in_context_doc].metadata
|
52 |
+
final_reranked_docs.append(Document(page_content=reranked_doc['content'], metadata=meta_data))
|
53 |
+
|
54 |
+
context_docs = final_reranked_docs
|
55 |
+
|
56 |
+
prompt = ChatPromptTemplate.from_template(
|
57 |
+
"You are a helpful assistant that only answers questions about the context. "
|
58 |
+
"You try your best to extract the relevant answers from the context. "
|
59 |
+
"The context is:\n\n{context}\n\n"
|
60 |
+
"Question: {question}\n"
|
61 |
+
"Helpful Answer:"
|
62 |
+
)
|
63 |
+
|
64 |
+
print(f'---\nThe Retrieved Documents are:')
|
65 |
+
for idx, doc in enumerate(context_docs):
|
66 |
+
print(idx, '-', doc.metadata)
|
67 |
+
print('---\n\n')
|
68 |
+
|
69 |
+
chain = create_stuff_documents_chain(
|
70 |
+
llm=self.llm,
|
71 |
+
prompt=prompt,
|
72 |
+
document_variable_name="context",
|
73 |
+
)
|
74 |
+
|
75 |
+
context = '\n\n'.join([doc.page_content for doc in context_docs])
|
76 |
+
query = [
|
77 |
+
("system", f"{self.system_prompt}"),
|
78 |
+
("human", f"context: {context}\nInput: {query_text}"),
|
79 |
+
]
|
80 |
+
|
81 |
+
response = ''
|
82 |
+
for chunk in self.llm.stream(query):
|
83 |
+
response += chunk.content
|
84 |
+
return {'response': response, 'context_docs': context_docs}
|
85 |
+
# yield chunk.content
|
86 |
+
# return context_docs
|
87 |
+
|
88 |
+
def query(self, query_text, k=1, reranker=None):
|
89 |
+
retriever = self.vector_store.as_retriever(
|
90 |
+
search_kwargs={"k": k},
|
91 |
+
search_type="similarity",
|
92 |
+
)
|
93 |
+
try:
|
94 |
+
return self.__generate_response(query_text=query_text, retriever=retriever, reranker=reranker, reranker_docs=k//2)
|
95 |
+
except Exception as e:
|
96 |
+
print(f"Failed to retrieve documents: {str(e)}")
|
97 |
+
return None
|
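A sketch of querying with the reranker wired in, assuming a vector_store built as in chatbot.py (the variable is a placeholder here):

# Sketch: query with the custom reranker enabled.
query_engine = ChatbotDataQuery(vector_store=vector_store)
reranker = query_engine.initialize_reranker()
result = query_engine.query("What does the handbook say about leave?", k=10, reranker=reranker)
if isinstance(result, dict):
    print(result['response'])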
requirements.txt
ADDED
@@ -0,0 +1,193 @@
accelerate==0.34.2
aiohttp==3.9.5
aiosignal==1.3.1
annotated-types==0.7.0
anyio==4.5.0
asttokens==2.4.1
attrs==24.2.0
beautifulsoup4==4.12.3
bitarray==2.9.2
blinker==1.8.2
blis==0.7.11
catalogue==2.0.10
certifi==2024.8.30
charset-normalizer==3.3.2
click==8.1.7
cloudpathlib==0.19.0
colbert-ai==0.2.19
comm==0.2.2
confection==0.1.5
cymem==2.0.8
dataclasses-json==0.6.7
datasets==3.0.0
debugpy==1.8.5
decorator==5.1.1
Deprecated==1.2.14
dill==0.3.8
dirtyjson==1.0.8
distro==1.9.0
einops==0.8.0
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
executing==2.1.0
faiss-cpu==1.8.0.post1
fast-pytorch-kmeans==0.2.0.1
filelock==3.16.1
Flask==3.0.3
frozenlist==1.4.1
fsspec==2024.6.1
git-python==1.0.3
gitdb==4.0.11
GitPython==3.1.43
greenlet==3.1.0
h11==0.14.0
httpcore==1.0.5
httpx==0.27.2
huggingface-hub==0.25.0
idna==3.10
InstructorEmbedding==1.0.1
ipykernel==6.29.5
ipython==8.27.0
itsdangerous==2.2.0
jedi==0.19.1
Jinja2==3.1.4
jiter==0.5.0
joblib==1.4.2
jsonpatch==1.33
jsonpointer==3.0.0
jupyter_client==8.6.3
jupyter_core==5.7.2
langchain==0.3.0
langchain-community==0.3.0
langchain-core==0.3.2
langchain-huggingface==0.1.0
langchain-openai==0.2.0
langchain-pinecone==0.2.0
langchain-text-splitters==0.3.0
langcodes==3.4.0
langsmith==0.1.125
language_data==1.2.0
llama-cloud==0.1.0
llama-index==0.11.13
llama-index-agent-openai==0.3.4
llama-index-cli==0.3.1
llama-index-core==0.11.13.post1
llama-index-embeddings-openai==0.2.5
llama-index-indices-managed-llama-cloud==0.3.1
llama-index-legacy==0.9.48.post3
llama-index-llms-openai==0.2.9
llama-index-multi-modal-llms-openai==0.2.1
llama-index-program-openai==0.2.0
llama-index-question-gen-openai==0.2.0
llama-index-readers-file==0.2.2
llama-index-readers-llama-parse==0.3.0
llama-parse==0.5.6
lxml==5.3.0
marisa-trie==1.2.0
markdown-it-py==3.0.0
MarkupSafe==2.1.5
marshmallow==3.22.0
matplotlib-inline==0.1.7
mdurl==0.1.2
mpmath==1.3.0
multidict==6.1.0
multiprocess==0.70.16
murmurhash==1.0.10
mypy-extensions==1.0.0
nest-asyncio==1.6.0
networkx==3.3
ninja==1.11.1.1
nltk==3.9.1
numpy==1.26.4
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.6.68
nvidia-nvtx-cu12==12.1.105
onnx==1.16.2
openai==1.46.1
orjson==3.10.7
packaging==24.1
pandas==2.2.2
parso==0.8.4
pexpect==4.9.0
pillow==10.4.0
pinecone-client==5.0.1
pinecone-plugin-inference==1.1.0
pinecone-plugin-interface==0.0.7
platformdirs==4.3.6
preshed==3.0.9
prompt_toolkit==3.0.47
protobuf==5.28.2
psutil==6.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
pyarrow==17.0.0
pydantic==2.9.2
pydantic-settings==2.5.2
pydantic_core==2.23.4
Pygments==2.18.0
pynvml==11.5.3
pypdf==4.3.1
PyPDF2==3.0.1
python-dateutil==2.9.0.post0
python-docx==1.1.2
python-dotenv==1.0.1
pytz==2024.2
PyYAML==6.0.2
pyzmq==26.2.0
RAGatouille==0.0.8.post4
regex==2024.9.11
requests==2.32.3
rich==13.8.1
safetensors==0.4.5
scikit-learn==1.5.2
scipy==1.14.1
sentence-transformers==2.7.0
sentencepiece==0.2.0
shellingham==1.5.4
six==1.16.0
smart-open==7.0.4
smmap==5.0.1
sniffio==1.3.1
soupsieve==2.6
spacy==3.7.6
spacy-legacy==3.0.12
spacy-loggers==1.0.5
SQLAlchemy==2.0.35
srsly==2.4.8
stack-data==0.6.3
striprtf==0.0.26
sympy==1.13.3
tenacity==8.5.0
thinc==8.2.5
threadpoolctl==3.5.0
tiktoken==0.7.0
tokenizers==0.19.1
torch==2.4.1
torchvision==0.19.1
tornado==6.4.1
tqdm==4.66.5
traitlets==5.14.3
transformers==4.44.2
triton==3.0.0
typer==0.12.5
typing-inspect==0.9.0
typing_extensions==4.12.2
tzdata==2024.1
ujson==5.10.0
urllib3==2.2.3
voyager==2.0.9
wasabi==1.1.3
wcwidth==0.2.13
weasel==0.4.1
Werkzeug==3.0.4
wrapt==1.16.0
xxhash==3.5.0
yarl==1.11.1