cleaned up code

Files changed:

- BuildingAChainlitApp.md (+42, -23)
- aimakerspace/vectordatabase.py (+0, -68)
- app.py (+72, -89)
- richard/__init__.py (new, empty)
- richard/pipeline.py (+27, -0)
- richard/text_utils.py (+46, -0)
- richard/vector_database.py (+112, -0)
BuildingAChainlitApp.md (CHANGED)

````diff
@@ -257,34 +257,53 @@ Code was modified to support pdf documents in the following areas:
 
 2) change process_text_file() function to handle .pdf files
 
-[4 lines removed; content not captured in the extracted view]
+- refactor the code to do all file handling in richard.text_utils
+- app calls process_file, optionally passing in the text splitter function
+- default text splitter function is CharacterTextSplitter
+```python
+texts = process_file(file)
+```
+- load_file() function does the following
+  - read the uploaded document into a temporary file
+  - identify the file extension
+  - process a .txt file as before resulting in the texts list
+  - if the file is .pdf use the PyMuPDF library to read each page and extract the text and add it to texts list
+  - use the passed in text splitter function to split the documents
 
 ```python
-[15 lines removed (the old process_text_file() listing); content not captured in the extracted view]
+def load_file(self, file, text_splitter=CharacterTextSplitter()):
+    file_extension = os.path.splitext(file.name)[1].lower()
+    with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=file_extension) as temp_file:
+        self.temp_file_path = temp_file.name
+        temp_file.write(file.content)
+
+    if os.path.isfile(self.temp_file_path):
+        if self.temp_file_path.endswith(".txt"):
+            self.load_text_file()
+        elif self.temp_file_path.endswith(".pdf"):
+            self.load_pdf_file()
+        else:
+            raise ValueError(
+                f"Unsupported file type: {self.temp_file_path}"
+            )
+        return text_splitter.split_texts(self.documents)
+    else:
+        raise ValueError(
+            "Not a file"
+        )
+
+def load_text_file(self):
+    with open(self.temp_file_path, "r", encoding=self.encoding) as f:
+        self.documents.append(f.read())
+
+def load_pdf_file(self):
+    print("load_pdf_file()")
+    pdf_document = fitz.open(self.temp_file_path)
+    print(len(pdf_document))
     for page_num in range(len(pdf_document)):
         page = pdf_document.load_page(page_num)
         text = page.get_text()
-            documents.append(text)
-        texts = text_splitter.split_texts(documents)
-    else:
-        raise ValueError("Unsupported file type")
+        self.documents.append(text)
 ```
 
 3) Test the handling of .pdf and .txt files
````
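Since `load_file()` only ever calls `split_texts()` on the splitter it is handed, the optional argument accepts any object with that method, not just `CharacterTextSplitter`. A minimal sketch of passing a custom splitter (`ParagraphSplitter` and `FakeUpload` are hypothetical, not part of the commit):

```python
from richard.text_utils import FileLoader

class ParagraphSplitter:
    """Hypothetical splitter: any object with split_texts(List[str]) -> List[str] works."""
    def split_texts(self, documents):
        return [p for doc in documents for p in doc.split("\n\n") if p.strip()]

class FakeUpload:
    """Stands in for chainlit.types.AskFileResponse: needs .name and .content."""
    name = "notes.txt"
    content = b"First paragraph.\n\nSecond paragraph."

texts = FileLoader().load_file(FakeUpload(), text_splitter=ParagraphSplitter())
print(texts)  # ['First paragraph.', 'Second paragraph.']
```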
aimakerspace/vectordatabase.py (CHANGED)

```diff
@@ -52,77 +52,9 @@ class VectorDatabase:
         for text, embedding in zip(list_of_text, embeddings):
             self.insert(text, np.array(embedding))
         return self
-import hashlib
-from qdrant_client import QdrantClient
-from qdrant_client.http.models import PointStruct
-class QdrantDatabase:
-    def __init__(self, qdrant_client: QdrantClient, collection_name: str, embedding_model=None):
-        self.qdrant_client = qdrant_client
-        self.collection_name = collection_name
-        self.embedding_model = embedding_model or EmbeddingModel()
-        self.vectors = defaultdict(np.array)  # Still keeps a local copy if needed
-
-    def string_to_int_id(self, s: str) -> int:
-        return int(hashlib.sha256(s.encode('utf-8')).hexdigest(), 16) % (10**8)
-
-    def insert(self, key: str, vector: np.array) -> None:
-
-        point_id = self.string_to_int_id(key)
-        # Insert vector into Qdrant
-        payload = {"text": key}  # Storing the key (text) as payload
-        point = PointStruct(
-            id=point_id,
-            vector={"default": vector.tolist()},  # Use the vector name defined in the collection
-            payload=payload
-        )
-
-        # Insert the vector into Qdrant with the associated document
-        self.qdrant_client.upsert(
-            collection_name=self.collection_name,
-            points=[point]  # Qdrant expects a list of PointStruct
-        )
-
-    def search(
-        self,
-        query_vector: np.array,
-        k: int,
-        distance_measure: Callable = None,
-    ) -> List[Tuple[str, float]]:
-        # Perform search in Qdrant
-        print(query_vector)
-        if isinstance(query_vector, list):
-            query_vector = np.array(query_vector)
-
-        search_results = self.qdrant_client.search(
-            collection_name=self.collection_name,
-            query_vector={"name": "default", "vector": query_vector.tolist()},  # Convert numpy array to list
-            limit=k
-        )
-
-        # Extract and return results
-        return [(result.payload['text'], result.score) for result in search_results]
 
-    def search_by_text(
-        self,
-        query_text: str,
-        k: int,
-        distance_measure: Callable = None,
-        return_as_text: bool = False,
-    ) -> List[Tuple[str, float]]:
-        query_vector = self.embedding_model.get_embedding(query_text)
-        results = self.search(query_vector, k, distance_measure)
-        return [result[0] for result in results] if return_as_text else results
 
-    def retrieve_from_key(self, key: str) -> np.array:
-        # Retrieve from local cache
-        return self.vectors.get(key, None)
 
-    async def abuild_from_list(self, list_of_text: List[str]) -> "QdrantDatabase":
-        embeddings = await self.embedding_model.async_get_embeddings(list_of_text)
-        for text, embedding in zip(list_of_text, embeddings):
-            self.insert(text, np.array(embedding))
-        return self
-
 if __name__ == "__main__":
     list_of_text = [
         "I like to eat broccoli and bananas.",
```
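After this removal, aimakerspace/vectordatabase.py again holds only the in-memory VectorDatabase; the Qdrant-backed class lives in richard/vector_database.py (diffed below). For reference, a sketch of driving the surviving class, mirroring the wiring app.py uses (`texts` is a list of chunk strings; an OpenAI key is needed at run time):

```python
import asyncio
from aimakerspace.vectordatabase import VectorDatabase

async def build_and_query(texts):
    # Embeds every chunk and stores the vectors in-process (a dict, no server).
    vector_db = await VectorDatabase().abuild_from_list(texts)
    # Same retrieval call the pipeline makes: top-k chunks with scores.
    return vector_db.search_by_text("What foods are mentioned?", k=2)

# asyncio.run(build_and_query(["I like to eat broccoli and bananas.", "..."]))
```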
app.py (CHANGED)

```diff
@@ -1,20 +1,27 @@
 import os
-from typing import List
 from chainlit.types import AskFileResponse
-
+
 from aimakerspace.openai_utils.prompts import (
     UserRolePrompt,
     SystemRolePrompt,
     AssistantRolePrompt,
 )
 from aimakerspace.openai_utils.embedding import EmbeddingModel
-from aimakerspace.vectordatabase import VectorDatabase
+from aimakerspace.vectordatabase import VectorDatabase
 from aimakerspace.openai_utils.chatmodel import ChatOpenAI
 import chainlit as cl
-import
+from richard.text_utils import FileLoader
+from richard.pipeline import RetrievalAugmentedQAPipeline
+# from richard.vector_database import QdrantDatabase
+from qdrant_client import QdrantClient
+from langchain.vectorstores import Qdrant
+
 
 system_template = """\
-Use the following context to answer a users question.
+Use the following context to answer a users question.
+If you cannot find the answer in the context, say you don't know the answer.
+The context contains the text from a document. Refer to it as the document not the context.
+"""
 system_role_prompt = SystemRolePrompt(system_template)
 
 user_prompt_template = """\
@@ -26,65 +33,39 @@ Question:
 """
 user_role_prompt = UserRolePrompt(user_prompt_template)
 
-[3 lines removed (start of the old in-app RetrievalAugmentedQAPipeline class); content not captured in the extracted view]
-        self.vector_db_retriever = vector_db_retriever
-
-    async def arun_pipeline(self, user_query: str):
-        context_list = self.vector_db_retriever.search_by_text(user_query, k=4)
-
-        context_prompt = ""
-        for context in context_list:
-            context_prompt += context[0] + "\n"
-
-        formatted_system_prompt = system_role_prompt.create_message()
-
-        formatted_user_prompt = user_role_prompt.create_message(question=user_query, context=context_prompt)
-
-        async def generate_response():
-            async for chunk in self.llm.astream([formatted_system_prompt, formatted_user_prompt]):
-                yield chunk
-
-        return {"response": generate_response(), "context": context_list}
-
-text_splitter = CharacterTextSplitter()
-
-
-def process_text_file(file: AskFileResponse):
-    import tempfile
-
-    file_extension = os.path.splitext(file.name)[1].lower()
-
-    with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=file_extension) as temp_file:
-        temp_file_path = temp_file.name
-        temp_file.write(file.content)
-
-    if file_extension == ".txt":
-        with open(temp_file_path, "r", encoding="utf-8") as f:
-            text_loader = TextFileLoader(temp_file_path)
-            documents = text_loader.load_documents()
-            texts = text_splitter.split_texts(documents)
-
-    elif file_extension == ".pdf":
-        pdf_document = fitz.open(temp_file_path)
-        documents = []
-        for page_num in range(len(pdf_document)):
-            page = pdf_document.load_page(page_num)
-            text = page.get_text()
-            documents.append(text)
-        texts = text_splitter.split_texts(documents)
-    else:
-        raise ValueError("Unsupported file type")
-
-    return texts
-
+def process_file(file: AskFileResponse):
+    fileLoader = FileLoader()
+    return fileLoader.load_file(file)
 
 
 @cl.on_chat_start
 async def on_chat_start():
+    res = await cl.AskActionMessage(
+        content="Do you want to use Qdrant?",
+        actions=[
+            cl.Action(name="yes", value="yes", label="✅ Yes"),
+            cl.Action(name="no", value="no", label="❌ No"),
+        ],
+    ).send()
+    use_qdrant = False
+    use_qdrant_type = "Local"
+    if res and res.get("value") == "yes":
+        use_qdrant = True
+        local_res = await cl.AskActionMessage(
+            content="Do you want to use local or cloud?",
+            actions=[
+                cl.Action(name="Local", value="Local", label="✅ Local"),
+                cl.Action(name="Cloud", value="Cloud", label="❌ Cloud"),
+            ],
+        ).send()
+        if local_res and local_res.get("value") == "Cloud":
+            use_qdrant_type = "Cloud"
+            msg = cl.Message(
+                content=f"Sorry - the Qdrant processing has been temporarily disconnected"
+            )
+            await msg.send()
+            use_qdrant = False
     files = None
-
     # Wait for the user to upload a file
     while not files:
         files = await cl.AskFileMessage(
@@ -102,63 +83,65 @@ async def on_chat_start():
     await msg.send()
 
     # load the file
-    texts =
+    texts = process_file(file)
 
     msg = cl.Message(
         content=f"Resulted in {len(texts)} chunks", disable_human_feedback=True
     )
     await msg.send()
 
-    print(f"Processing {len(texts)} text chunks")
-
     # decide if to use the dict vector store of the Qdrant vector store
-
-    use_qdrant = True
-    from qdrant_client import QdrantClient
-    from qdrant_client.http.models import VectorParams, Distance
+    from qdrant_client.models import PointStruct, VectorParams
     # Create a dict vector store
-    if use_qdrant:
+    if use_qdrant == False:
+        vector_db = VectorDatabase()
+        vector_db = await vector_db.abuild_from_list(texts)
+    else:
         embedding_model = EmbeddingModel()
-[9 lines removed (old Qdrant client and collection setup); content not captured in the extracted view]
+        if use_qdrant_type == "Local":
+            from qdrant_client.http.models import OptimizersConfig
+            print("Using qdrant local")
+            qdrant_client = QdrantClient(location=":memory:")
+
+            vector_params = VectorParams(
+                size=1536,  # vector size
+                distance="Cosine"  # distance metric
+            )
+
+            qdrant_client.recreate_collection(
                 collection_name="my_collection",
-                vectors_config=
+                vectors_config={"default": vector_params},
            )
 
-[6 lines removed; content not captured in the extracted view]
+            from richard.vector_database import QdrantDatabase
+            vector_db = QdrantDatabase(
+                qdrant_client=qdrant_client,
+                collection_name="my_collection",
+                embedding_model=embedding_model
+            )
 
-
-    vector_db = VectorDatabase()
-    vector_db = await vector_db.abuild_from_list(texts)
+        vector_db = await vector_db.abuild_from_list(texts)
 
     msg = cl.Message(
         content=f"The Vector store has been created", disable_human_feedback=True
     )
     await msg.send()
+
     chat_openai = ChatOpenAI()
 
     # Create a chain
     retrieval_augmented_qa_pipeline = RetrievalAugmentedQAPipeline(
         vector_db_retriever=vector_db,
-        llm=chat_openai
+        llm=chat_openai,
+        system_role_prompt=system_role_prompt,
+        user_role_prompt=user_role_prompt
     )
 
     # Let the user know that the system is ready
-    msg.content = f"Processing `{file.name}`
+    msg.content = f"Processing `{file.name}` is complete."
+    await msg.update()
+    msg.content = f"You can now ask questions about `{file.name}`."
     await msg.update()
-
     cl.user_session.set("chain", retrieval_augmented_qa_pipeline)
 
 
```
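One detail in the Local branch worth noting: the collection is created with a named vector (`vectors_config={"default": vector_params}`), and qdrant-client addresses named vectors explicitly at query time. A minimal sketch of the named-vector round trip, assuming qdrant-client's `(name, values)` tuple syntax; the `QdrantDatabase.search()` shown later passes a bare list, which may need the same naming against a collection created this way:

```python
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, PointStruct, VectorParams

client = QdrantClient(location=":memory:")
client.recreate_collection(
    collection_name="my_collection",
    vectors_config={"default": VectorParams(size=4, distance=Distance.COSINE)},
)
# Upserts into a named-vector collection key the vector by its name.
client.upsert(
    collection_name="my_collection",
    points=[PointStruct(id=1, vector={"default": [0.1, 0.2, 0.3, 0.4]},
                        payload={"text": "hello"})],
)
# Searches name the vector explicitly via a (name, values) tuple.
hits = client.search(
    collection_name="my_collection",
    query_vector=("default", [0.1, 0.2, 0.3, 0.4]),
    limit=3,
)
print([(h.payload["text"], h.score) for h in hits])
```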
richard/__init__.py (ADDED) - empty file, marks richard/ as a package.
richard/pipeline.py (ADDED)

```python
from aimakerspace.vectordatabase import VectorDatabase

class RetrievalAugmentedQAPipeline:
    def __init__(self, llm, vector_db_retriever: VectorDatabase,
                 system_role_prompt, user_role_prompt
                 ) -> None:
        self.llm = llm
        self.vector_db_retriever = vector_db_retriever
        self.system_role_prompt = system_role_prompt
        self.user_role_prompt = user_role_prompt

    async def arun_pipeline(self, user_query: str):
        context_list = self.vector_db_retriever.search_by_text(user_query, k=4)

        context_prompt = ""
        for context in context_list:
            context_prompt += context[0] + "\n"

        formatted_system_prompt = self.system_role_prompt.create_message()

        formatted_user_prompt = self.user_role_prompt.create_message(question=user_query, context=context_prompt)

        async def generate_response():
            async for chunk in self.llm.astream([formatted_system_prompt, formatted_user_prompt]):
                yield chunk

        return {"response": generate_response(), "context": context_list}
```
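`arun_pipeline()` hands back the response as an async generator rather than a finished string, so the caller decides how to drain it. A minimal sketch of the consuming side in a Chainlit message handler, assuming the pipeline was stored under `"chain"` as app.py does:

```python
import chainlit as cl

@cl.on_message
async def main(message: cl.Message):
    chain = cl.user_session.get("chain")
    result = await chain.arun_pipeline(message.content)

    msg = cl.Message(content="")
    # Drain the async generator, streaming each token to the UI as it arrives.
    async for token in result["response"]:
        await msg.stream_token(token)
    await msg.send()
```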
richard/text_utils.py (ADDED)

```python
import os
import fitz
import tempfile
from aimakerspace.text_utils import CharacterTextSplitter

class FileLoader:

    def __init__(self, encoding: str = "utf-8"):
        self.documents = []
        self.encoding = encoding
        self.temp_file_path = ""


    def load_file(self, file, text_splitter=CharacterTextSplitter()):
        file_extension = os.path.splitext(file.name)[1].lower()
        with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=file_extension) as temp_file:
            self.temp_file_path = temp_file.name
            temp_file.write(file.content)

        if os.path.isfile(self.temp_file_path):
            if self.temp_file_path.endswith(".txt"):
                self.load_text_file()
            elif self.temp_file_path.endswith(".pdf"):
                self.load_pdf_file()
            else:
                raise ValueError(
                    f"Unsupported file type: {self.temp_file_path}"
                )
            return text_splitter.split_texts(self.documents)
        else:
            raise ValueError(
                "Not a file"
            )

    def load_text_file(self):
        with open(self.temp_file_path, "r", encoding=self.encoding) as f:
            self.documents.append(f.read())

    def load_pdf_file(self):
        print("load_pdf_file()")
        pdf_document = fitz.open(self.temp_file_path)
        print(len(pdf_document))
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            text = page.get_text()
            self.documents.append(text)
```
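Because the upload is written with `delete=False`, the temporary file outlives `load_file()` and nothing removes it. If that matters for long-running sessions, the write and cleanup can be isolated in a helper; a sketch of one option, not what this commit does:

```python
import os
import tempfile

def write_upload_to_temp(file) -> str:
    """Write a Chainlit upload (has .name and .content) to a temp file; caller must unlink."""
    suffix = os.path.splitext(file.name)[1].lower()
    with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=suffix) as tmp:
        tmp.write(file.content)
        return tmp.name

# Usage inside a loader:
# path = write_upload_to_temp(file)
# try:
#     ...read path into self.documents...
# finally:
#     os.unlink(path)  # delete=False means nothing else will remove it
```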
richard/vector_database.py (ADDED)

```python
import numpy as np
from collections import defaultdict
from typing import List, Tuple, Callable
from aimakerspace.openai_utils.embedding import EmbeddingModel
import hashlib
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct

def cosine_similarity(vector_a: np.array, vector_b: np.array) -> float:
    """Computes the cosine similarity between two vectors."""
    dot_product = np.dot(vector_a, vector_b)
    norm_a = np.linalg.norm(vector_a)
    norm_b = np.linalg.norm(vector_b)
    return dot_product / (norm_a * norm_b)


class QdrantDatabase:
    def __init__(self, qdrant_client: QdrantClient, collection_name: str, embedding_model=None):
        self.qdrant_client = qdrant_client
        self.collection_name = collection_name
        self.embedding_model = embedding_model or EmbeddingModel()
        self.vectors = defaultdict(np.array)  # Still keeps a local copy if needed

    def string_to_int_id(self, s: str) -> int:
        return int(hashlib.sha256(s.encode('utf-8')).hexdigest(), 16) % (10**8)

    def get_test_vector(self):
        retrieved_vector = self.qdrant_client.retrieve(
            collection_name="my_collection",
            ids=[self.string_to_int_id("test_key")]
        )
        return retrieved_vector

    def insert(self, key: str, vector: np.array) -> None:
        point_id = self.string_to_int_id(key)
        payload = {"text": key}

        point = PointStruct(
            id=point_id,
            vector={"default": vector.tolist()},
            payload=payload
        )
        print(f"Inserting vector for key: {key}, ID: {point_id}")
        # Insert the vector into Qdrant with the associated document
        self.qdrant_client.upsert(
            collection_name=self.collection_name,
            points=[point]  # Qdrant expects a list of PointStruct
        )
        print(f"Inserted vector for key: {key} with ID: {point_id}")
        retrieved_vector = self.qdrant_client.retrieve(
            collection_name=self.collection_name,
            ids=[point_id]
        )
        print(f"Inserted vector with ID: {point_id}, retrieved: {retrieved_vector}")
        self.list_vectors()


    def list_vectors(self):
        # List all vectors in the collection for debugging
        collection_info = self.qdrant_client.get_collection(self.collection_name)
        print(f"Collection info: {collection_info}")

    def search(
        self,
        query_vector: np.array,
        k: int,
        distance_measure: Callable = None,
    ) -> List[Tuple[str, float]]:
        # Perform search in Qdrant
        if isinstance(query_vector, list):
            query_vector = np.array(query_vector)
        print(self.collection_name)
        print(f"Searching in collection: {self.collection_name} with vector: {query_vector}")
        collection_info = self.qdrant_client.get_collection(self.collection_name)
        print(f"Collection info: {collection_info}")

        search_results = self.qdrant_client.search(
            collection_name=self.collection_name,
            query_vector=query_vector.tolist(),  # Pass the vector as a list
            limit=k
        )

        print(f"Search results: {search_results}")
        # print(query_vector.tolist())
        # search_results = self.qdrant_client.query_points(
        #     collection_name=self.collection_name,
        #     query=query_vector.tolist(),  # Pass the vector as a list
        #     limit=k,
        # )
        # Extract and return results
        return [(result.payload['text'], result.score) for result in search_results]

    def search_by_text(
        self,
        query_text: str,
        k: int,
        distance_measure: Callable = None,
        return_as_text: bool = False,
    ) -> List[Tuple[str, float]]:
        self.list_vectors()
        query_vector = self.embedding_model.get_embedding(query_text)
        results = self.search(query_vector, k, distance_measure)
        return [result[0] for result in results] if return_as_text else results

    def retrieve_from_key(self, key: str) -> np.array:
        # Retrieve from local cache
        return self.vectors.get(key, None)

    async def abuild_from_list(self, list_of_text: List[str]) -> "QdrantDatabase":
        embeddings = await self.embedding_model.async_get_embeddings(list_of_text)
        for text, embedding in zip(list_of_text, embeddings):
            self.insert(text, np.array(embedding))
        return self
```
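`string_to_int_id()` makes the point ID a pure function of the chunk text, so re-inserting the same chunk upserts the same point instead of duplicating it; the trade-off is that distinct texts can collide in the `10**8` ID space (by the birthday bound, a 50% collision chance arrives around 12,000 chunks). A quick standalone check of the determinism:

```python
import hashlib

def string_to_int_id(s: str) -> int:
    return int(hashlib.sha256(s.encode('utf-8')).hexdigest(), 16) % (10**8)

a = string_to_int_id("I like to eat broccoli and bananas.")
b = string_to_int_id("I like to eat broccoli and bananas.")
assert a == b  # same text, same point ID: re-inserts overwrite rather than duplicate
print(a)       # stable integer ID below 10**8
```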