Spaces:

rchrdgwr
/

CoolApp

Sleeping

App Files Files Community

rchrdgwr committed on Aug 26, 2024

Commit

0fbd1a9

1 Parent(s): 0614fbf

add recursive text splitter

Browse files

Files changed (5) hide show

BuildingAChainlitApp.md +3 -4
aimakerspace/text_utils.py +2 -4
app.py +15 -12
requirements.txt +2 -1
richard/text_utils.py +62 -3

BuildingAChainlitApp.md CHANGED Viewed

@@ -123,7 +123,7 @@ def process_text_file(file: AskFileResponse):
     text_loader = TextFileLoader(temp_file_path)
     documents = text_loader.load_documents()
-    texts = text_splitter.split_texts(documents)
     return texts
 ```
@@ -286,7 +286,7 @@ Code was modified to support pdf documents in the following areas:
                 raise ValueError(
                     f"Unsupported file type: {self.temp_file_path}"
                 )
-            return text_splitter.split_texts(self.documents)
         else:
             raise ValueError(
                     "Not a file"
@@ -297,9 +297,8 @@ Code was modified to support pdf documents in the following areas:
             self.documents.append(f.read())
     def load_pdf_file(self):
-        print("load_pdf_file()")
         pdf_document = fitz.open(self.temp_file_path)
-        print(len(pdf_document))
         for page_num in range(len(pdf_document)):
             page = pdf_document.load_page(page_num)
             text = page.get_text()

     text_loader = TextFileLoader(temp_file_path)
     documents = text_loader.load_documents()
+    texts = text_splitter.split_text(documents)
     return texts
 ```
                 raise ValueError(
                     f"Unsupported file type: {self.temp_file_path}"
                 )
+            return text_splitter.split_text(self.documents)
         else:
             raise ValueError(
                     "Not a file"
             self.documents.append(f.read())
     def load_pdf_file(self):
         pdf_document = fitz.open(self.temp_file_path)
         for page_num in range(len(pdf_document)):
             page = pdf_document.load_page(page_num)
             text = page.get_text()

aimakerspace/text_utils.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import os
 from typing import List
 class TextFileLoader:
     def __init__(self, path: str, encoding: str = "utf-8"):
         self.documents = []
@@ -55,18 +54,17 @@ class CharacterTextSplitter:
             chunks.append(text[i : i + self.chunk_size])
         return chunks
-    def split_texts(self, texts: List[str]) -> List[str]:
         chunks = []
         for text in texts:
             chunks.extend(self.split(text))
         return chunks
 if __name__ == "__main__":
     loader = TextFileLoader("data/KingLear.txt")
     loader.load()
     splitter = CharacterTextSplitter()
-    chunks = splitter.split_texts(loader.documents)
     print(len(chunks))
     print(chunks[0])
     print("--------")

 import os
 from typing import List
 class TextFileLoader:
     def __init__(self, path: str, encoding: str = "utf-8"):
         self.documents = []
             chunks.append(text[i : i + self.chunk_size])
         return chunks
+    def split_text(self, texts: List[str]) -> List[str]:
         # Despite the singular name, this takes a list of strings: each one
         # is passed to self.split() and all resulting chunks are returned in
         # a single flat list, in input order.
         chunks = []
         for text in texts:
             chunks.extend(self.split(text))
         return chunks
 if __name__ == "__main__":
     loader = TextFileLoader("data/KingLear.txt")
     loader.load()
     splitter = CharacterTextSplitter()
+    chunks = splitter.split_text(loader.documents)
     print(len(chunks))
     print(chunks[0])
     print("--------")

app.py CHANGED Viewed

@@ -14,8 +14,10 @@ from richard.text_utils import FileLoader
 from richard.pipeline import RetrievalAugmentedQAPipeline
 # from richard.vector_database import QdrantDatabase
 from qdrant_client import QdrantClient
-from langchain.vectorstores import Qdrant
 system_template = """\
 Use the following context to answer a users question.
@@ -33,11 +35,6 @@ Question:
 """
 user_role_prompt = UserRolePrompt(user_prompt_template)
-def process_file(file: AskFileResponse):
-    fileLoader = FileLoader()
-    return fileLoader.load_file(file)
 @cl.on_chat_start
 async def on_chat_start():
     res = await cl.AskActionMessage(
@@ -65,6 +62,17 @@ async def on_chat_start():
         )
         await msg.send()
         use_qdrant = False
     files = None
     # Wait for the user to upload a file
     while not files:
@@ -82,8 +90,7 @@ async def on_chat_start():
     )
     await msg.send()
-    # load the file
-    texts = process_file(file)
     msg = cl.Message(
         content=f"Resulted in {len(texts)} chunks", disable_human_feedback=True
@@ -99,15 +106,11 @@ async def on_chat_start():
     else:
         embedding_model = EmbeddingModel()
         if use_qdrant_type == "Local":
-            from qdrant_client.http.models import OptimizersConfig
-            print("Using qdrant local")
             qdrant_client = QdrantClient(location=":memory:")
             vector_params = VectorParams(
                 size=1536,  # vector size
                 distance="Cosine"  # distance metric
             )
             qdrant_client.recreate_collection(
                 collection_name="my_collection",
                 vectors_config={"default": vector_params},

 from richard.pipeline import RetrievalAugmentedQAPipeline
 # from richard.vector_database import QdrantDatabase
 from qdrant_client import QdrantClient
+def process_file(file, use_rct):
     # Build a fresh FileLoader and return whatever its
     # load_file(file, use_rct) call returns.
     # use_rct is forwarded unchanged; on_chat_start sets it to True when the
     # user answers "yes" to the RecursiveCharacterTextSplitter prompt.
+    fileLoader = FileLoader()
+    return fileLoader.load_file(file, use_rct)
 system_template = """\
 Use the following context to answer a users question.
 """
 user_role_prompt = UserRolePrompt(user_prompt_template)
 @cl.on_chat_start
 async def on_chat_start():
     res = await cl.AskActionMessage(
         )
         await msg.send()
         use_qdrant = False
+    use_rct = False
+    res = await cl.AskActionMessage(
+        content="Do you want to use RecursiveCharacterTextSplitter?",
+        actions=[
+            cl.Action(name="yes", value="yes", label="✅ Yes"),
+            cl.Action(name="no", value="no", label="❌ No"),
+        ],
+    ).send()
+    if res and res.get("value") == "yes":
+        use_rct = True
     files = None
     # Wait for the user to upload a file
     while not files:
     )
     await msg.send()
+    texts = process_file(file, use_rct)
     msg = cl.Message(
         content=f"Resulted in {len(texts)} chunks", disable_human_feedback=True
     else:
         embedding_model = EmbeddingModel()
         if use_qdrant_type == "Local":
             qdrant_client = QdrantClient(location=":memory:")
             vector_params = VectorParams(
                 size=1536,  # vector size
                 distance="Cosine"  # distance metric
             )
             qdrant_client.recreate_collection(
                 collection_name="my_collection",
                 vectors_config={"default": vector_params},

requirements.txt CHANGED Viewed

@@ -2,4 +2,5 @@ numpy==1.26.4
 chainlit==0.7.700   # 1.1.402
 openai==1.3.5
 pymupdf==1.24.9
-qdrant-client==1.11.0

 chainlit==0.7.700   # 1.1.402
 openai==1.3.5
 pymupdf==1.24.9
+qdrant-client==1.11.0
+langchain-text-splitters

richard/text_utils.py CHANGED Viewed

@@ -1,8 +1,14 @@
 import os
 import fitz
 import tempfile
 from aimakerspace.text_utils import CharacterTextSplitter
 class FileLoader:
     def __init__(self, encoding: str = "utf-8"):
@@ -11,7 +17,11 @@ class FileLoader:
         self.temp_file_path = ""
-    def load_file(self, file, text_splitter=CharacterTextSplitter()):
         file_extension = os.path.splitext(file.name)[1].lower()
         with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=file_extension) as temp_file:
             self.temp_file_path = temp_file.name
@@ -26,12 +36,14 @@ class FileLoader:
                 raise ValueError(
                     f"Unsupported file type: {self.temp_file_path}"
                 )
-            return text_splitter.split_texts(self.documents)
         else:
             raise ValueError(
                     "Not a file"
                 )
     def load_text_file(self):
         with open(self.temp_file_path, "r", encoding=self.encoding) as f:
             self.documents.append(f.read())
@@ -43,4 +55,51 @@ class FileLoader:
         for page_num in range(len(pdf_document)):
             page = pdf_document.load_page(page_num)
             text = page.get_text()
-            self.documents.append(text)

 import os
+from typing import List
 import fitz
 import tempfile
 from aimakerspace.text_utils import CharacterTextSplitter
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+# load the file
 class FileLoader:
     def __init__(self, encoding: str = "utf-8"):
         self.temp_file_path = ""
+    def load_file(self, file, use_rct):
+        if use_rct:
+            text_splitter=MyRecursiveCharacterTextSplitter()
+        else:
+            text_splitter=CharacterTextSplitter()
         file_extension = os.path.splitext(file.name)[1].lower()
         with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=file_extension) as temp_file:
             self.temp_file_path = temp_file.name
                 raise ValueError(
                     f"Unsupported file type: {self.temp_file_path}"
                 )
+            print(self.documents)
+            return text_splitter.split_text(self.documents)
         else:
             raise ValueError(
                     "Not a file"
                 )
     def load_text_file(self):
         with open(self.temp_file_path, "r", encoding=self.encoding) as f:
             self.documents.append(f.read())
         for page_num in range(len(pdf_document)):
             page = pdf_document.load_page(page_num)
             text = page.get_text()
+            self.documents.append(text)
+class CharacterTextSplitter:
     # Splits strings into fixed-size character windows. Consecutive windows
     # start (chunk_size - chunk_overlap) characters apart, so neighbouring
     # chunks share up to chunk_overlap characters.
     # NOTE(review): the import block shown for this same file also imports a
     # CharacterTextSplitter from aimakerspace.text_utils; if both live in one
     # module, this later definition rebinds the name -- confirm which one is
     # intended.
+    def __init__(
+        self,
+        chunk_size: int = 1000,
+        chunk_overlap: int = 200,
+    ):
         # chunk_size: maximum number of characters in one chunk.
         # chunk_overlap: number of characters by which the window start is
         # pulled back relative to chunk_size.
         # split() steps by chunk_size - chunk_overlap, so that difference
         # must be positive.
         # NOTE(review): assert statements are skipped when Python runs with
         # -O. Without this guard, equal size and overlap make range() raise
         # on a zero step, and a larger overlap gives a negative step and
         # therefore no chunks at all.
+        assert (
+            chunk_size > chunk_overlap
+        ), "Chunk size must be greater than chunk overlap"
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+    def split(self, text: str) -> List[str]:
         # Return the windows of `text` in order; an empty string yields [].
         # Slicing clamps at the end of the text, so trailing chunks can be
         # shorter than chunk_size.
+        chunks = []
+        for i in range(0, len(text), self.chunk_size - self.chunk_overlap):
+            chunks.append(text[i : i + self.chunk_size])
+        return chunks
+    def split_text(self, texts: List[str]) -> List[str]:
         # Despite the singular name, this takes a list of strings: each one
         # is split with split() and all chunks are returned in one flat list,
         # in input order.
+        chunks = []
+        for text in texts:
+            chunks.extend(self.split(text))
+        return chunks
+class MyRecursiveCharacterTextSplitter:
     # Thin wrapper around RecursiveCharacterTextSplitter that exposes a
     # split_text(list of strings) method with the same call shape as
     # CharacterTextSplitter.split_text.
+    def __init__(
+        self
+    ):
         # The settings are fixed here rather than passed in: chunk_size 1000
         # measured with len, chunk_overlap 20, and a separator list running
         # from blank-line breaks down to the empty string.
         # NOTE(review): presumably the separators are tried in the listed
         # order -- confirm against the langchain-text-splitters docs.
+        self.RCTS = RecursiveCharacterTextSplitter(
+            chunk_size=1000,
+            chunk_overlap=20,
+            length_function=len,
+            separators=["\n\n", "\n", " ", ""]
+        )
+    def split_text(self, texts: List[str]) -> List[str]:
         # Run the wrapped splitter on each input string and gather every
         # resulting chunk into one flat list, preserving input order.
+        all_chunks = []
+        for doc in texts:
+            chunks = self.RCTS.split_text(doc)
+            all_chunks.extend(chunks)
+        return all_chunks