whatsthispdf07

Running

mitulagr2 committed on Jun 11, 2024

Commit

881c0e5

1 Parent(s): 8b90c15

minor fix

Files changed (1) hide show

app/rag.py CHANGED Viewed

@@ -52,7 +52,7 @@ class ChatPDF:
     def __init__(self):
-        text_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
         self.logger.info("initializing the vector store related objects")
         # client = QdrantClient(host="localhost", port=6333)
@@ -78,11 +78,11 @@ class ChatPDF:
         # tokenizer.save_pretrained("./models/tokenizer/")
         self.logger.info("initializing the global settings")
-        Settings.text_splitter = text_parser
         Settings.embed_model = self.embed_model
         Settings.llm = llm
         # Settings.tokenzier = tokenizer
-        Settings.transformations = [text_parser]
     def ingest(self, files_dir: str):
         text_chunks = []
@@ -93,7 +93,7 @@ class ChatPDF:
         self.logger.info("enumerating docs")
         for doc_idx, doc in enumerate(docs):
-            curr_text_chunks = text_parser.split_text(doc.text)
             text_chunks.extend(curr_text_chunks)
             doc_ids.extend([doc_idx] * len(curr_text_chunks))

     def __init__(self):
+        self.text_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
         self.logger.info("initializing the vector store related objects")
         # client = QdrantClient(host="localhost", port=6333)
         # tokenizer.save_pretrained("./models/tokenizer/")
         self.logger.info("initializing the global settings")
+        Settings.text_splitter = self.text_parser
         Settings.embed_model = self.embed_model
         Settings.llm = llm
         # Settings.tokenzier = tokenizer
+        Settings.transformations = [self.text_parser]
     def ingest(self, files_dir: str):
         text_chunks = []
         self.logger.info("enumerating docs")
         for doc_idx, doc in enumerate(docs):
+            curr_text_chunks = self.text_parser.split_text(doc.text)
             text_chunks.extend(curr_text_chunks)
             doc_ids.extend([doc_idx] * len(curr_text_chunks))