LeetTools committed · verified
Commit 6eabcb6 · 1 Parent(s): 7f2676a

Upload ask.py

Files changed (1):
  1. ask.py +34 -34
ask.py CHANGED
@@ -17,6 +17,7 @@ import duckdb
 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
+from chonkie import Chunk, TokenChunker
 from dotenv import load_dotenv
 from jinja2 import BaseLoader, Environment
 from openai import OpenAI
@@ -113,6 +114,10 @@ class Ask:
         else:
             self.logger = _get_logger("INFO")
 
+        self.logger.info("Initializing Chonkie ...")
+        self.chunker = TokenChunker(chunk_size=1000, chunk_overlap=100)
+        self.logger.info("✅ Successfully initialized Chonkie.")
+
         self.db_con = duckdb.connect(":memory:")
 
         self.db_con.install_extension("vss")
@@ -248,15 +253,10 @@ class Ask:
 
         return scrape_results
 
-    def chunk_results(
-        self, scrape_results: Dict[str, str], size: int, overlap: int
-    ) -> Dict[str, List[str]]:
+    def chunk_results(self, scrape_results: Dict[str, str]) -> Dict[str, List[Chunk]]:
         chunking_results: Dict[str, List[str]] = {}
         for url, text in scrape_results.items():
-            chunks = []
-            for pos in range(0, len(text), size - overlap):
-                chunks.append(text[pos : pos + size])
-            chunking_results[url] = chunks
+            chunking_results[url] = self.chunker.chunk(text)
         return chunking_results
 
     def get_embedding(self, client: OpenAI, texts: List[str]) -> List[List[float]]:
@@ -304,7 +304,7 @@ CREATE TABLE {table_name} (
         )
         return table_name
 
-    def save_chunks_to_db(self, chunking_results: Dict[str, List[str]]) -> str:
+    def save_chunks_to_db(self, all_chunks: Dict[str, List[Chunk]]) -> str:
         """
         The key of chunking_results is the URL and the value is the list of chunks.
         """
@@ -316,10 +316,10 @@ CREATE TABLE {table_name} (
         table_name = self._create_table()
 
         batches: List[Tuple[str, List[str]]] = []
-        for url, list_chunks in chunking_results.items():
+        for url, list_chunks in all_chunks.items():
             for i in range(0, len(list_chunks), embed_batch_size):
-                list_chunks = list_chunks[i : i + embed_batch_size]
-                batches.append((url, list_chunks))
+                batch = [chunk.text for chunk in list_chunks[i : i + embed_batch_size]]
+                batches.append((url, batch))
 
         self.logger.info(f"Embedding {len(batches)} batches of chunks ...")
         partial_get_embedding = partial(self.batch_get_embedding, client)
@@ -327,9 +327,9 @@ CREATE TABLE {table_name} (
             all_embeddings = executor.map(partial_get_embedding, batches)
         self.logger.info(f"✅ Finished embedding.")
 
-        # we batch the insert data to speed up the insertion operation
-        # although the DuckDB doc says executeMany is optimized for batch insert
-        # but we found that it is faster to batch the insert data and run a single insert
+        # We batch the insert data to speed up the insertion operation.
+        # Although the DuckDB doc says executeMany is optimized for batch insert,
+        # we found that it is faster to batch the insert data and run a single insert.
         for chunk_batch, embeddings in all_embeddings:
             url = chunk_batch[0]
             list_chunks = chunk_batch[1]
@@ -678,19 +678,19 @@ Below is the provided content:
         if settings.output_mode == OutputMode.answer:
             logger.info("Chunking the text ...")
             yield "", update_logs()
-            chunking_results = self.chunk_results(scrape_results, 1000, 100)
-            total_chunks = 0
-            for url, chunks in chunking_results.items():
+            all_chunks = self.chunk_results(scrape_results)
+            chunk_count = 0
+            for url, chunks in all_chunks.items():
                 logger.debug(f"URL: {url}")
-                total_chunks += len(chunks)
+                chunk_count += len(chunks)
                 for i, chunk in enumerate(chunks):
-                    logger.debug(f"Chunk {i+1}: {chunk}")
-            logger.info(f"✅ Generated {total_chunks} chunks ...")
+                    logger.debug(f"Chunk {i+1}: {chunk.text}")
+            logger.info(f"✅ Generated {chunk_count} chunks ...")
             yield "", update_logs()
 
-            logger.info(f"Saving {total_chunks} chunks to DB ...")
+            logger.info(f"Saving {chunk_count} chunks to DB ...")
             yield "", update_logs()
-            table_name = self.save_chunks_to_db(chunking_results)
+            table_name = self.save_chunks_to_db(all_chunks)
             logger.info(f"✅ Successfully embedded and saved chunks to DB.")
             yield "", update_logs()
 
@@ -940,7 +940,6 @@ def launch_gradio(
 )
 @click.option(
     "--inference-model-name",
-    "-m",
     required=False,
     default="gpt-4o-mini",
     help="Model name to use for inference",
@@ -951,9 +950,10 @@
     help="Use hybrid search mode with both vector search and full-text search",
 )
 @click.option(
-    "--web-ui",
+    "--run-cli",
+    "-c",
     is_flag=True,
-    help="Launch the web interface",
+    help="Run as a command line tool instead of launching the Gradio UI",
 )
 @click.option(
     "-l",
@@ -975,7 +975,7 @@ def search_extract_summarize(
     extract_schema_file: str,
     inference_model_name: str,
     hybrid_search: bool,
-    web_ui: bool,
+    run_cli: bool,
     log_level: str,
 ):
     load_dotenv(dotenv_path=default_env_file, override=False)
@@ -996,7 +996,14 @@ def search_extract_summarize(
         extract_schema_str=_read_extract_schema_str(extract_schema_file),
     )
 
-    if web_ui or os.environ.get("RUN_GRADIO_UI", "false").lower() != "false":
+    if run_cli:
+        if query is None:
+            raise Exception("Query is required for the command line mode")
+        ask = Ask(logger=logger)
+
+        final_result = ask.run_query(query=query, settings=settings)
+        click.echo(final_result)
+    else:
         if os.environ.get("SHARE_GRADIO_UI", "false").lower() == "true":
             share_ui = True
         else:
@@ -1007,13 +1014,6 @@ def search_extract_summarize(
             share_ui=share_ui,
             logger=logger,
         )
-    else:
-        if query is None:
-            raise Exception("Query is required for the command line mode")
-        ask = Ask(logger=logger)
-
-        final_result = ask.run_query(query=query, settings=settings)
-        click.echo(final_result)
 
 
 if __name__ == "__main__":
 
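A note on the chunking change: the hand-rolled character-window loop in chunk_results is replaced by chonkie's TokenChunker, so chunk size and overlap are now measured in tokens rather than characters, and run_query no longer passes the 1000/100 values at the call site. A minimal sketch of the new path, assuming "pip install chonkie" and reusing the parameters hard-coded in Ask.__init__ (the URL and text are placeholders):

from typing import Dict, List

from chonkie import Chunk, TokenChunker

# Same parameters the commit hard-codes in Ask.__init__.
chunker = TokenChunker(chunk_size=1000, chunk_overlap=100)

def chunk_results(scrape_results: Dict[str, str]) -> Dict[str, List[Chunk]]:
    # One List[Chunk] per URL; each Chunk exposes its text via .text,
    # which is what the embedding code reads downstream.
    return {url: chunker.chunk(text) for url, text in scrape_results.items()}

demo = chunk_results({"https://example.com": "some scraped text " * 500})
for url, chunks in demo.items():
    print(url, len(chunks), chunks[0].text[:40])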
 
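A note on the batching change in save_chunks_to_db: besides unwrapping Chunk objects into their .text payloads, the new batch variable fixes a latent bug. The old loop rebound list_chunks to its own slice, so every iteration after the first sliced an already-truncated list and produced empty batches. A self-contained sketch of the corrected batching, with a stub standing in for batch_get_embedding and the OpenAI client:

from concurrent.futures import ThreadPoolExecutor
from functools import partial
from typing import Dict, List, Tuple

def batch_get_embedding(client, batch: Tuple[str, List[str]]):
    url, texts = batch
    # Stub: the real method calls the OpenAI embeddings endpoint here.
    return batch, [[0.0, 0.0, 0.0] for _ in texts]

all_chunks: Dict[str, List[str]] = {
    "https://example.com": [f"chunk {i}" for i in range(25)]
}
embed_batch_size = 10

batches: List[Tuple[str, List[str]]] = []
for url, list_chunks in all_chunks.items():
    for i in range(0, len(list_chunks), embed_batch_size):
        # Fresh name per slice; with chonkie each element is a Chunk,
        # hence the chunk.text unwrap in the committed code.
        batches.append((url, list_chunks[i : i + embed_batch_size]))

client = None  # placeholder for the OpenAI client
with ThreadPoolExecutor() as executor:
    results = list(executor.map(partial(batch_get_embedding, client), batches))
print([len(texts) for (_, texts), _ in results])  # [10, 10, 5]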
 
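A note on the retained DuckDB comment: the reasoning is unchanged, only reworded: a single multi-row INSERT beat executemany in the authors' testing. A toy sketch of that pattern (two text columns only; the real table also stores the embedding vectors):

import duckdb

con = duckdb.connect(":memory:")
con.execute("CREATE TABLE chunks (url VARCHAR, chunk VARCHAR)")

rows = [("https://example.com", f"chunk {i}") for i in range(1000)]

# Row-by-row alternative the comment argues against:
#   con.executemany("INSERT INTO chunks VALUES (?, ?)", rows)
# Batched alternative: one VALUES list, one execute() call.
placeholders = ", ".join(["(?, ?)"] * len(rows))
params = [value for row in rows for value in row]
con.execute(f"INSERT INTO chunks VALUES {placeholders}", params)

print(con.execute("SELECT count(*) FROM chunks").fetchone())  # (1000,)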
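A note on the CLI change: the --web-ui flag and the RUN_GRADIO_UI environment check are removed, and the -m shorthand for --inference-model-name is dropped. The default is inverted: the Gradio UI now launches unless -c/--run-cli is passed, and CLI mode still requires a query. Hypothetical invocations (the query option is defined outside the hunks shown, so its flag name here is an assumption):

python ask.py                              # launches the Gradio UI
python ask.py -c --query "your question"   # prints the answer and exits

SHARE_GRADIO_UI is still consulted to decide whether the UI gets a public share link.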