DocUA committed
Commit 45fc0a6 (parent: b3880cc)

refactoring + add run.py

Files changed (6):
  1. README.md +1 -1
  2. interface.py +8 -15
  3. main.py +43 -7
  4. requirements.txt +7 -8
  5. run.py +15 -0
  6. storage.py +43 -28
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: "Legal Position Search (without AI) BM25S long & short text"
+title: "Legal Position Search (without AI) BM25S long & short text + vector search ChromaDB"
 emoji: "⚖️"
 colorFrom: "blue"
 colorTo: "green"
interface.py CHANGED
@@ -1,23 +1,14 @@
 import gradio as gr
 import re
-from typing import Callable, Awaitable, Any, Tuple
+from typing import Callable, Any, Tuple
+import asyncio
 
-
-def create_gradio_interface(search_action: Callable[[str], Awaitable[Tuple[str, Any]]]) -> gr.Blocks:
-    """
-    Creates Gradio interface for legal search system.
-
-    Args:
-        search_action: Async function that performs the search and returns (output_text, nodes)
-
-    Returns:
-        gr.Blocks: Configured Gradio interface
-    """
+def create_gradio_interface(search_action: Callable) -> gr.Blocks:
     with gr.Blocks() as app:
         gr.Markdown("# Знаходьте правові позиції Верховного Суду")
 
         input_field = gr.Textbox(
-            label="Введіть текст для пошуку або посилання на судове рішення (у форматі https://reyestr.court.gov.ua/Review/{doc_id})",
+            label="Введіть текст або посилання на судове рішення",
             lines=1
         )
         search_button = gr.Button("Пошук", interactive=False)
@@ -25,8 +16,10 @@ def create_gradio_interface(search_action: Callable[[str], Awaitable[Tuple[str,
         search_output = gr.Markdown(label="Результат пошуку")
         state_nodes = gr.State()
 
+        async def async_wrapper(text):
+            return await search_action(text)
+
         def update_button_state(text: str) -> Tuple[gr.update, gr.update]:
-            """Updates button state and warning message based on input text."""
             text = text.strip()
             if not text:
                 return gr.update(value="Пошук", interactive=False), gr.update(visible=False)
@@ -41,7 +34,7 @@ def create_gradio_interface(search_action: Callable[[str], Awaitable[Tuple[str,
             return gr.update(value="Пошук за текстом", interactive=True), gr.update(visible=False)
 
         search_button.click(
-            fn=async_wrapper,
+            fn=async_wrapper,
             inputs=input_field,
             outputs=[search_output, state_nodes]
         )
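Note: a minimal sketch of how the refactored interface is meant to be driven, assuming search_action is an async callable returning (markdown_text, nodes) as the outputs list above implies; dummy_search is a placeholder and not part of this commit:

    from interface import create_gradio_interface

    async def dummy_search(text: str):
        # Placeholder for main_search_action: returns (markdown_text, nodes).
        return f"Results for: {text}", []

    demo = create_gradio_interface(dummy_search)
    demo.queue(max_size=1).launch()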
main.py CHANGED
@@ -5,11 +5,15 @@ from pathlib import Path
 
 import nest_asyncio
 import requests
+import chromadb
 from bs4 import BeautifulSoup
 from dotenv import load_dotenv
-from llama_index.core import Settings
+from llama_index.core import Settings, StorageContext
 from llama_index.core.retrievers import QueryFusionRetriever
+from llama_index.vector_stores.chroma import ChromaVectorStore
+from llama_index.core import VectorStoreIndex
 from llama_index.retrievers.bm25 import BM25Retriever
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 
 from interface import create_gradio_interface
 from storage import StorageManager
@@ -18,22 +22,28 @@ from storage import StorageManager
 load_dotenv()
 
 # Basic settings
-Settings.similarity_top_k = 20  # type: ignore
+Settings.similarity_top_k = 20
 Settings.llm = None
+Settings.embed_model = HuggingFaceEmbedding(
+    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
+)
 
 # Storage settings
 LOCAL_DIR = Path("Save_Index_Local")
 BUCKET_NAME = "legal-position"
 PREFIX_RETRIEVER = "Save_Index_Ivan/"
+CHROMA_DIR = "chroma_db_hf"
 
 # Index parameters
 INDEX_NAME_BM25_LONG = "bm25_retriever"
 INDEX_NAME_BM25_SHORT = "bm25_retriever_short"
 REQUIRED_FILES = [INDEX_NAME_BM25_LONG, INDEX_NAME_BM25_SHORT]
+REQUIRED_DIRS = [CHROMA_DIR]
 
 # Global retrievers
 retriever_bm25_long = None
 retriever_bm25_short = None
+retriever_chroma = None
 
 # Initialize nest_asyncio for async operations
 nest_asyncio.apply()
@@ -100,12 +110,12 @@ def initialize_components():
     )
 
     # Check and sync data
-    if not storage_manager.sync_data(REQUIRED_FILES):
+    if not storage_manager.sync_data(REQUIRED_FILES, REQUIRED_DIRS):
         raise FileNotFoundError("Failed to obtain required files")
 
-    global retriever_bm25_long, retriever_bm25_short
+    global retriever_bm25_long, retriever_bm25_short, retriever_chroma
 
-    # Initialize retrievers
+    # Initialize BM25 retrievers
     bm25_retriever_long = BM25Retriever.from_persist_dir(
         str(LOCAL_DIR / INDEX_NAME_BM25_LONG)
     )
@@ -120,8 +130,27 @@ def initialize_components():
         use_async=True,
     )
 
+    # Initialize ChromaDB
+    db_chroma = chromadb.PersistentClient(path=str(LOCAL_DIR / CHROMA_DIR))
+    chroma_collection = db_chroma.get_or_create_collection(name="legal_position")
+    chroma_vector_store = ChromaVectorStore(
+        chroma_collection=chroma_collection,
+        embedding_model=Settings.embed_model
+    )
+    storage_context = StorageContext.from_defaults(vector_store=chroma_vector_store)
+
+    # Create vector store index
+    vector_index = VectorStoreIndex.from_vector_store(
+        chroma_vector_store,
+        storage_context=storage_context,
+        embed_model=Settings.embed_model
+    )
+
+    retriever_chroma = vector_index.as_retriever(similarity_top_k=Settings.similarity_top_k)
+
+    # Create hybrid retriever for short texts
     retriever_bm25_short = QueryFusionRetriever(
-        [bm25_retriever_short],
+        [bm25_retriever_short, retriever_chroma],
         similarity_top_k=Settings.similarity_top_k,
         num_queries=1,
         use_async=True,
@@ -220,7 +249,10 @@ def main():
     if initialize_components():
         print("Components initialized successfully!")
         app = create_gradio_interface(main_search_action)
-        app.launch(share=True)
+        app.queue(max_size=1).launch(
+            show_error=True,
+            share=True
+        )
     else:
         print(
             "Failed to initialize components. Please check the paths and try again.",
@@ -228,6 +260,10 @@ def main():
         )
         sys.exit(1)
 
+if __name__ == "__main__":
+    # Remove nest_asyncio.apply()
+    main()
+
 
 if __name__ == "__main__":
     main()
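Note: a rough sketch of querying the hybrid BM25 + Chroma retriever once initialization has run; the query string is illustrative, and main_search_action presumably wraps a similar retrieve() call:

    import main

    # retriever_bm25_short is a module-level global set inside initialize_components().
    if main.initialize_components():
        nodes = main.retriever_bm25_short.retrieve("стягнення неустойки за договором підряду")
        for n in nodes[:5]:
            print(round(n.score or 0.0, 3), n.node.node_id, n.node.get_content()[:80])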
requirements.txt CHANGED
@@ -1,14 +1,13 @@
 llama-index
 llama-index-readers-file
-llama-index-vector-stores-faiss
 llama-index-retrievers-bm25
-openai
-faiss-cpu
-llama-index-embeddings-openai
-llama-index-llms-openai
-gradio
+llama_index-vector-stores-chroma
+llama-index-embeddings-huggingface
+
 beautifulsoup4
-nest-asyncio
 boto3
 python-dotenv
-openpyxl
+
+gradio==4.44.1
+nest-asyncio>=1.5.6
+uvicorn>=0.22.0
run.py ADDED
@@ -0,0 +1,15 @@
+from fastapi import FastAPI
+from main import initialize_components, main_search_action
+from interface import create_gradio_interface
+import uvicorn
+import gradio as gr
+
+app = FastAPI()
+
+if initialize_components():
+    print("Components initialized successfully!")
+    gr_app = create_gradio_interface(main_search_action)
+    app = gr.mount_gradio_app(app, gr_app, path="/")
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)
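Note: with run.py added, the Space can be started either with python run.py or by pointing uvicorn at run:app (for example uvicorn run:app --host 0.0.0.0 --port 7860). A quick smoke test against the mounted Gradio app might look like this, assuming the default port above:

    import requests

    # Assumes run.py is already serving locally, e.g. started with `python run.py`.
    resp = requests.get("http://localhost:7860/")
    print(resp.status_code)  # expect 200 once Gradio is mounted at "/"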
storage.py CHANGED
@@ -1,7 +1,9 @@
+# storage.py
 import os
 from pathlib import Path
 import boto3
-from typing import Optional
+import shutil
+from typing import Optional, List
 
 
 class StorageManager:
@@ -28,31 +30,36 @@ class StorageManager:
             region_name=region_name
         )
 
-    def check_local_data(self, required_files: list[str]) -> bool:
+    def check_local_data(self, required_files: List[str], required_dirs: List[str] = None) -> bool:
         """
-        Check if all required files exist in local directory.
+        Check if all required files and directories exist locally.
 
         Args:
            required_files: List of required file names
+            required_dirs: List of required directory names
 
         Returns:
-            bool: True if all files exist, False otherwise
+            bool: True if all required data exists, False otherwise
         """
         self.local_dir.mkdir(parents=True, exist_ok=True)
 
+        # Check files
         for file_name in required_files:
             if not (self.local_dir / file_name).exists():
                 print(f"Missing required file: {file_name}")
                 return False
+
+        # Check directories
+        if required_dirs:
+            for dir_name in required_dirs:
+                if not (self.local_dir / dir_name).is_dir():
+                    print(f"Missing required directory: {dir_name}")
+                    return False
+
         return True
 
     def download_s3_file(self, s3_key: str, local_path: Path) -> bool:
-        """
-        Download single file from S3.
-
-        Returns:
-            bool: True if download successful, False otherwise
-        """
+        """Download single file from S3."""
         try:
             self.s3_client.download_file(self.bucket_name, s3_key, str(local_path))
             print(f"Downloaded: {s3_key} -> {local_path}")
@@ -61,24 +68,26 @@
             print(f"Error downloading {s3_key}: {str(e)}")
             return False
 
-    def download_s3_folder(self) -> bool:
+    def download_s3_folder(self, specific_prefix: str = None) -> bool:
         """
         Download entire folder from S3 to local directory.
 
-        Returns:
-            bool: True if download successful, False otherwise
+        Args:
+            specific_prefix: Optional specific prefix to download only a subfolder
         """
         try:
             if not self.use_s3:
                 raise ValueError("S3 credentials not configured")
 
+            prefix = f"{self.prefix}{specific_prefix}" if specific_prefix else self.prefix
+
             response = self.s3_client.list_objects_v2(
                 Bucket=self.bucket_name,
-                Prefix=self.prefix
+                Prefix=prefix
             )
 
             if 'Contents' not in response:
-                print(f"No files found in S3 bucket {self.bucket_name} with prefix {self.prefix}")
+                print(f"No files found in S3 bucket {self.bucket_name} with prefix {prefix}")
                 return False
 
             success = True
@@ -87,7 +96,8 @@
                 if s3_key.endswith('/'):
                     continue
 
-                local_file_path = self.local_dir / Path(s3_key).relative_to(self.prefix)
+                relative_path = Path(s3_key).relative_to(self.prefix)
+                local_file_path = self.local_dir / relative_path
                 local_file_path.parent.mkdir(parents=True, exist_ok=True)
 
                 if not self.download_s3_file(s3_key, local_file_path):
@@ -98,28 +108,33 @@
             print(f"Error downloading S3 folder: {str(e)}")
             return False
 
-    def sync_data(self, required_files: list[str]) -> bool:
+    def sync_data(self, required_files: List[str], required_dirs: List[str] = None) -> bool:
         """
         Check local data and sync from S3 if needed.
 
         Args:
             required_files: List of required file names
+            required_dirs: List of required directory names
 
         Returns:
-            bool: True if all required files are available after sync
+            bool: True if all required data is available after sync
         """
-        # First check if we have all files locally
-        if self.check_local_data(required_files):
-            print("All required files found locally")
+        if self.check_local_data(required_files, required_dirs):
+            print("All required files and directories found locally")
             return True
 
-        # If not all files exist locally and S3 is configured, try to download
         if self.use_s3:
-            print("Downloading required files from S3...")
-            if self.download_s3_folder():
-                # Verify files after download
-                return self.check_local_data(required_files)
-            return False
+            print("Downloading required data from S3...")
+            if not self.download_s3_folder():
+                return False
+
+            # If we have specific directories to sync
+            if required_dirs:
+                for dir_name in required_dirs:
+                    if not self.download_s3_folder(dir_name):
+                        return False
+
+            return self.check_local_data(required_files, required_dirs)
 
-        print("Missing required files and S3 is not configured")
+        print("Missing required data and S3 is not configured")
         return False
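Note: a minimal sketch of how main.py is expected to drive the updated sync_data(); the StorageManager constructor keyword names shown here are assumptions, since __init__ is not part of this diff:

    from pathlib import Path
    from storage import StorageManager

    # Constructor argument names below are assumed, not taken from this diff.
    manager = StorageManager(
        local_dir=Path("Save_Index_Local"),
        bucket_name="legal-position",
        prefix="Save_Index_Ivan/",
    )
    if not manager.sync_data(
        required_files=["bm25_retriever", "bm25_retriever_short"],
        required_dirs=["chroma_db_hf"],
    ):
        raise FileNotFoundError("Failed to obtain required files")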