DocUA committed on
Commit
5f1f67c
·
0 Parent(s):

Initial commit

Browse files
Files changed (5) hide show
  1. config.py +10 -0
  2. main.py +277 -0
  3. prompts +0 -0
  4. prompts.py +56 -0
  5. requirements.txt +11 -0
config.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
import os
from dotenv import load_dotenv

# Load settings from the .env file into the process environment.
load_dotenv()

# OpenAI API key read from the environment (populated by .env).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail fast at import time when the key is missing, so downstream OpenAI
# calls don't die later with an opaque authentication error.
if not OPENAI_API_KEY:
    raise ValueError("API ключ OpenAI не знайдено. Додайте його в .env файл.")
main.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import re
import gradio as gr
import pandas as pd
import requests
import json
import faiss
import sys
import asyncio
import nest_asyncio
from pathlib import Path
from bs4 import BeautifulSoup
from typing import Union, List

# Allow re-entrant event loops: Gradio callbacks and the llama-index async
# workflow may both need the already-running loop.
nest_asyncio.apply()


from llama_index.core import (
    StorageContext,
    ServiceContext,
    VectorStoreIndex,
    Settings,
    load_index_from_storage
)
from llama_index.llms.openai import OpenAI
from llama_index.core.llms import ChatMessage
from llama_index.core.schema import IndexNode
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core.workflow import Event, Context, Workflow, StartEvent, StopEvent, step
from llama_index.core.schema import NodeWithScore
from llama_index.core.prompts import PromptTemplate
from llama_index.core.response_synthesizers import ResponseMode, get_response_synthesizer

from prompts import CITATION_QA_TEMPLATE, CITATION_REFINE_TEMPLATE

# Constants and Settings
from dotenv import load_dotenv

# Load environment variables from the .env file.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Fail fast with a clear message (mirrors config.py) instead of the
# TypeError that os.environ assignment raises when the key is missing.
if not OPENAI_API_KEY:
    raise ValueError("API ключ OpenAI не знайдено. Додайте його в .env файл.")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Initialize embeddings and global llama-index settings.
embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")
Settings.embed_model = embed_model
Settings.context_window = 20000
Settings.chunk_size = 1024
Settings.similarity_top_k = 20

# Directory holding the persisted docstore / BM25 / FAISS index artifacts.
PERSIST_DIR = "/home/docsa/PycharmProjects/Legal_Position/Save_index"
class RetrieverEvent(Event):
    """Result of running retrieval"""

    # Scored nodes returned by the fusion retriever; consumed by the
    # workflow's synthesize step.
    nodes: list[NodeWithScore]
+
66
+
class CitationQueryEngineWorkflow(Workflow):
    """Two-step workflow: retrieve candidate precedents, then synthesize a cited answer."""

    @step
    async def retrieve(self, ctx: Context, ev: StartEvent) -> Union[RetrieverEvent, None]:
        """Fetch candidate nodes for the incoming query; abort when no query is given."""
        search_query = ev.get("query")
        user_question = ev.get("question")
        if not search_query:
            # Nothing to search for -- stop the workflow here.
            return None

        # Stash both inputs on the workflow context for the synthesize step.
        await ctx.set("query", search_query)
        await ctx.set("question", user_question)

        matched_nodes = retriever_fusion_faiss_bm25.retrieve(search_query)
        return RetrieverEvent(nodes=matched_nodes)

    @step
    async def synthesize(self, ctx: Context, ev: RetrieverEvent) -> StopEvent:
        """Build a citation-annotated answer from the retrieved nodes."""
        search_query = await ctx.get("query", default=None)
        user_question = await ctx.get("question", default=None)

        answer_llm = OpenAI(model="gpt-4o-mini")
        synthesizer = get_response_synthesizer(
            llm=answer_llm,
            text_qa_template=CITATION_QA_TEMPLATE,
            refine_template=CITATION_REFINE_TEMPLATE,
            response_mode=ResponseMode.COMPACT,
            use_async=True,
        )

        answer = await synthesizer.asynthesize(query=search_query, question=user_question, nodes=ev.nodes)
        return StopEvent(result=answer)
+
98
+
def initialize_components():
    """Load persisted docstore/BM25/FAISS artifacts and build the global fusion retriever.

    Returns:
        True on success; False (after printing the error to stderr) on any failure.
    """
    global retriever_fusion_faiss_bm25
    try:
        persist_path = Path(PERSIST_DIR)
        if not persist_path.exists():
            raise FileNotFoundError(f"Directory not found: {persist_path}")

        # All three artifacts must be present before anything is loaded.
        required_files = ['docstore.json', 'bm25_retriever', 'index_faiss']
        missing_files = [name for name in required_files if not (persist_path / name).exists()]
        if missing_files:
            raise FileNotFoundError(f"Missing required files: {', '.join(missing_files)}")

        docstore = SimpleDocumentStore.from_persist_path(str(persist_path / "docstore.json"))
        bm25 = BM25Retriever.from_persist_dir(str(persist_path / "bm25_retriever"))

        # Rebuild the FAISS-backed vector index from its persisted storage context.
        vector_store = FaissVectorStore.from_persist_dir(str(persist_path / "index_faiss"))
        storage_ctx = StorageContext.from_defaults(
            vector_store=vector_store,
            persist_dir=str(persist_path / "index_faiss")
        )
        faiss_index = load_index_from_storage(storage_context=storage_ctx)

        # Fuse BM25 (lexical) and FAISS (dense) rankings via reciprocal rerank.
        retriever_fusion_faiss_bm25 = QueryFusionRetriever(
            [
                bm25,
                faiss_index.as_retriever(similarity_top_k=Settings.similarity_top_k, response_mode="no_text")
            ],
            mode="reciprocal_rerank",
            similarity_top_k=Settings.similarity_top_k,
            num_queries=1,
            use_async=True,
        )
        return True
    except Exception as e:
        print(f"Error initializing components: {str(e)}", file=sys.stderr)
        return False
138
+
139
+
140
+ # Add this function before create_gradio_interface()
141
+
async def process_court_decision(url, question, progress=gr.Progress()):
    """Full pipeline: fetch decision text, derive a legal position, find cited precedents.

    Returns a markdown string (answer + cited sources), or an error message on failure.
    """
    try:
        progress(0, desc="Extracting court decision text...")
        decision_text = extract_court_decision_text(url)

        progress(0.3, desc="Generating legal position...")
        position = generate_legal_position(decision_text, question)

        progress(0.5, desc="Initializing analysis workflow...")
        workflow = CitationQueryEngineWorkflow(timeout=600)

        progress(0.7, desc="Analyzing and finding precedents...")
        result = await workflow.run(query=position["Legal_position"], question=question)

        progress(0.9, desc="Processing results...")
        # Citation markers look like "[3]"; de-duplicate and order numerically.
        cited = re.findall(r'\[(\d+)\]', result.response)
        unique_citations = sorted(set(cited), key=int)

        # Assemble the markdown report from parts (joined once at the end).
        parts = [
            f"**Правова позиція:**\n{position['Title']}: {position['Legal_position']}\n\n",
            f"**Відповідь ШІ:**\n{result.response}\n\n",
            "**Цитовані джерела:**\n",
        ]
        for citation in unique_citations:
            node_idx = int(citation) - 1  # citations are 1-based
            if 0 <= node_idx < len(result.source_nodes):
                parts.append(f"[{citation}]: {result.source_nodes[node_idx].node.metadata['title']}\n")

        progress(1.0, desc="Complete!")
        return "".join(parts)
    except Exception as e:
        return f"Error processing court decision: {str(e)}"
179
+
180
+
181
+ # Also, add the extract_court_decision_text function if it's not already there
# Also, add the extract_court_decision_text function if it's not already there
def extract_court_decision_text(url, timeout=30):
    """Download a court decision page and return its cleaned plain text.

    Args:
        url: Public URL of the court decision page.
        timeout: Max seconds to wait for the HTTP response. New parameter with a
            default, so existing callers are unaffected; previously the request
            had no timeout and could hang indefinitely.

    Returns:
        Text of all <p> paragraphs, with registry boilerplate notices removed.

    Raises:
        requests.RequestException: on network failure or a non-2xx HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail fast on 404/500 instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Registry banner texts that pollute the decision body.
    unwanted_texts = [
        "Доступ до Реєстру здійснюється в тестовому (обмеженому) режимі.",
        "З метою упередження перешкоджанню стабільній роботі Реєстру"
    ]

    paragraphs = []
    for paragraph in soup.find_all('p'):
        text = paragraph.get_text(separator="\n").strip()
        if not any(unwanted_text in text for unwanted_text in unwanted_texts):
            paragraphs.append(text)
    return "\n".join(paragraphs).strip()
197
+
198
+
199
+ # And the generate_legal_position function
def _extract_json_payload(raw):
    """Return the first {...} span in *raw*, tolerating Markdown ```json fences."""
    match = re.search(r"\{.*\}", raw, re.DOTALL)
    return match.group(0) if match else raw


# And the generate_legal_position function
def generate_legal_position(court_decision_text, user_question):
    """Ask the fine-tuned model for a concise legal position on the decision.

    Args:
        court_decision_text: Full text of the court decision.
        user_question: The user's question, passed to the model for context.

    Returns:
        Dict with "Title" and "Legal_position" keys; if the model reply is not
        parseable JSON, a fallback dict carrying the raw reply is returned.
    """
    llm_lp = OpenAI(model="ft:gpt-4o-mini-2024-07-18:dochome:legal-position-100:9wSVvFmd", temperature=0)

    system_prompt = """
    You are a qualified lawyer tasked with creating a Legal Position based on a court decision.
    Your result will be used to search for precedents in the database of existing legal positions of the Supreme Court of Ukraine.
    """

    prompt = f"""To create the legal position:
    1. Carefully read and analyze the Court decision.
    2. Identify the key legal principle or ruling established in the decision.
    3. Summarize this principle concisely, focusing on its legal implications.
    4. Ensure your summary is clear, precise, and uses appropriate legal terminology.

    Format your legal position following these guidelines:
    - Keep it brief, ideally no more than 3-4 sentences.
    - Use appropriate legal terminology.
    - Do not include any additional explanations or comments.

    Text content should be in Ukrainian only!
    Return the result as JSON in the format:
    {{
        "Title": "Brief title of the legal position",
        "Legal_position": "Full text of the legal position"
    }}

    Court decision:
    {court_decision_text}

    User's question:
    {user_question}
    """

    messages = [
        ChatMessage(role="system", content=system_prompt),
        ChatMessage(role="user", content=prompt),
    ]

    response = llm_lp.chat(messages)
    try:
        # Models often wrap JSON in ```json fences; extract the object first
        # so valid payloads don't fall through to the error branch.
        return json.loads(_extract_json_payload(response.message.content))
    except json.JSONDecodeError:
        # If JSON parsing fails, create a structured response
        return {
            "Title": "Error parsing response",
            "Legal_position": response.message.content
        }
247
+
248
+
def create_gradio_interface():
    """Build the Gradio UI: URL + question inputs, analyze button, markdown output."""
    with gr.Blocks() as app:
        gr.Markdown("# Аналізатор судових рішень на основі правових позицій Верховного Суду")

        with gr.Row():
            url_input = gr.Textbox(label="URL судового рішення:")
            question_input = gr.Textbox(label="Ваше питання:")

        analyze_button = gr.Button("Аналізувати")
        output = gr.Markdown(label="Результат аналізу")

        # Pass the coroutine function directly: Gradio runs async handlers on
        # its own event loop, so the previous `lambda: asyncio.run(...)` wrapper
        # (which needs nest_asyncio inside a running loop and bypassed progress
        # tracking) is unnecessary.
        analyze_button.click(
            fn=process_court_decision,
            inputs=[url_input, question_input],
            outputs=output
        )

    return app
268
+
269
+
if __name__ == "__main__":
    # Guard clause: bail out early if the persisted indexes cannot be loaded.
    if not initialize_components():
        print("Failed to initialize components. Please check the paths and try again.", file=sys.stderr)
        sys.exit(1)
    print("Components initialized successfully!")
    demo = create_gradio_interface()
    # share=True publishes a temporary public Gradio link.
    demo.launch(share=True)
prompts ADDED
File without changes
prompts.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Citation prompt templates for the Supreme Court legal-position QA engine."""

from llama_index.core.prompts import PromptTemplate

# Initial QA prompt: answer a new legal issue in Ukrainian, citing numbered
# sources "[N]" drawn from {context_str}; also receives {query_str} and {question}.
CITATION_QA_TEMPLATE = PromptTemplate(
    "You are a qualified lawyer. "
    "Please provide a legal analysis based solely on the provided legal precedents and positions. "
    "When referencing a legal precedent or position, "
    "cite the appropriate source(s) using their corresponding numbers. "
    "Every answer should include at least one legal source citation relevant to the query. "
    "Only cite a source when you are explicitly referencing it. "
    "If none of the legal precedents or positions are helpful, you should indicate that. "
    "Use only the Ukrainian language to answer. "
    "For example:\n"
    "Source 1:\n"
    "The court ruled that contractual obligations must be fulfilled even if the terms are vague.\n"
    "Source 2:\n"
    "In case of unforeseen circumstances, contracts may be voided under certain conditions.\n"
    "Query: Can a contract be voided due to unforeseen circumstances?\n"
    "Answer: A contract may be voided under certain conditions if unforeseen circumstances arise [2]. "
    "Now it's your turn. Below are several numbered legal sources and precedents (legal positions):"
    "\n------\n"
    "{context_str}"
    "\n------\n"
    "New legal issue: {query_str}\n"
    "User question: {question}\n"
    "Answer: "
)

# Refine prompt: improve {existing_answer} using extra sources in {context_msg},
# repeating the existing answer when the new sources are unhelpful.
CITATION_REFINE_TEMPLATE = PromptTemplate(
    "You are a qualified lawyer. "
    "Please refine the legal analysis based solely on the provided legal precedents and positions. "
    "When referencing a legal precedent or position, "
    "cite the appropriate source(s) using their corresponding numbers. "
    "Every refined answer should include at least one relevant legal source citation. "
    "Only cite a source when you are explicitly referencing it. "
    "If none of the legal precedents or positions are helpful, you should repeat the existing answer. "
    "Use only the Ukrainian language to answer."
    "For example:\n"
    "Source 1:\n"
    "The court ruled that contractual obligations must be fulfilled even if the terms are vague.\n"
    "Source 2:\n"
    "In case of unforeseen circumstances, contracts may be voided under certain conditions.\n"
    "Query: Can a contract be voided due to unforeseen circumstances?\n"
    "Existing answer: A contract may be voided under certain conditions if unforeseen circumstances arise [2].\n"
    "Now it's your turn. "
    "We have provided an existing legal analysis: {existing_answer}"
    "Below are several numbered legal sources and precedents. "
    "Use them to refine the existing legal answer. "
    "If the provided legal sources are not helpful, you will repeat the existing answer."
    "\nBegin refining!"
    "\n------\n"
    "{context_msg}"
    "\n------\n"
    "New legal issue: {query_str}\n"
    "User question: {question}\n"
    "Answer: "
)
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ llama-index
2
+ llama-index-readers-file
3
+ llama-index-vector-stores-faiss
4
+ llama-index-retrievers-bm25
5
+ openai
6
+ faiss-cpu
7
+ llama-index-embeddings-openai
8
+ llama-index-llms-openai
9
+ gradio
10
+ beautifulsoup4
11
+ nest-asyncio