Spaces:
Runtime error
Runtime error
Commit
·
920001b
1
Parent(s):
a171113
Add basic BM25 search and corpus generation
Browse files- .gitignore +5 -0
- healthcheck_bm25.py +28 -0
- init_bm25.py +298 -0
- main.py +77 -53
- poetry.lock +0 -0
- pyproject.toml +41 -0
- requirements.txt +1 -0
.gitignore
CHANGED
@@ -7,6 +7,7 @@
|
|
7 |
# Ігноруємо кеші Python
|
8 |
__pycache__/
|
9 |
*.pyc
|
|
|
10 |
|
11 |
# Ігноруємо конфіденційні файли
|
12 |
.env
|
@@ -15,3 +16,7 @@ __pycache__/
|
|
15 |
Save_index/
|
16 |
/lp/
|
17 |
/Save_Index_Local/
|
|
|
|
|
|
|
|
|
|
7 |
# Ігноруємо кеші Python
|
8 |
__pycache__/
|
9 |
*.pyc
|
10 |
+
.gradio/
|
11 |
|
12 |
# Ігноруємо конфіденційні файли
|
13 |
.env
|
|
|
16 |
Save_index/
|
17 |
/lp/
|
18 |
/Save_Index_Local/
|
19 |
+
|
20 |
+
# Ігноруємо дані для генерації корпуса
|
21 |
+
*.csv
|
22 |
+
*.xlsx
|
healthcheck_bm25.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Manual smoke test for the persisted BM25 index.

Loads the BM25Retriever persisted by init_bm25.py, fetches one court
decision's text from the public registry (via main.extract_court_decision_text,
which performs a live HTTP request) and prints the ranked matches.
"""
from pathlib import Path

from llama_index.retrievers.bm25 import BM25Retriever

from main import extract_court_decision_text


# Directory where init_bm25.py persisted the retriever.
PERSIST_PATH = Path("Save_Index_Local")

INDEX_NAME = "bm25_retriever"
# INDEX_NAME = "bm25_retriever_meta"  # alternate index built with USE_META=True

# A known court decision used as the smoke-test query.
TEST_CD_URL = "https://reyestr.court.gov.ua/Review/118766467"
# TEST_CD_URL = "https://reyestr.court.gov.ua/Review/118763429"

# Set to True to also dump the fetched decision text before the results.
PRINT_CD = False


retriever = BM25Retriever.from_persist_dir(str(PERSIST_PATH / INDEX_NAME))

court_decision_text = extract_court_decision_text(TEST_CD_URL)

if PRINT_CD:
    print(court_decision_text, "\n\n\n\n\n")

# Print rank, BM25 score and the legal-position title stored in node metadata.
nodes_with_score = retriever.retrieve(court_decision_text)
for index, node_with_score in enumerate(nodes_with_score, start=1):
    source_title = node_with_score.node.metadata.get("title", "Невідомий заголовок")
    print(index, f"{node_with_score.score:.4f}", source_title, "\n", sep="\t")
|
init_bm25.py
ADDED
@@ -0,0 +1,298 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import unicodedata
|
3 |
+
from pathlib import Path
|
4 |
+
|
5 |
+
import bm25s
|
6 |
+
import pandas as pd
|
7 |
+
from llama_index.core import Document
|
8 |
+
from llama_index.core.schema import MetadataMode
|
9 |
+
from llama_index.core.vector_stores.utils import node_to_metadata_dict
|
10 |
+
from llama_index.retrievers.bm25 import BM25Retriever
|
11 |
+
|
12 |
+
|
13 |
+
|
14 |
+
# Where the built retriever is persisted (shared with main.py / healthcheck).
PERSIST_PATH = Path("Save_Index_Local")
# Source spreadsheet with legal positions (read via pandas + openpyxl).
LP_INFO_FILE = "legal_position_with_categories_documents_all.xlsx"

INDEX_NAME = "bm25_retriever"
USE_META = False  # when True, index rich metadata (categories, doc ids)

# INDEX_NAME = "bm25_retriever_meta"
# USE_META = True
|
22 |
+
|
23 |
+
|
24 |
+
def clean_string(text: pd.Series) -> pd.Series:
    """Normalize a pandas text column for indexing.

    Fills missing values with "", NFKC-normalizes (expands compatibility
    forms, e.g. NBSP -> space, "½" -> "1⁄2"), then applies punctuation
    substitutions: «» -> ", § -> №, fraction slash ⁄ -> /.

    Args:
        text: string Series, possibly containing NaN.

    Returns:
        pd.Series: cleaned strings, never NaN.
    """
    text = text.fillna("")

    # Normalize FIRST. The original ran NFKC *after* the substitutions, which
    # silently undid the "§" -> "№" mapping: NFKC decomposes "№" (U+2116)
    # into the two characters "No".
    text = text.apply(lambda t: unicodedata.normalize("NFKC", t))  # type: ignore

    text = text.str.replace(r"«|»", '"', regex=True)
    # BUG FIX: the original passed r"\xa0" — a literal 4-character string
    # under pandas >= 2.0 where str.replace defaults to regex=False — so the
    # NBSP was never matched. Replace the actual character, explicitly literal.
    text = text.str.replace("\xa0", " ", regex=False)
    text = text.str.replace("§", "№", regex=False)
    # NFKC expands unicode fractions into digits joined by U+2044; make the
    # fraction slash an ASCII slash.
    text = text.str.replace("⁄", "/", regex=False)

    return text
|
35 |
+
|
36 |
+
|
37 |
+
def find_matching_pattern(categories):
    """
    Search for a Supreme Court chamber name in the categories and return the
    first match found.

    Args:
        categories: a string, a list of strings, or a list mixing strings
            and lists of strings.

    Returns:
        str: matching pattern, or "" if no match found (also for unsupported
        input types such as NaN).
    """
    patterns = [
        "Велика Палата",
        "Касаційний кримінальний суд",
        "Касаційний адміністративний суд",
        "Касаційний господарський суд",
        "Касаційний цивільний суд",
    ]

    # Handle both string and list inputs
    if isinstance(categories, str):
        categories = [categories]
    elif isinstance(categories, list):
        # BUG FIX: the original flattened unconditionally
        # ([item for sublist in categories for item in sublist]), which split
        # plain strings into single characters, so a list of category strings
        # could never match any pattern. Flatten only nested lists.
        flat = []
        for item in categories:
            if isinstance(item, list):
                flat.extend(item)
            else:
                flat.append(item)
        categories = flat
    else:
        # e.g. NaN coming from a pandas cell — nothing to match.
        return ""

    # Search for patterns; skip non-string entries defensively.
    for category in categories:
        if not isinstance(category, str):
            continue
        for pattern in patterns:
            if pattern in category:
                return pattern
    return ""
|
68 |
+
|
69 |
+
|
70 |
+
# Two hand-curated Ukrainian stopword lists. Duplicates within and across
# the lists are tolerated: the final `ukrainian_stopwords` de-duplicates
# them via set().
ukrainian_stopwords_1 = [
    "я", "ти", "він", "вона", "воно", "ми", "ви", "вони",
    "це", "той", "така", "таке", "такі", "цей",
    "моя", "твоя", "його", "її", "наш", "ваш", "їх",
    "де", "чи", "а", "але", "і", "або", "так", "ні", "чи",
    "в", "на", "з", "до", "під", "через", "після", "між", "серед",
    "без", "для", "про", "о", "за", "від", "до",
    "як", "якби", "коли", "де", "тому", "тому що", "що", "чому", "хто", "що",
    "якось", "коли-небудь", "де-небудь", "чимало",
]

ukrainian_stopwords_2 = [
    # Articles
    "і", "й", "у", "в", "та", "і",
    # Pronouns
    "я", "ти", "він", "вона", "воно", "ми", "ви", "вони",
    "мене", "тебе", "його", "її", "нас", "вас", "їх",
    "мій", "твій", "наш", "ваш", "свій",
    # Prepositions
    "з", "до", "від", "біля", "над", "під", "через", "для", "без",
    "між", "серед", "крізь", "понад", "поза", "крім",
    # Conjunctions
    "та", "і", "але", "або", "однак", "проте", "тому", "тому що",
    "оскільки", "якщо", "коли", "хоча",
    # Auxiliary words
    "так", "ні", "не", "бути", "мати", "можна", "треба",
    # Common filler words
    "цей", "той", "це", "те", "такий", "який", "котрий",
    # Modal words
    "мабуть", "напевно", "звичайно", "можливо",
    # Particles
    "ось", "ніби", "майже", "майже що", "саме", "лише", "тільки",
]

# Merged, de-duplicated stopword list handed to the BM25 tokenizer.
ukrainian_stopwords = list(set(ukrainian_stopwords_1 + ukrainian_stopwords_2))
|
223 |
+
|
224 |
+
|
225 |
+
# Load the source spreadsheet of legal positions (xlsx; needs openpyxl).
final_df = pd.read_excel(LP_INFO_FILE)

if USE_META:
    # Columns named category_1, category_2, ... hold per-row category labels.
    category_columns = [
        col for col in final_df.columns if re.match(r"category_\d+$", col)
    ]

    text_columns = ["title", "text_lp", "category_all"] + category_columns
    final_df[text_columns] = final_df[text_columns].apply(clean_string)

    # Join all non-empty category cells into one searchable string per row.
    final_df["category_search"] = final_df[category_columns].apply(
        lambda row: ", ".join([str(val) for val in row if pd.notna(val)]), axis=1
    )
    final_df["category_filter"] = final_df["category_all"].apply(find_matching_pattern)

    # Rich-metadata documents; doc_id/category_filter are excluded from both
    # the embed and LLM views, so only searchable text reaches the index.
    legal_position_title_category = [
        Document(
            text=row["text_lp"],  # type: ignore
            metadata={  # type: ignore
                "lp_id": row["id"],
                "title": row["title"],
                "doc_id": row["document_ids"],
                "category_filter": find_matching_pattern(row["category_all"]),
                "category_search": row["category_search"],
            },
            excluded_embed_metadata_keys=["doc_id", "category_filter"],
            excluded_llm_metadata_keys=["doc_id", "category_filter"],
        )
        for _, row in final_df.iterrows()
    ]
else:
    final_df[["title", "text_lp"]] = final_df[["title", "text_lp"]].apply(clean_string)
    # Minimal variant: only the title is kept as metadata, and it is excluded
    # from embed/LLM views, so BM25 indexes the bare position text.
    legal_position_title_category = [
        Document(
            text=row["text_lp"],  # type: ignore
            metadata={  # type: ignore
                "title": row["title"],
            },
            excluded_embed_metadata_keys=["title"],
            excluded_llm_metadata_keys=["title"],
        )
        for _, row in final_df.iterrows()
    ]


# Copied from BM25Retriever __init__ method, but note that output looks awful
# and might work worse (this needs checking).
corpus = [node_to_metadata_dict(node) for node in legal_position_title_category]
corpus_tokens = bm25s.tokenize(
    [
        node.get_content(metadata_mode=MetadataMode.EMBED)
        for node in legal_position_title_category
    ],
    stopwords=ukrainian_stopwords,
)

existing_bm25 = bm25s.BM25(
    # NOTE(review): BM25's b parameter is conventionally in [0, 1];
    # b=1.25 is outside that range — confirm these tuning values are intended.
    k1=1.88,
    b=1.25,
    delta=0.5,
    method="robertson",
    # No corpus is saved without this line:
    corpus=corpus,  # prevents TypeError: 'NoneType' object is not subscriptable
)
existing_bm25.index(corpus=corpus_tokens)

bm25_retriever = BM25Retriever(
    existing_bm25=existing_bm25,
    similarity_top_k=20,
)

bm25_retriever.persist(str(PERSIST_PATH / INDEX_NAME))

# Round-trip sanity check: from_persist_dir raises on an invalid corpus.
loaded_retriever = BM25Retriever.from_persist_dir(str(PERSIST_PATH / INDEX_NAME))
|
main.py
CHANGED
@@ -1,32 +1,36 @@
|
|
|
|
1 |
import os
|
2 |
import re
|
3 |
-
import gradio as gr
|
4 |
-
import requests
|
5 |
-
import nest_asyncio
|
6 |
import sys
|
7 |
-
import boto3
|
8 |
-
|
9 |
from pathlib import Path
|
10 |
-
from bs4 import BeautifulSoup
|
11 |
-
from llama_index.core import (
|
12 |
-
Settings,
|
13 |
-
)
|
14 |
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
from llama_index.core.retrievers import QueryFusionRetriever
|
|
|
17 |
|
18 |
|
19 |
-
from dotenv import load_dotenv
|
20 |
-
|
21 |
load_dotenv()
|
22 |
|
23 |
-
Settings.similarity_top_k = 20
|
24 |
|
25 |
# Параметри S3
|
26 |
BUCKET_NAME = "legal-position"
|
27 |
PREFIX_RETRIEVER = "Save_Index/" # Префікс для всього вмісту, який потрібно завантажити
|
28 |
LOCAL_DIR = Path("Save_Index_Local") # Локальна директорія для збереження даних з S3
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
# Ініціалізація клієнта S3
|
32 |
s3_client = boto3.client(
|
@@ -36,9 +40,6 @@ s3_client = boto3.client(
|
|
36 |
region_name="eu-north-1"
|
37 |
)
|
38 |
|
39 |
-
# Створюємо локальну директорію, якщо вона не існує
|
40 |
-
LOCAL_DIR.mkdir(parents=True, exist_ok=True)
|
41 |
-
|
42 |
# Функція для завантаження файлу з S3
|
43 |
def download_s3_file(bucket_name, s3_key, local_path):
|
44 |
s3_client.download_file(bucket_name, s3_key, str(local_path))
|
@@ -73,60 +74,66 @@ def parse_doc_ids(doc_ids):
|
|
73 |
if doc_ids is None:
|
74 |
return []
|
75 |
if isinstance(doc_ids, list):
|
76 |
-
return [str(id).strip(
|
77 |
if isinstance(doc_ids, str):
|
78 |
-
cleaned = doc_ids.strip(
|
79 |
if cleaned:
|
80 |
-
return [id.strip() for id in cleaned.split(
|
81 |
return []
|
82 |
|
|
|
83 |
def get_links_html(doc_ids):
|
84 |
parsed_ids = parse_doc_ids(doc_ids)
|
85 |
if not parsed_ids:
|
86 |
return ""
|
87 |
-
links = [
|
88 |
-
|
|
|
|
|
89 |
return ", ".join(links)
|
90 |
|
|
|
91 |
def parse_lp_ids(lp_ids):
|
92 |
if lp_ids is None:
|
93 |
return []
|
94 |
if isinstance(lp_ids, (str, int)):
|
95 |
-
cleaned = str(lp_ids).strip(
|
96 |
if cleaned:
|
97 |
return [cleaned]
|
98 |
return []
|
99 |
|
|
|
100 |
def get_links_html_lp(lp_ids):
|
101 |
parsed_ids = parse_lp_ids(lp_ids)
|
102 |
if not parsed_ids:
|
103 |
return ""
|
104 |
-
links = [
|
|
|
|
|
|
|
105 |
return ", ".join(links)
|
106 |
|
107 |
|
108 |
def initialize_components():
|
109 |
try:
|
110 |
-
|
111 |
-
|
112 |
-
if not persist_path.exists():
|
113 |
-
raise FileNotFoundError(f"Directory not found: {persist_path}")
|
114 |
|
115 |
-
required_files = [
|
116 |
-
missing_files = [f for f in required_files if not (
|
117 |
|
118 |
if missing_files:
|
119 |
raise FileNotFoundError(f"Missing required files: {', '.join(missing_files)}")
|
120 |
|
121 |
global retriever_bm25
|
122 |
|
123 |
-
bm25_retriever = BM25Retriever.from_persist_dir(str(
|
124 |
|
125 |
retriever_bm25 = QueryFusionRetriever(
|
126 |
[
|
127 |
bm25_retriever,
|
128 |
],
|
129 |
-
similarity_top_k=Settings.similarity_top_k,
|
130 |
num_queries=1,
|
131 |
use_async=True,
|
132 |
)
|
@@ -138,15 +145,15 @@ def initialize_components():
|
|
138 |
|
139 |
def extract_court_decision_text(url):
|
140 |
response = requests.get(url)
|
141 |
-
soup = BeautifulSoup(response.content,
|
142 |
|
143 |
unwanted_texts = [
|
144 |
"Доступ до Реєстру здійснюється в тестовому (обмеженому) режимі.",
|
145 |
-
"З метою упередження перешкоджанню стабільній роботі Реєстру"
|
146 |
]
|
147 |
|
148 |
decision_text = ""
|
149 |
-
for paragraph in soup.find_all(
|
150 |
text = paragraph.get_text(separator="\n").strip()
|
151 |
if not any(unwanted_text in text for unwanted_text in unwanted_texts):
|
152 |
decision_text += text + "\n"
|
@@ -158,11 +165,13 @@ async def search_without_ai_action(url):
|
|
158 |
court_decision_text = extract_court_decision_text(url)
|
159 |
nodes = await retriever_bm25.aretrieve(court_decision_text)
|
160 |
|
161 |
-
search_output_content =
|
|
|
|
|
162 |
for index, node in enumerate(nodes, start=1):
|
163 |
-
source_title = node.node.metadata.get(
|
164 |
-
doc_ids = node.node.metadata.get(
|
165 |
-
lp_ids = node.node.metadata.get(
|
166 |
links = get_links_html(doc_ids)
|
167 |
links_lp = get_links_html_lp(lp_ids)
|
168 |
|
@@ -172,20 +181,20 @@ async def search_without_ai_action(url):
|
|
172 |
except Exception as e:
|
173 |
return f"Error during search: {str(e)}", None
|
174 |
|
|
|
175 |
async def search_without_ai_action_text(question_input):
|
176 |
try:
|
177 |
nodes = await retriever_bm25.aretrieve(question_input)
|
178 |
|
179 |
search_output_content = f"**Результати пошуку (наявні правові позиції ВС) за текстовим запитом:** \n\n"
|
180 |
for index, node in enumerate(nodes, start=1):
|
181 |
-
source_title = node.node.metadata.get(
|
182 |
-
doc_ids = node.node.metadata.get(
|
183 |
-
lp_ids = node.node.metadata.get(
|
184 |
links = get_links_html(doc_ids)
|
185 |
links_lp = get_links_html_lp(lp_ids)
|
186 |
search_output_content += f"\n[{index}] *{source_title}* ⚖️ {links_lp} | {links} 👉 Score: {node.score} \n"
|
187 |
|
188 |
-
|
189 |
return search_output_content, nodes
|
190 |
except Exception as e:
|
191 |
return f"Error during search: {str(e)}", None
|
@@ -195,7 +204,9 @@ def create_gradio_interface():
|
|
195 |
with gr.Blocks() as app:
|
196 |
gr.Markdown("# Знаходьте правові позиції Верховного Суду")
|
197 |
|
198 |
-
input_field = gr.Textbox(
|
|
|
|
|
199 |
search_button = gr.Button("Пошук", interactive=False)
|
200 |
warning_message = gr.Markdown(visible=False)
|
201 |
|
@@ -204,7 +215,9 @@ def create_gradio_interface():
|
|
204 |
state_nodes = gr.State()
|
205 |
|
206 |
async def search_action(input_text):
|
207 |
-
if re.match(
|
|
|
|
|
208 |
return await search_without_ai_action(input_text)
|
209 |
else:
|
210 |
return await search_without_ai_action_text(input_text)
|
@@ -212,33 +225,44 @@ def create_gradio_interface():
|
|
212 |
def update_button_state(text):
|
213 |
text = text.strip()
|
214 |
if not text:
|
215 |
-
return gr.update(value="Пошук", interactive=False), gr.update(
|
|
|
|
|
216 |
elif re.match(r"^https://reyestr\.court\.gov\.ua/Review/\d+$", text):
|
217 |
-
return gr.update(value="Пошук за URL", interactive=True), gr.update(
|
|
|
|
|
218 |
elif text.startswith("http"):
|
219 |
-
return gr.update(value="Пошук", interactive=False), gr.update(
|
|
|
|
|
|
|
220 |
else:
|
221 |
-
return gr.update(value="Пошук за текстом", interactive=True), gr.update(
|
|
|
|
|
222 |
|
223 |
search_button.click(
|
224 |
-
fn=search_action,
|
225 |
-
inputs=input_field,
|
226 |
-
outputs=[search_output, state_nodes]
|
227 |
)
|
228 |
|
229 |
input_field.change(
|
230 |
fn=update_button_state,
|
231 |
inputs=input_field,
|
232 |
-
outputs=[search_button, warning_message]
|
233 |
)
|
234 |
|
235 |
return app
|
236 |
|
|
|
237 |
if __name__ == "__main__":
|
238 |
if initialize_components():
|
239 |
print("Components initialized successfully!")
|
240 |
app = create_gradio_interface()
|
241 |
app.launch(share=True)
|
242 |
else:
|
243 |
-
print(
|
|
|
|
|
|
|
244 |
sys.exit(1)
|
|
|
1 |
+
import asyncio
|
2 |
import os
|
3 |
import re
|
|
|
|
|
|
|
4 |
import sys
|
|
|
|
|
5 |
from pathlib import Path
|
|
|
|
|
|
|
|
|
6 |
|
7 |
+
import boto3
|
8 |
+
import gradio as gr
|
9 |
+
import nest_asyncio
|
10 |
+
import requests
|
11 |
+
from bs4 import BeautifulSoup
|
12 |
+
from dotenv import load_dotenv
|
13 |
+
from llama_index.core import Settings
|
14 |
from llama_index.core.retrievers import QueryFusionRetriever
|
15 |
+
from llama_index.retrievers.bm25 import BM25Retriever
|
16 |
|
17 |
|
|
|
|
|
18 |
load_dotenv()
|
19 |
|
20 |
+
# Number of results returned by the retriever (read in initialize_components).
Settings.similarity_top_k = 20  # type: ignore

# S3 parameters
BUCKET_NAME = "legal-position"
PREFIX_RETRIEVER = "Save_Index/"  # prefix of all the content to download
LOCAL_DIR = Path("Save_Index_Local")  # local directory for data pulled from S3

# Index parameters (must match what init_bm25.py persisted)
PERSIST_PATH = Path("Save_Index_Local")
INDEX_NAME = "bm25_retriever"
# INDEX_NAME = "bm25_retriever_meta"

# Create the local directory if it does not already exist
LOCAL_DIR.mkdir(parents=True, exist_ok=True)
|
34 |
|
35 |
# Ініціалізація клієнта S3
|
36 |
s3_client = boto3.client(
|
|
|
40 |
region_name="eu-north-1"
|
41 |
)
|
42 |
|
|
|
|
|
|
|
43 |
# Helper to download a single file from S3
def download_s3_file(bucket_name, s3_key, local_path):
    """Download the object at *s3_key* in *bucket_name* to *local_path*."""
    s3_client.download_file(bucket_name, s3_key, str(local_path))
|
|
|
74 |
def parse_doc_ids(doc_ids):
    """Normalize a raw document-id value into a list of id strings.

    Accepts None, a list of ids, or a string like "[123, 456]" (brackets
    and spaces are stripped, entries are comma-separated).

    Returns:
        list[str]: cleaned id strings; [] for None/empty/unsupported input.
    """
    if doc_ids is None:
        return []
    if isinstance(doc_ids, list):
        # Renamed loop variable: the original shadowed the builtin `id`.
        return [str(doc_id).strip("[]") for doc_id in doc_ids]
    if isinstance(doc_ids, str):
        cleaned = doc_ids.strip("[]").replace(" ", "")
        if cleaned:
            return [doc_id.strip() for doc_id in cleaned.split(",")]
    return []
|
83 |
|
84 |
+
|
85 |
def get_links_html(doc_ids):
    """Render court-decision ids as comma-separated markdown registry links."""
    ids = parse_doc_ids(doc_ids)
    if not ids:
        return ""
    return ", ".join(
        f"[Рішення ВС: {doc_id}](https://reyestr.court.gov.ua/Review/{doc_id})"
        for doc_id in ids
    )
|
94 |
|
95 |
+
|
96 |
def parse_lp_ids(lp_ids):
    """Normalize a raw legal-position id (str or int) into a 1-element list.

    Brackets are stripped from the ends, spaces removed; anything else
    (None, floats, empty strings) yields [].
    """
    if not isinstance(lp_ids, (str, int)):
        return []
    normalized = str(lp_ids).strip("[]").replace(" ", "")
    return [normalized] if normalized else []
|
104 |
|
105 |
+
|
106 |
def get_links_html_lp(lp_ids):
    """Render legal-position ids as comma-separated markdown search links."""
    ids = parse_lp_ids(lp_ids)
    if not ids:
        return ""
    return ", ".join(
        f"[Правова позиція ВС: {lp_id}](https://lpd.court.gov.ua/home/search/{lp_id})"
        for lp_id in ids
    )
|
115 |
|
116 |
|
117 |
def initialize_components():
|
118 |
try:
|
119 |
+
if not PERSIST_PATH.exists():
|
120 |
+
raise FileNotFoundError(f"Directory not found: {PERSIST_PATH}")
|
|
|
|
|
121 |
|
122 |
+
required_files = [INDEX_NAME]
|
123 |
+
missing_files = [f for f in required_files if not (PERSIST_PATH / f).exists()]
|
124 |
|
125 |
if missing_files:
|
126 |
raise FileNotFoundError(f"Missing required files: {', '.join(missing_files)}")
|
127 |
|
128 |
global retriever_bm25
|
129 |
|
130 |
+
bm25_retriever = BM25Retriever.from_persist_dir(str(PERSIST_PATH / INDEX_NAME))
|
131 |
|
132 |
retriever_bm25 = QueryFusionRetriever(
|
133 |
[
|
134 |
bm25_retriever,
|
135 |
],
|
136 |
+
similarity_top_k=Settings.similarity_top_k, # type: ignore
|
137 |
num_queries=1,
|
138 |
use_async=True,
|
139 |
)
|
|
|
145 |
|
146 |
def extract_court_decision_text(url):
|
147 |
response = requests.get(url)
|
148 |
+
soup = BeautifulSoup(response.content, "html.parser")
|
149 |
|
150 |
unwanted_texts = [
|
151 |
"Доступ до Реєстру здійснюється в тестовому (обмеженому) режимі.",
|
152 |
+
"З метою упередження перешкоджанню стабільній роботі Реєстру",
|
153 |
]
|
154 |
|
155 |
decision_text = ""
|
156 |
+
for paragraph in soup.find_all("p"):
|
157 |
text = paragraph.get_text(separator="\n").strip()
|
158 |
if not any(unwanted_text in text for unwanted_text in unwanted_texts):
|
159 |
decision_text += text + "\n"
|
|
|
165 |
court_decision_text = extract_court_decision_text(url)
|
166 |
nodes = await retriever_bm25.aretrieve(court_decision_text)
|
167 |
|
168 |
+
search_output_content = (
|
169 |
+
f"**Результати пошуку (наявні правові позиції ВС) за посиланням:** \n\n"
|
170 |
+
)
|
171 |
for index, node in enumerate(nodes, start=1):
|
172 |
+
source_title = node.node.metadata.get("title", "Невідомий заголовок")
|
173 |
+
doc_ids = node.node.metadata.get("doc_id")
|
174 |
+
lp_ids = node.node.metadata.get("lp_id")
|
175 |
links = get_links_html(doc_ids)
|
176 |
links_lp = get_links_html_lp(lp_ids)
|
177 |
|
|
|
181 |
except Exception as e:
|
182 |
return f"Error during search: {str(e)}", None
|
183 |
|
184 |
+
|
185 |
async def search_without_ai_action_text(question_input):
    """Run BM25 retrieval for a free-text query and format markdown results.

    Args:
        question_input: free-text user query from the Gradio textbox.

    Returns:
        tuple: (markdown string with ranked results, retrieved nodes) on
        success, or (error message string, None) on any failure.
    """
    try:
        nodes = await retriever_bm25.aretrieve(question_input)

        search_output_content = f"**Результати пошуку (наявні правові позиції ВС) за текстовим запитом:** \n\n"
        for index, node in enumerate(nodes, start=1):
            # Metadata keys written by init_bm25.py; title falls back to
            # "Невідомий заголовок" ("unknown title") when absent.
            source_title = node.node.metadata.get("title", "Невідомий заголовок")
            doc_ids = node.node.metadata.get("doc_id")
            lp_ids = node.node.metadata.get("lp_id")
            links = get_links_html(doc_ids)
            links_lp = get_links_html_lp(lp_ids)
            search_output_content += f"\n[{index}] *{source_title}* ⚖️ {links_lp} | {links} 👉 Score: {node.score} \n"

        return search_output_content, nodes
    except Exception as e:
        # Surface the failure as UI text instead of raising inside Gradio.
        return f"Error during search: {str(e)}", None
|
|
|
204 |
with gr.Blocks() as app:
|
205 |
gr.Markdown("# Знаходьте правові позиції Верховного Суду")
|
206 |
|
207 |
+
input_field = gr.Textbox(
|
208 |
+
label="Введіть текст або посилання на судове рішення", lines=1
|
209 |
+
)
|
210 |
search_button = gr.Button("Пошук", interactive=False)
|
211 |
warning_message = gr.Markdown(visible=False)
|
212 |
|
|
|
215 |
state_nodes = gr.State()
|
216 |
|
217 |
async def search_action(input_text):
|
218 |
+
if re.match(
|
219 |
+
r"^https://reyestr\.court\.gov\.ua/Review/\d+$", input_text.strip()
|
220 |
+
):
|
221 |
return await search_without_ai_action(input_text)
|
222 |
else:
|
223 |
return await search_without_ai_action_text(input_text)
|
|
|
225 |
def update_button_state(text):
|
226 |
text = text.strip()
|
227 |
if not text:
|
228 |
+
return gr.update(value="Пошук", interactive=False), gr.update(
|
229 |
+
visible=False
|
230 |
+
)
|
231 |
elif re.match(r"^https://reyestr\.court\.gov\.ua/Review/\d+$", text):
|
232 |
+
return gr.update(value="Пошук за URL", interactive=True), gr.update(
|
233 |
+
visible=False
|
234 |
+
)
|
235 |
elif text.startswith("http"):
|
236 |
+
return gr.update(value="Пошук", interactive=False), gr.update(
|
237 |
+
value="Неправильний формат URL. Використовуйте посилання формату https://reyestr.court.gov.ua/Review/{doc_id}",
|
238 |
+
visible=True,
|
239 |
+
)
|
240 |
else:
|
241 |
+
return gr.update(value="Пошук за текстом", interactive=True), gr.update(
|
242 |
+
visible=False
|
243 |
+
)
|
244 |
|
245 |
search_button.click(
|
246 |
+
fn=search_action, inputs=input_field, outputs=[search_output, state_nodes]
|
|
|
|
|
247 |
)
|
248 |
|
249 |
input_field.change(
|
250 |
fn=update_button_state,
|
251 |
inputs=input_field,
|
252 |
+
outputs=[search_button, warning_message],
|
253 |
)
|
254 |
|
255 |
return app
|
256 |
|
257 |
+
|
258 |
if __name__ == "__main__":
    # Validate the persisted index files and build the global retriever
    # before exposing the UI; bail out with a non-zero exit code otherwise.
    if initialize_components():
        print("Components initialized successfully!")
        app = create_gradio_interface()
        # share=True publishes a temporary public Gradio link.
        app.launch(share=True)
    else:
        print(
            "Failed to initialize components. Please check the paths and try again.",
            file=sys.stderr,
        )
        sys.exit(1)
|
poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pyproject.toml
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[tool.poetry]
|
2 |
+
name = "prototype"
|
3 |
+
version = "0.1.0"
|
4 |
+
description = ""
|
5 |
+
authors = ["Ivan Lytvynenko <[email protected]>"]
|
6 |
+
readme = "README.md"
|
7 |
+
|
8 |
+
[tool.poetry.dependencies]
|
9 |
+
python = "^3.10"
|
10 |
+
llama-index = "^0.12.3"
|
11 |
+
llama-index-readers-file = "^0.4.1"
|
12 |
+
llama-index-vector-stores-faiss = "^0.3.0"
|
13 |
+
llama-index-retrievers-bm25 = "^0.5.0"
|
14 |
+
openai = "^1.57.0"
|
15 |
+
faiss-cpu = "^1.9.0.post1"
|
16 |
+
llama-index-embeddings-openai = "^0.3.1"
|
17 |
+
llama-index-llms-openai = "^0.3.2"
|
18 |
+
gradio = "^5.8.0"
|
19 |
+
beautifulsoup4 = "^4.12.3"
|
20 |
+
nest-asyncio = "^1.6.0"
|
21 |
+
boto3 = "^1.35.76"
|
22 |
+
python-dotenv = "^1.0.1"
|
23 |
+
openpyxl = "^3.1.5"
|
24 |
+
|
25 |
+
[tool.pyright]
|
26 |
+
venvPath = "."
|
27 |
+
venv = ".venv"
|
28 |
+
|
29 |
+
[tool.black]
|
30 |
+
line-length = 90
|
31 |
+
target-version = ["py311"]
|
32 |
+
|
33 |
+
[tool.isort]
|
34 |
+
src_paths = ["src"]
|
35 |
+
profile = "black"
|
36 |
+
line_length = 90
|
37 |
+
lines_after_imports = 2
|
38 |
+
|
39 |
+
[build-system]
|
40 |
+
requires = ["poetry-core"]
|
41 |
+
build-backend = "poetry.core.masonry.api"
|
requirements.txt
CHANGED
@@ -11,3 +11,4 @@ beautifulsoup4
|
|
11 |
nest-asyncio
|
12 |
boto3
|
13 |
python-dotenv
|
|
|
|
11 |
nest-asyncio
|
12 |
boto3
|
13 |
python-dotenv
|
14 |
+
openpyxl
|