Spaces:
Runtime error
Runtime error
Commit
·
bca00b1
1
Parent(s):
c664621
Short query index
Browse files
- healthcheck_bm25.py +16 -12
- init_bm25.py +37 -49
healthcheck_bm25.py
CHANGED
@@ -1,28 +1,32 @@
|
|
1 |
from pathlib import Path
|
2 |
|
3 |
from llama_index.retrievers.bm25 import BM25Retriever
|
4 |
-
from main import extract_court_decision_text
|
5 |
|
6 |
|
7 |
-
PERSIST_PATH = Path("Save_Index_Local")
|
8 |
-
|
9 |
-
INDEX_NAME = "bm25_retriever"
|
10 |
-
# INDEX_NAME = "bm25_retriever_meta"
|
11 |
|
12 |
-
|
|
|
|
|
13 |
# TEST_CD_URL = "https://reyestr.court.gov.ua/Review/118763429"
|
|
|
14 |
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
16 |
|
|
|
|
|
17 |
|
18 |
-
retriever = BM25Retriever.from_persist_dir(str(PERSIST_PATH / INDEX_NAME))
|
19 |
|
20 |
-
|
21 |
|
22 |
-
if
|
23 |
-
print(
|
24 |
|
25 |
-
nodes_with_score = retriever.retrieve(
|
26 |
for index, node_with_score in enumerate(nodes_with_score, start=1):
|
27 |
source_title = node_with_score.node.metadata.get("title", "Невідомий заголовок")
|
28 |
print(index, f"{node_with_score.score:.4f}", source_title, "\n", sep="\t")
|
|
|
1 |
from pathlib import Path
|
2 |
|
3 |
from llama_index.retrievers.bm25 import BM25Retriever
|
|
|
4 |
|
5 |
|
|
|
|
|
|
|
|
|
6 |
|
7 |
+
# Smoke test for a persisted BM25 index: load it, run one query and print the
# ranked hits.
#
# Alternative configuration kept for reference (query text taken from a court
# decision page via a helper in `main`):
# from main import extract_court_decision_text
# INDEX_NAME = "bm25_retriever_long"
# TEST_CD_URL = "https://reyestr.court.gov.ua/Review/118766467"
# TEST_CD_URL = "https://reyestr.court.gov.ua/Review/118763429"
# query = extract_court_decision_text(TEST_CD_URL)

# Active configuration: the "short" index with a hand-written query.
INDEX_NAME = "bm25_retriever_short"
query = (
    "Викрадення майна, злочини, пов'язані з порушенням законодавчих норм щодо обігу та використання "
    "документів, печаток, штампів, бланків, а також спеціальних технічних засобів "
    "для отримання інформації та комунікаційних ліній."
)

PERSIST_PATH = Path("Save_Index_Local")
PRINT_QUERY = True  # echo the query before the results


# The index is stored in a sub-directory of PERSIST_PATH named after the index.
index_dir = PERSIST_PATH / INDEX_NAME
retriever = BM25Retriever.from_persist_dir(str(index_dir))

if PRINT_QUERY:
    print(query)

nodes_with_score = retriever.retrieve(query)
# One tab-separated line per hit: rank (1-based), score to 4 decimals, title.
for rank, hit in enumerate(nodes_with_score, start=1):
    title = hit.node.metadata.get("title", "Невідомий заголовок")
    score_text = f"{hit.score:.4f}"
    print(rank, score_text, title, "\n", sep="\t")
|
init_bm25.py
CHANGED
@@ -10,15 +10,21 @@ from llama_index.core.vector_stores.utils import node_to_metadata_dict
|
|
10 |
from llama_index.retrievers.bm25 import BM25Retriever
|
11 |
|
12 |
|
13 |
-
|
14 |
PERSIST_PATH = Path("Save_Index_Local")
|
15 |
LP_INFO_FILE = "legal_position_with_categories_documents_all.xlsx"
|
16 |
|
17 |
-
|
18 |
-
|
|
|
|
|
|
|
|
|
19 |
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
22 |
|
23 |
|
24 |
def clean_string(text: pd.Series):
|
@@ -69,49 +75,31 @@ def find_matching_pattern(categories):
|
|
69 |
|
70 |
final_df = pd.read_excel(LP_INFO_FILE)
|
71 |
|
72 |
-
if
|
73 |
-
category_columns = [
|
74 |
-
col for col in final_df.columns if re.match(r"category_\d+$", col)
|
75 |
-
]
|
76 |
|
77 |
-
|
78 |
-
|
79 |
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
else:
|
101 |
-
final_df[["title", "text_lp"]] = final_df[["title", "text_lp"]].apply(clean_string)
|
102 |
-
legal_position_title_category = [
|
103 |
-
Document(
|
104 |
-
text=row["text_lp"], # type: ignore
|
105 |
-
metadata={ # type: ignore
|
106 |
-
"lp_id": row["id"],
|
107 |
-
"doc_id": row["document_ids"],
|
108 |
-
"title": row["title"],
|
109 |
-
},
|
110 |
-
excluded_embed_metadata_keys=["lp_id", "doc_id", "title"],
|
111 |
-
excluded_llm_metadata_keys=["lp_id", "doc_id", "title"],
|
112 |
-
)
|
113 |
-
for _, row in final_df.iterrows()
|
114 |
-
]
|
115 |
|
116 |
|
117 |
ukrainian_stopwords_1 = [
|
@@ -281,10 +269,10 @@ corpus_tokens = bm25s.tokenize(
|
|
281 |
)
|
282 |
|
283 |
existing_bm25 = bm25s.BM25(
|
284 |
-
k1=
|
285 |
-
b=
|
286 |
-
delta=
|
287 |
-
method=
|
288 |
# No corpus is saved without this line:
|
289 |
corpus=corpus, # stores metadata and prevents TypeError: 'NoneType' object is not subscriptable
|
290 |
)
|
|
|
10 |
from llama_index.retrievers.bm25 import BM25Retriever
|
11 |
|
12 |
|
|
|
13 |
# Directory that holds persisted indexes, and the spreadsheet read as input.
PERSIST_PATH = Path("Save_Index_Local")
LP_INFO_FILE = "legal_position_with_categories_documents_all.xlsx"

# Alternative preset, kept for reference.
# NOTE: this index was previously named "bm25_retriever_meta".
# INDEX_NAME = "bm25_retriever_long"
# k1 = 1.88
# b = 1.25
# delta = 0.5
# method = "robertson"

# Active preset: index name plus the BM25 hyperparameters used to build it.
INDEX_NAME = "bm25_retriever_short"
method = "robertson"
k1 = 0.35
b = 0.6
delta = 0.5
|
28 |
|
29 |
|
30 |
def clean_string(text: pd.Series):
|
|
|
75 |
|
76 |
# Load the legal-position spreadsheet; each row becomes one Document below.
final_df = pd.read_excel(LP_INFO_FILE)

# Columns whose name is exactly "category_<digits>".
category_columns = [col for col in final_df.columns if re.match(r"category_\d+$", col)]

# Run clean_string over every text column (applied column by column).
text_columns = ["title", "text_lp", "category_all"] + category_columns
final_df[text_columns] = final_df[text_columns].apply(clean_string)

# Comma-separated string of the row's non-missing category_<n> values.
final_df["category_search"] = final_df[category_columns].apply(
    lambda row: ", ".join(str(val) for val in row if pd.notna(val)), axis=1
)
# Computed once per row here and reused in the metadata below, instead of
# calling find_matching_pattern a second time for every row.
final_df["category_filter"] = final_df["category_all"].apply(find_matching_pattern)

legal_position_title_category = [
    Document(
        text=row["text_lp"],  # type: ignore
        metadata={  # type: ignore
            "lp_id": row["id"],
            "title": row["title"],
            "doc_id": row["document_ids"],
            "category_filter": row["category_filter"],
            "category_search": row["category_search"],
        },
        # doc_id and category_filter are excluded from the embedding and LLM
        # metadata views.
        excluded_embed_metadata_keys=["doc_id", "category_filter"],
        excluded_llm_metadata_keys=["doc_id", "category_filter"],
    )
    for _, row in final_df.iterrows()
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
|
104 |
|
105 |
ukrainian_stopwords_1 = [
|
|
|
269 |
)
|
270 |
|
271 |
# Build the BM25 index from the module-level hyperparameters.
existing_bm25 = bm25s.BM25(
    k1=k1,
    b=b,
    # Pass the dedicated module-level `delta` setting. The previous `delta=b`
    # ignored it and silently reused the value of `b`.
    delta=delta,
    method=method,
    # No corpus is saved without this line:
    corpus=corpus,  # stores metadata and prevents TypeError: 'NoneType' object is not subscriptable
)
|