i-d-lytvynenko committed on
Commit
bca00b1
·
1 Parent(s): c664621

Short query index

Browse files
Files changed (2) hide show
  1. healthcheck_bm25.py +16 -12
  2. init_bm25.py +37 -49
healthcheck_bm25.py CHANGED
@@ -1,28 +1,32 @@
1
  from pathlib import Path
2
 
3
  from llama_index.retrievers.bm25 import BM25Retriever
4
- from main import extract_court_decision_text
5
 
6
 
7
- PERSIST_PATH = Path("Save_Index_Local")
8
-
9
- INDEX_NAME = "bm25_retriever"
10
- # INDEX_NAME = "bm25_retriever_meta"
11
 
12
- TEST_CD_URL = "https://reyestr.court.gov.ua/Review/118766467"
 
 
13
  # TEST_CD_URL = "https://reyestr.court.gov.ua/Review/118763429"
 
14
 
15
- PRINT_CD = False
 
 
 
 
 
16
 
 
 
17
 
18
- retriever = BM25Retriever.from_persist_dir(str(PERSIST_PATH / INDEX_NAME))
19
 
20
- court_decision_text = extract_court_decision_text(TEST_CD_URL)
21
 
22
- if PRINT_CD:
23
- print(court_decision_text, "\n\n\n\n\n")
24
 
25
- nodes_with_score = retriever.retrieve(court_decision_text)
26
  for index, node_with_score in enumerate(nodes_with_score, start=1):
27
  source_title = node_with_score.node.metadata.get("title", "Невідомий заголовок")
28
  print(index, f"{node_with_score.score:.4f}", source_title, "\n", sep="\t")
 
1
  from pathlib import Path
2
 
3
  from llama_index.retrievers.bm25 import BM25Retriever
 
4
 
5
 
 
 
 
 
6
 
7
# Alternative setup (disabled): build the query from the full text of a court
# decision and run it against the long-text index.
# from main import extract_court_decision_text
# INDEX_NAME = "bm25_retriever_long"
# TEST_CD_URL = "https://reyestr.court.gov.ua/Review/118766467"
# TEST_CD_URL = "https://reyestr.court.gov.ua/Review/118763429"
# query = extract_court_decision_text(TEST_CD_URL)

# Active setup: a short free-text query against the short-query index.
PERSIST_PATH = Path("Save_Index_Local")
INDEX_NAME = "bm25_retriever_short"
PRINT_QUERY = True

query = (
    "Викрадення майна, злочини, пов'язані з порушенням законодавчих норм щодо обігу та використання "
    "документів, печаток, штампів, бланків, а також спеціальних технічних засобів "
    "для отримання інформації та комунікаційних ліній."
)

# The retriever is restored from the directory PERSIST_PATH / INDEX_NAME.
index_dir = PERSIST_PATH / INDEX_NAME
retriever = BM25Retriever.from_persist_dir(str(index_dir))

# Optionally echo the query so the ranking below can be read in context.
if PRINT_QUERY:
    print(query)

# One tab-separated line per hit: 1-based rank, score with 4 decimals, title.
nodes_with_score = retriever.retrieve(query)
for rank, hit in enumerate(nodes_with_score, 1):
    # Hits without a "title" metadata entry fall back to the placeholder text.
    title = hit.node.metadata.get("title", "Невідомий заголовок")
    print(rank, format(hit.score, ".4f"), title, "\n", sep="\t")
init_bm25.py CHANGED
@@ -10,15 +10,21 @@ from llama_index.core.vector_stores.utils import node_to_metadata_dict
10
  from llama_index.retrievers.bm25 import BM25Retriever
11
 
12
 
13
-
14
  PERSIST_PATH = Path("Save_Index_Local")
15
  LP_INFO_FILE = "legal_position_with_categories_documents_all.xlsx"
16
 
17
- INDEX_NAME = "bm25_retriever"
18
- USE_META = False
 
 
 
 
19
 
20
- # INDEX_NAME = "bm25_retriever_meta"
21
- # USE_META = True
 
 
 
22
 
23
 
24
  def clean_string(text: pd.Series):
@@ -69,49 +75,31 @@ def find_matching_pattern(categories):
69
 
70
  final_df = pd.read_excel(LP_INFO_FILE)
71
 
72
- if USE_META:
73
- category_columns = [
74
- col for col in final_df.columns if re.match(r"category_\d+$", col)
75
- ]
76
 
77
- text_columns = ["title", "text_lp", "category_all"] + category_columns
78
- final_df[text_columns] = final_df[text_columns].apply(clean_string)
79
 
80
- final_df["category_search"] = final_df[category_columns].apply(
81
- lambda row: ", ".join([str(val) for val in row if pd.notna(val)]), axis=1
82
- )
83
- final_df["category_filter"] = final_df["category_all"].apply(find_matching_pattern)
84
 
85
- legal_position_title_category = [
86
- Document(
87
- text=row["text_lp"], # type: ignore
88
- metadata={ # type: ignore
89
- "lp_id": row["id"],
90
- "title": row["title"],
91
- "doc_id": row["document_ids"],
92
- "category_filter": find_matching_pattern(row["category_all"]),
93
- "category_search": row["category_search"],
94
- },
95
- excluded_embed_metadata_keys=["doc_id", "category_filter"],
96
- excluded_llm_metadata_keys=["doc_id", "category_filter"],
97
- )
98
- for _, row in final_df.iterrows()
99
- ]
100
- else:
101
- final_df[["title", "text_lp"]] = final_df[["title", "text_lp"]].apply(clean_string)
102
- legal_position_title_category = [
103
- Document(
104
- text=row["text_lp"], # type: ignore
105
- metadata={ # type: ignore
106
- "lp_id": row["id"],
107
- "doc_id": row["document_ids"],
108
- "title": row["title"],
109
- },
110
- excluded_embed_metadata_keys=["lp_id", "doc_id", "title"],
111
- excluded_llm_metadata_keys=["lp_id", "doc_id", "title"],
112
- )
113
- for _, row in final_df.iterrows()
114
- ]
115
 
116
 
117
  ukrainian_stopwords_1 = [
@@ -281,10 +269,10 @@ corpus_tokens = bm25s.tokenize(
281
  )
282
 
283
  existing_bm25 = bm25s.BM25(
284
- k1=1.88,
285
- b=1.25,
286
- delta=0.5,
287
- method="robertson",
288
  # No corpus is saved without this line:
289
  corpus=corpus, # stores metadata and prevents TypeError: 'NoneType' object is not subscriptable
290
  )
 
10
  from llama_index.retrievers.bm25 import BM25Retriever
11
 
12
 
 
13
  PERSIST_PATH = Path("Save_Index_Local")
14
  LP_INFO_FILE = "legal_position_with_categories_documents_all.xlsx"
15
 
16
# Build settings for the BM25 index. Exactly one preset should be active.
# INDEX_NAME is presumably the directory name under PERSIST_PATH where the
# index is stored (healthcheck_bm25.py loads it from PERSIST_PATH / INDEX_NAME)
# - confirm against the persist call in this file.
# k1, b, delta and method are the tuning values meant for the bm25s.BM25(...)
# constructor call further down in this file.

# Preset for the "long" index (disabled).
# NOTE: previously named "bm25_retriever_meta"
# INDEX_NAME = "bm25_retriever_long"
# k1 = 1.88
# b = 1.25
# delta = 0.5
# method = "robertson"

# Preset for the "short" index (active).
INDEX_NAME = "bm25_retriever_short"
k1 = 0.35  # presumably the BM25 term-frequency saturation parameter - confirm in bm25s docs
b = 0.6  # presumably the BM25 document-length normalisation parameter - confirm in bm25s docs
delta = 0.5  # NOTE(review): confirm the bm25s.BM25 call site passes this name as `delta=`
method = "robertson"  # scoring variant name handed to bm25s.BM25
28
 
29
 
30
  def clean_string(text: pd.Series):
 
75
 
76
  final_df = pd.read_excel(LP_INFO_FILE)
77
 
78
+ category_columns = [col for col in final_df.columns if re.match(r"category_\d+$", col)]
 
 
 
79
 
80
+ text_columns = ["title", "text_lp", "category_all"] + category_columns
81
+ final_df[text_columns] = final_df[text_columns].apply(clean_string)
82
 
83
+ final_df["category_search"] = final_df[category_columns].apply(
84
+ lambda row: ", ".join([str(val) for val in row if pd.notna(val)]), axis=1
85
+ )
86
+ final_df["category_filter"] = final_df["category_all"].apply(find_matching_pattern)
87
 
88
+ legal_position_title_category = [
89
+ Document(
90
+ text=row["text_lp"], # type: ignore
91
+ metadata={ # type: ignore
92
+ "lp_id": row["id"],
93
+ "title": row["title"],
94
+ "doc_id": row["document_ids"],
95
+ "category_filter": find_matching_pattern(row["category_all"]),
96
+ "category_search": row["category_search"],
97
+ },
98
+ excluded_embed_metadata_keys=["doc_id", "category_filter"],
99
+ excluded_llm_metadata_keys=["doc_id", "category_filter"],
100
+ )
101
+ for _, row in final_df.iterrows()
102
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
 
105
  ukrainian_stopwords_1 = [
 
269
  )
270
 
271
  existing_bm25 = bm25s.BM25(
272
+ k1=k1,
273
+ b=b,
274
+ delta=b,
275
+ method=method,
276
  # No corpus is saved without this line:
277
  corpus=corpus, # stores metadata and prevents TypeError: 'NoneType' object is not subscriptable
278
  )