"""Build and persist a BM25 retriever over Ukrainian legal-position documents.

Pipeline: read legal positions from an Excel file, normalize the text columns,
wrap each row in a llama-index ``Document`` with search/filter metadata,
tokenize the corpus with Ukrainian stopwords, build a ``bm25s`` index, persist
it to disk, and reload it as a sanity check.
"""

import re
import unicodedata
from pathlib import Path

import bm25s
import pandas as pd
from llama_index.core import Document
from llama_index.core.schema import MetadataMode
from llama_index.core.vector_stores.utils import node_to_metadata_dict
from llama_index.retrievers.bm25 import BM25Retriever

PERSIST_PATH = Path("Save_Index_Local")
LP_INFO_FILE = "legal_position_with_categories_documents_all.xlsx"

# NOTE: previously named "bm25_retriever_meta"
# INDEX_NAME = "bm25_retriever_long"
# k1 = 1.88
# b = 1.25
# delta = 0.5
# method = "robertson"

# NOTE(review): the constant says "LONG" but the persisted value is
# "..._short" — confirm which index variant this run is meant to produce.
INDEX_NAME_BM25_LONG = "bm25_retriever_short"
# BM25 hyperparameters (robertson variant; delta is only consumed by the
# bm25+/bm25l methods but is kept for easy switching).
k1 = 0.35
b = 0.6
delta = 0.5
method = "robertson"


def clean_string(text: pd.Series) -> pd.Series:
    """Normalize a text column: quotes, whitespace, and unicode forms.

    Args:
        text: Series of raw strings (NaN allowed; replaced by "").

    Returns:
        Cleaned Series with guillemets -> ASCII quotes, NBSP -> space,
        section sign -> numero sign, NFKC-normalized characters, and the
        fraction slash replaced by "/".
    """
    text = text.fillna("")
    text = text.str.replace(r"«|»", '"', regex=True)
    # Non-raw "\xa0" so the literal NBSP character is replaced. The original
    # raw string r"\xa0" under pandas' default regex=False only matched the
    # four-character sequence backslash-x-a-0, which never occurs.
    text = text.str.replace("\xa0", " ")
    text = text.str.replace("§", "№")
    # NFKC also folds unicode fractions and compatibility characters.
    text = text.apply(lambda t: unicodedata.normalize("NFKC", t))  # type: ignore
    text = text.str.replace("⁄", "/")
    return text


def find_matching_pattern(categories):
    """Return the first court-chamber pattern found in *categories*.

    Args:
        categories: A string, a list of strings, or a list of lists of
            strings to search in.

    Returns:
        str: The first matching pattern, or "" if none matches.
    """
    patterns = [
        "Велика Палата",
        "Касаційний кримінальний суд",
        "Касаційний адміністративний суд",
        "Касаційний господарський суд",
        "Касаційний цивільний суд",
    ]
    # Handle both string and list inputs
    if isinstance(categories, str):
        categories = [categories]
    elif isinstance(categories, list):
        # If list contains lists, flatten it
        categories = [item for sublist in categories for item in sublist]
    # Search for patterns
    for category in categories:
        for pattern in patterns:
            if pattern in category:
                return pattern
    return ""


final_df = pd.read_excel(LP_INFO_FILE)

# Columns named category_1, category_2, ... hold the per-row category labels.
category_columns = [col for col in final_df.columns if re.match(r"category_\d+$", col)]
text_columns = ["title", "text_lp", "category_all"] + category_columns
final_df[text_columns] = final_df[text_columns].apply(clean_string)

# Concatenate all non-null category labels into one searchable string.
final_df["category_search"] = final_df[category_columns].apply(
    lambda row: ", ".join([str(val) for val in row if pd.notna(val)]), axis=1
)
final_df["category_filter"] = final_df["category_all"].apply(find_matching_pattern)

legal_position_title_category = [
    Document(
        text=row["text_lp"],  # type: ignore
        metadata={  # type: ignore
            "lp_id": row["id"],
            "title": row["title"],
            "doc_id": row["document_ids"],
            "category_filter": find_matching_pattern(row["category_all"]),
            "category_search": row["category_search"],
        },
        # doc_id / category_filter are for filtering only, not retrieval text.
        excluded_embed_metadata_keys=["doc_id", "category_filter"],
        excluded_llm_metadata_keys=["doc_id", "category_filter"],
    )
    for _, row in final_df.iterrows()
]

ukrainian_stopwords_1 = [
    "я", "ти", "він", "вона", "воно", "ми", "ви", "вони", "це", "той",
    "така", "таке", "такі", "цей", "моя", "твоя", "його", "її", "наш",
    "ваш", "їх", "де", "чи", "а", "але", "і", "або", "так", "ні", "чи",
    "в", "на", "з", "до", "під", "через", "після", "між", "серед", "без",
    "для", "про", "о", "за", "від", "до", "як", "якби", "коли", "де",
    "тому", "тому що", "що", "чому", "хто", "що", "якось", "коли-небудь",
    "де-небудь", "чимало",
]

ukrainian_stopwords_2 = [
    # Articles
    "і", "й", "у", "в", "та", "і",
    # Pronouns
    "я", "ти", "він", "вона", "воно", "ми", "ви", "вони",
    "мене", "тебе", "його", "її", "нас", "вас", "їх",
    "мій", "твій", "наш", "ваш", "свій",
    # Prepositions
    "з", "до", "від", "біля", "над", "під", "через", "для", "без",
    "між", "серед", "крізь", "понад", "поза", "крім",
    # Conjunctions
    "та", "і", "але", "або", "однак", "проте", "тому", "тому що",
    "оскільки", "якщо", "коли", "хоча",
    # Auxiliary words
    "так", "ні", "не", "бути", "мати", "можна", "треба",
    # Common filler words
    "цей", "той", "це", "те", "такий", "який", "котрий",
    # Modal words
    "мабуть", "напевно", "звичайно", "можливо",
    # Particles
    "ось", "ніби", "майже", "майже що", "саме", "лише", "тільки",
]

# The two lists overlap heavily; the set() dedupes them.
ukrainian_stopwords = list(set(ukrainian_stopwords_1 + ukrainian_stopwords_2))

# Identity stemmer: tokens are indexed as-is (no Ukrainian stemmer is wired in).
stemmer = lambda words: words

# Copied from BM25Retriever __init__ method, but note that output looks awful
# and might work worse (this needs checking).
corpus = [node_to_metadata_dict(node) for node in legal_position_title_category]
corpus_tokens = bm25s.tokenize(
    [
        node.get_content(metadata_mode=MetadataMode.EMBED)
        for node in legal_position_title_category
    ],
    stopwords=ukrainian_stopwords,
    stemmer=stemmer,
)

existing_bm25 = bm25s.BM25(
    k1=k1,
    b=b,
    # BUG FIX: was `delta=b` (a typo passing 0.6); pass the delta constant.
    delta=delta,
    method=method,
    # No corpus is saved without this line: stores metadata and prevents
    # TypeError: 'NoneType' object is not subscriptable
    corpus=corpus,
)
existing_bm25.index(corpus=corpus_tokens)

bm25_retriever = BM25Retriever(
    stemmer=stemmer,  # prevents the use of default stemmer
    existing_bm25=existing_bm25,
    similarity_top_k=20,
)
bm25_retriever.persist(str(PERSIST_PATH / INDEX_NAME_BM25_LONG))

# Returns an error on invalid corpus
loaded_retriever = BM25Retriever.from_persist_dir(str(PERSIST_PATH / INDEX_NAME_BM25_LONG))