# utils/metadata.py
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import re
import dateparser

# 🧠 Load advanced NER model
model_name = "Jean-Baptiste/roberta-large-ner-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# 🔧 Build NER pipeline with entity grouping
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


def clean_text(text):
    """
    🧼 Clean contract text for better NER and regex performance.
    """
    # Collapse newlines and repeated whitespace into single spaces.
    return re.sub(r"\s+", " ", text).strip()


def extract_effective_date(text):
    """
    📅 Extract a natural-language 'Effective Date' (e.g., 'as of August 28, 2025').
    """
    match = re.search(r"(?i)as of (.+?)(\.|,|\n)", text)
    if match:
        raw_date = match.group(1).strip()
        parsed = dateparser.parse(raw_date)
        if parsed:
            return [parsed.strftime("%Y-%m-%d")]
    return []


def extract_parties(text):
    """
    🧾 Extract contracting parties from a 'by and between X and Y' clause.
    """
    pattern = r"(?i)by and between[:\s\n]+(.+?)\s+and\s+(.+?)\s*(\(|\n|$)"
    match = re.search(pattern, text, re.DOTALL)
    if match:
        return [match.group(1).strip(), match.group(2).strip()]
    return []


def extract_governing_law(text):
    """
    ⚖️ Capture the governing law even when it is stated less directly.
    """
    patterns = [
        r"(?i)governed by the laws of ([\w\s,]+)",
        r"(?i)under the laws of ([\w\s,]+)",
    ]
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return [match.group(1).strip()]
    return []


def extract_venue(text):
    """
    🏛️ Look for the venue in a dispute clause such as 'submitted to ... in XYZ'.
    """
    match = re.search(r"(?i)submitted to.*?in ([\w\s,]+)", text)
    return [match.group(1).strip()] if match else []


def extract_metadata(text):
    """
    📦 Extract full structured metadata using hybrid rule-based + NER extraction.
    """
    if not text.strip():
        return {"error": "No input provided."}

    text = clean_text(text)

    # Split the text into word-based chunks so each NER call stays within the
    # model's context window. NOTE: this counts words, not tokens, so with
    # subword tokenization a 512-word chunk can still exceed the 512-token
    # limit; a smaller chunk size or tokenizer-based chunking may be needed.
    max_chunk_length = 512
    words = text.split()
    chunks = [" ".join(words[i:i + max_chunk_length]) for i in range(0, len(words), max_chunk_length)]

    ner_metadata = {
        "EFFECTIVE_DATE": [],
        "PARTIES": [],
        "GOVERNING_LAW": [],
        "JURISDICTION": [],
        "VENUE": [],
    }

    # Map entity labels onto contract metadata fields. This checkpoint's
    # aggregated groups use short tags (PER, ORG, LOC); the long forms are
    # kept as a fallback for other NER models.
    label_mapping = {
        "DATE": "EFFECTIVE_DATE",
        "PER": "PARTIES",
        "PERSON": "PARTIES",
        "ORG": "PARTIES",
        "ORGANIZATION": "PARTIES",
        "LOC": "GOVERNING_LAW",
        "LOCATION": "GOVERNING_LAW",
    }

    for chunk in chunks:
        ner_results = ner_pipeline(chunk)
        for ent in ner_results:
            label = ent["entity_group"]
            word = ent["word"]
            custom_label = label_mapping.get(label)
            if custom_label and word not in ner_metadata[custom_label]:
                ner_metadata[custom_label].append(word)

    # 🧠 Prefer rule-based extraction; fall back to NER results when a rule finds nothing.
    ner_metadata["PARTIES"] = extract_parties(text) or ner_metadata["PARTIES"]
    ner_metadata["EFFECTIVE_DATE"] = extract_effective_date(text) or ner_metadata["EFFECTIVE_DATE"]
    ner_metadata["GOVERNING_LAW"] = extract_governing_law(text) or ner_metadata["GOVERNING_LAW"]
    venue = extract_venue(text)
    ner_metadata["VENUE"] = venue or ner_metadata["VENUE"]
    ner_metadata["JURISDICTION"] = venue or ner_metadata["JURISDICTION"]

    return ner_metadata
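

# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal, hedged example of how extract_metadata() might be exercised from
# the command line. The sample contract text below is hypothetical and not
# part of the module; it only shows the expected shape of the returned
# dictionary (lists of strings keyed by metadata field).
if __name__ == "__main__":
    import json

    sample_contract = (
        "This Services Agreement is entered into as of August 28, 2025, "
        "by and between Acme Corp and Globex LLC (each a 'Party'). "
        "This Agreement shall be governed by the laws of the State of New York, "
        "and any dispute shall be submitted to the courts in New York County."
    )

    # Pretty-print the extracted metadata for quick manual inspection.
    print(json.dumps(extract_metadata(sample_contract), indent=2))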