# utils/metadata.py
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import re
import dateparser

# Load the NER model and tokenizer
model_name = "Jean-Baptiste/roberta-large-ner-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Build the NER pipeline with entity grouping
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
def clean_text(text):
    """
    Clean contract text for better NER and regex performance.
    """
    # Collapse newlines and repeated whitespace into single spaces
    return re.sub(r"\s+", " ", text).strip()
def extract_effective_date(text):
    """
    Extract a natural-language 'Effective Date' (e.g., 'as of August 28, 2025').
    """
    match = re.search(r"(?i)as of (.+?)(\.|,|\n)", text)
    if match:
        raw_date = match.group(1).strip()
        parsed = dateparser.parse(raw_date)
        if parsed:
            return [parsed.strftime("%Y-%m-%d")]
    return []
def extract_parties(text):
    """
    Extract contracting parties from a 'by and between X and Y' clause.
    """
    pattern = r"(?i)by and between[:\s\n]+(.+?)\s+and\s+(.+?)\s*(\(|\n|$)"
    match = re.search(pattern, text, re.DOTALL)
    if match:
        return [match.group(1).strip(), match.group(2).strip()]
    return []
def extract_governing_law(text):
    """
    Capture the governing law even when it is stated less directly.
    """
    patterns = [
        r"(?i)governed by the laws of ([\w\s,]+)",
        r"(?i)under the laws of ([\w\s,]+)"
    ]
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return [match.group(1).strip()]
    return []
def extract_venue(text):
    """
    Look for the venue in a dispute clause such as 'submitted to ... in XYZ'.
    """
    match = re.search(r"(?i)submitted to.*?in ([\w\s,]+)", text)
    return [match.group(1).strip()] if match else []
def extract_metadata(text):
    """
    Extract full structured metadata using a hybrid rule-based + NER approach.
    """
    if not text.strip():
        return {"error": "No input provided."}
    text = clean_text(text)

    # Chunk by words as a rough guard against the model's 512-token input limit
    max_chunk_length = 512
    words = text.split()
    chunks = [" ".join(words[i:i + max_chunk_length]) for i in range(0, len(words), max_chunk_length)]

    ner_metadata = {
        "EFFECTIVE_DATE": [],
        "PARTIES": [],
        "GOVERNING_LAW": [],
        "JURISDICTION": [],
        "VENUE": []  # initialize VENUE so the rule-based fallback below cannot raise a KeyError
    }
    # Map NER entity groups to metadata fields; CoNLL-style models such as this one
    # typically emit PER / ORG / LOC / MISC, so both short and long spellings are handled.
    label_mapping = {
        "DATE": "EFFECTIVE_DATE",
        "PER": "PARTIES",
        "PERSON": "PARTIES",
        "ORG": "PARTIES",
        "ORGANIZATION": "PARTIES",
        "LOC": "GOVERNING_LAW",
        "LOCATION": "GOVERNING_LAW"
    }
    for chunk in chunks:
        ner_results = ner_pipeline(chunk)
        for ent in ner_results:
            label = ent["entity_group"]
            word = ent["word"]
            custom_label = label_mapping.get(label)
            if custom_label and word not in ner_metadata[custom_label]:
                ner_metadata[custom_label].append(word)
    # Rule-based extraction takes priority; fall back to NER results when a rule finds nothing
    ner_metadata["PARTIES"] = extract_parties(text) or ner_metadata["PARTIES"]
    ner_metadata["EFFECTIVE_DATE"] = extract_effective_date(text) or ner_metadata["EFFECTIVE_DATE"]
    ner_metadata["GOVERNING_LAW"] = extract_governing_law(text) or ner_metadata["GOVERNING_LAW"]
    venue = extract_venue(text)
    ner_metadata["VENUE"] = venue or ner_metadata["VENUE"]
    ner_metadata["JURISDICTION"] = venue or ner_metadata["JURISDICTION"]
    return ner_metadata
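
# --- Usage sketch (illustrative, not part of the original module) ------------
# A minimal example of calling extract_metadata; the sample contract text below
# is an assumed, made-up snippet used only to show the expected output shape.
if __name__ == "__main__":
    sample_contract = (
        "This Agreement is entered into as of August 28, 2025, by and between "
        "Acme Corp and Globex LLC (each a 'Party'). This Agreement shall be "
        "governed by the laws of the State of New York, and any dispute shall "
        "be submitted to the courts located in New York County, New York."
    )
    print(extract_metadata(sample_contract))
    # Expected shape (exact values depend on the regexes and NER output):
    # {"EFFECTIVE_DATE": [...], "PARTIES": [...], "GOVERNING_LAW": [...],
    #  "JURISDICTION": [...], "VENUE": [...]}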