# synclm-demo / utils/metadata.py
# Provenance: uploaded by SCBconsulting ("Update utils/metadata.py",
# commit 2ceb2ac, verified). Hugging Face page-header text converted to
# comments so this file parses as Python.
# utils/metadata.py
#
# Hybrid contract-metadata extraction: a transformer NER pipeline combined
# with the regex/rule-based extractors defined below.
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import re
import dateparser
# 🧠 Load advanced NER model.
# NOTE(review): this runs at import time — importing the module downloads the
# model on first use and is slow/network-dependent.
model_name = "Jean-Baptiste/roberta-large-ner-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# 🔧 Build NER pipeline with grouping — aggregation_strategy="simple" merges
# subword tokens into whole-word entities carrying an "entity_group" label.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
def clean_text(text):
    """
    🧼 Normalize contract text for better NER and regex performance.

    Collapses every run of whitespace (spaces, tabs, newlines) into a single
    space and strips the ends. The previous chained-``replace`` version was a
    no-op on repeated spaces (it replaced a single space with a single space),
    so runs of whitespace survived into the regex extractors.
    """
    # One pass over the text; \s+ covers newlines too, so the separate
    # "\n" -> " " replace is no longer needed.
    return re.sub(r"\s+", " ", text).strip()
def extract_effective_date(text):
    """
    📅 Extract a natural-language 'Effective Date' (e.g. 'as of August 28, 2025').

    Looks for the phrase 'as of <date>' terminated by '.', ',' or a newline,
    and returns the parsed date as a one-element ``["YYYY-MM-DD"]`` list, or
    an empty list when nothing matches or the date cannot be parsed.
    """
    hit = re.search(r"(?i)as of (.+?)(\.|,|\n)", text)
    if not hit:
        return []
    parsed_date = dateparser.parse(hit.group(1).strip())
    return [parsed_date.strftime("%Y-%m-%d")] if parsed_date else []
def extract_parties(text):
    """
    🧾 Extract the two contracting parties from a 'by and between X and Y' clause.

    Returns ``[party_a, party_b]`` on a match, otherwise an empty list.
    """
    clause = re.search(
        r"(?i)by and between[:\s\n]+(.+?)\s+and\s+(.+?)\s*(\(|\n|$)",
        text,
        re.DOTALL,
    )
    if clause is None:
        return []
    return [clause.group(1).strip(), clause.group(2).strip()]
def extract_governing_law(text):
    """
    ⚖️ Capture the governing law even if it's stated less directly.

    Tries a couple of common phrasings in order and returns a one-element
    list with the captured jurisdiction text, or an empty list if neither
    phrasing appears.
    """
    for law_pattern in (
        r"(?i)governed by the laws of ([\w\s,]+)",
        r"(?i)under the laws of ([\w\s,]+)",
    ):
        hit = re.search(law_pattern, text)
        if hit is not None:
            return [hit.group(1).strip()]
    return []
def extract_venue(text):
    """
    🏛️ Look for a venue in a dispute clause like 'submitted to ... in XYZ'.

    The preposition must be a standalone ``in`` (or ``within``): the old
    pattern's bare ``in `` also matched word endings, so a clause like
    'submitted to the courts of Berlin in Germany' captured 'in Germany'
    instead of 'Germany'.

    Returns a one-element list with the venue text, or an empty list.
    """
    match = re.search(r"(?i)submitted to.*?\b(?:in|within)\b\s+([\w\s,]+)", text)
    return [match.group(1).strip()] if match else []
def extract_metadata(text):
    """
    📦 Extract full structured metadata using hybrid rule-based + NER.

    Returns a dict with keys EFFECTIVE_DATE, PARTIES, GOVERNING_LAW,
    JURISDICTION and VENUE (each a list of strings), or
    ``{"error": "No input provided."}`` when the input is blank.
    """
    if not text.strip():
        return {"error": "No input provided."}
    text = clean_text(text)
    # NER chunking: split on words so each chunk stays near the model's
    # context window. NOTE(review): 512 *words* can exceed 512 *tokens* after
    # subword tokenization — confirm the pipeline truncates gracefully.
    max_chunk_length = 512
    words = text.split()
    chunks = [" ".join(words[i:i + max_chunk_length]) for i in range(0, len(words), max_chunk_length)]
    ner_metadata = {
        "EFFECTIVE_DATE": [],
        "PARTIES": [],
        "GOVERNING_LAW": [],
        "JURISDICTION": [],
        # Fix: VENUE was missing here but read below, raising KeyError
        # whenever extract_venue() returned [].
        "VENUE": []
    }
    # Map pipeline entity groups to our metadata keys. This model emits short
    # tags (PER/ORG/LOC/DATE); the long spellings are kept as well for
    # robustness if the model is swapped.
    label_mapping = {
        "DATE": "EFFECTIVE_DATE",
        "PER": "PARTIES",
        "PERSON": "PARTIES",
        "ORG": "PARTIES",
        "ORGANIZATION": "PARTIES",
        "LOC": "GOVERNING_LAW",
        "LOCATION": "GOVERNING_LAW"
    }
    for chunk in chunks:
        for ent in ner_pipeline(chunk):
            custom_label = label_mapping.get(ent["entity_group"])
            # De-duplicate while preserving first-seen order.
            if custom_label and ent["word"] not in ner_metadata[custom_label]:
                ner_metadata[custom_label].append(ent["word"])
    # 🧠 Rule-based extraction takes precedence; fall back to NER output.
    ner_metadata["PARTIES"] = extract_parties(text) or ner_metadata["PARTIES"]
    ner_metadata["EFFECTIVE_DATE"] = extract_effective_date(text) or ner_metadata["EFFECTIVE_DATE"]
    ner_metadata["GOVERNING_LAW"] = extract_governing_law(text) or ner_metadata["GOVERNING_LAW"]
    venue = extract_venue(text)  # fix: call once instead of twice
    ner_metadata["VENUE"] = venue or ner_metadata["VENUE"]
    ner_metadata["JURISDICTION"] = venue or ner_metadata["JURISDICTION"]
    return ner_metadata