Update utils/metadata.py
utils/metadata.py  CHANGED  (+44 -26)
@@ -2,64 +2,84 @@
 
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 import re
+import dateparser
 
 # 🧠 Load advanced NER model
 model_name = "Jean-Baptiste/roberta-large-ner-english"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForTokenClassification.from_pretrained(model_name)
 
-#
+# 🧠 Build NER pipeline with grouping
 ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
 
-
 def clean_text(text):
     """
-    Clean contract text for
+    🧼 Clean contract text for better NER and regex performance.
     """
     return text.replace("\n", " ").replace("  ", " ").strip()
 
+def extract_effective_date(text):
+    """
+    📅 Extract natural language 'Effective Date' (e.g., 'as of August 28, 2025').
+    """
+    match = re.search(r"(?i)as of (.+?)(\.|,|\n)", text)
+    if match:
+        raw_date = match.group(1).strip()
+        parsed = dateparser.parse(raw_date)
+        if parsed:
+            return [parsed.strftime("%Y-%m-%d")]
+    return []
+
+def extract_parties(text):
+    """
+    🧾 Extract contracting parties using 'by and between X and Y'.
+    """
+    pattern = r"(?i)by and between[:\s\n]+(.+?)\s+and\s+(.+?)\s*(\(|\n|$)"
+    match = re.search(pattern, text, re.DOTALL)
+    if match:
+        return [match.group(1).strip(), match.group(2).strip()]
+    return []
 
 def extract_governing_law(text):
     """
-
+    ⚖️ Look for 'governed by the laws of XYZ'.
     """
     match = re.search(r"(?i)governed by the laws of ([\w\s,]+)", text)
-    return match.group(1).strip() if match else
-
+    return [match.group(1).strip()] if match else []
 
 def extract_venue(text):
     """
-
+    🏛️ Look for venue in dispute clause like 'submitted to ... in XYZ'.
     """
     match = re.search(r"(?i)submitted to.*?in ([\w\s,]+)", text)
-    return match.group(1).strip() if match else
-
+    return [match.group(1).strip()] if match else []
 
 def extract_metadata(text):
     """
-    Extract
+    📦 Extract full structured metadata using hybrid rule-based + NER.
     """
     if not text.strip():
         return {"error": "No input provided."}
 
     text = clean_text(text)
-
+
+    # NER chunking
+    max_chunk_length = 512
     words = text.split()
     chunks = [" ".join(words[i:i + max_chunk_length]) for i in range(0, len(words), max_chunk_length)]
 
-
+    ner_metadata = {
         "EFFECTIVE_DATE": [],
         "PARTIES": [],
         "GOVERNING_LAW": [],
         "VENUE": []
     }
 
-    # NER label mapping → custom keys
     label_mapping = {
         "DATE": "EFFECTIVE_DATE",
         "PERSON": "PARTIES",
         "ORGANIZATION": "PARTIES",
-        "LOCATION": "GOVERNING_LAW"
+        "LOCATION": "GOVERNING_LAW"
     }
 
     for chunk in chunks:
@@ -68,15 +88,13 @@ def extract_metadata(text):
             label = ent["entity_group"]
             word = ent["word"]
             custom_label = label_mapping.get(label)
-            if custom_label and word not in
-
-
-    # 🔧
-
-
-
-
-
-
-
-    return metadata
+            if custom_label and word not in ner_metadata[custom_label]:
+                ner_metadata[custom_label].append(word)
+
+    # 🔧 Replace/enhance with rule-based extraction
+    ner_metadata["PARTIES"] = extract_parties(text) or ner_metadata["PARTIES"]
+    ner_metadata["EFFECTIVE_DATE"] = extract_effective_date(text) or ner_metadata["EFFECTIVE_DATE"]
+    ner_metadata["GOVERNING_LAW"] = extract_governing_law(text) or ner_metadata["GOVERNING_LAW"]
+    ner_metadata["VENUE"] = extract_venue(text) or ner_metadata["VENUE"]
+
+    return ner_metadata
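For quick local verification, here is a minimal usage sketch of the updated extractor. It is not part of the commit: the sample contract text, the expected-result comment, and the import path (it assumes the Space's utils/ directory is importable as a package) are illustrative assumptions, and running it requires transformers, torch, and dateparser plus a download of the Jean-Baptiste/roberta-large-ner-english model.

# Illustrative usage sketch (not part of the commit).
# Assumes utils/ is an importable package and the NER model can be downloaded.
from utils.metadata import extract_metadata

sample_contract = (
    "This Agreement is entered into as of 28 August 2025, by and between "
    "Acme Corporation and Globex LLC (each a 'Party'). This Agreement shall be "
    "governed by the laws of the State of Delaware, and any dispute shall be "
    "submitted to arbitration in Wilmington, Delaware."
)

# Returns a dict with EFFECTIVE_DATE, PARTIES, GOVERNING_LAW and VENUE lists;
# exact values depend on the regex matches and the NER model's predictions.
print(extract_metadata(sample_contract))

The `or` fallbacks at the end of extract_metadata mean the rule-based extractors take precedence whenever their patterns match, with the chunked NER output kept only as a backstop when a pattern finds nothing.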