SCBconsulting committed on
Commit
38670e5
·
verified ·
1 Parent(s): bcc8f3b

Update utils/metadata.py

Browse files
Files changed (1) hide show
  1. utils/metadata.py +44 -26
utils/metadata.py CHANGED
@@ -2,64 +2,84 @@
2
 
3
  from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
4
  import re
 
5
 
6
  # 🧠 Load advanced NER model
7
  model_name = "Jean-Baptiste/roberta-large-ner-english"
8
  tokenizer = AutoTokenizer.from_pretrained(model_name)
9
  model = AutoModelForTokenClassification.from_pretrained(model_name)
10
 
11
- # 🛠️ NER pipeline with aggregation
12
  ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
13
 
14
-
15
  def clean_text(text):
16
  """
17
- Clean contract text for more accurate NER.
18
  """
19
  return text.replace("\n", " ").replace(" ", " ").strip()
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  def extract_governing_law(text):
23
  """
24
- Rule-based extractor for governing law clause.
25
  """
26
  match = re.search(r"(?i)governed by the laws of ([\w\s,]+)", text)
27
- return match.group(1).strip() if match else None
28
-
29
 
30
  def extract_venue(text):
31
  """
32
- Rule-based extractor for venue / jurisdiction clause.
33
  """
34
  match = re.search(r"(?i)submitted to.*?in ([\w\s,]+)", text)
35
- return match.group(1).strip() if match else None
36
-
37
 
38
  def extract_metadata(text):
39
  """
40
- Extract contract metadata using NER and rule-based matchers.
41
  """
42
  if not text.strip():
43
  return {"error": "No input provided."}
44
 
45
  text = clean_text(text)
46
- max_chunk_length = 512 # safe for transformer models
 
 
47
  words = text.split()
48
  chunks = [" ".join(words[i:i + max_chunk_length]) for i in range(0, len(words), max_chunk_length)]
49
 
50
- metadata = {
51
  "EFFECTIVE_DATE": [],
52
  "PARTIES": [],
53
  "GOVERNING_LAW": [],
54
  "VENUE": []
55
  }
56
 
57
- # NER label mapping → custom keys
58
  label_mapping = {
59
  "DATE": "EFFECTIVE_DATE",
60
  "PERSON": "PARTIES",
61
  "ORGANIZATION": "PARTIES",
62
- "LOCATION": "GOVERNING_LAW" # may include cities/states
63
  }
64
 
65
  for chunk in chunks:
@@ -68,15 +88,13 @@ def extract_metadata(text):
68
  label = ent["entity_group"]
69
  word = ent["word"]
70
  custom_label = label_mapping.get(label)
71
- if custom_label and word not in metadata[custom_label]:
72
- metadata[custom_label].append(word)
73
-
74
- # 🧠 Override/inject rule-based values
75
- law = extract_governing_law(text)
76
- venue = extract_venue(text)
77
- if law and law not in metadata["GOVERNING_LAW"]:
78
- metadata["GOVERNING_LAW"].append(law)
79
- if venue and venue not in metadata["VENUE"]:
80
- metadata["VENUE"].append(venue)
81
-
82
- return metadata
 
2
 
3
  from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
4
  import re
5
+ import dateparser
6
 
7
  # 🧠 Load advanced NER model
8
  model_name = "Jean-Baptiste/roberta-large-ner-english"
9
  tokenizer = AutoTokenizer.from_pretrained(model_name)
10
  model = AutoModelForTokenClassification.from_pretrained(model_name)
11
 
12
+ # 🔧 Build NER pipeline with grouping
13
  ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
14
 
 
15
  def clean_text(text):
16
  """
17
+ 🧼 Clean contract text for better NER and regex performance.
18
  """
19
  return text.replace("\n", " ").replace(" ", " ").strip()
20
 
21
+ def extract_effective_date(text):
22
+ """
23
+ 📅 Extract natural language 'Effective Date' (e.g., 'as of August 28, 2025').
24
+ """
25
+ match = re.search(r"(?i)as of (.+?)(\.|,|\n)", text)
26
+ if match:
27
+ raw_date = match.group(1).strip()
28
+ parsed = dateparser.parse(raw_date)
29
+ if parsed:
30
+ return [parsed.strftime("%Y-%m-%d")]
31
+ return []
32
+
33
+ def extract_parties(text):
34
+ """
35
+ 🧾 Extract contracting parties using 'by and between X and Y'.
36
+ """
37
+ pattern = r"(?i)by and between[:\s\n]+(.+?)\s+and\s+(.+?)\s*(\(|\n|$)"
38
+ match = re.search(pattern, text, re.DOTALL)
39
+ if match:
40
+ return [match.group(1).strip(), match.group(2).strip()]
41
+ return []
42
 
43
  def extract_governing_law(text):
44
  """
45
+ ⚖️ Look for 'governed by the laws of XYZ'.
46
  """
47
  match = re.search(r"(?i)governed by the laws of ([\w\s,]+)", text)
48
+ return [match.group(1).strip()] if match else []
 
49
 
50
  def extract_venue(text):
51
  """
52
+ πŸ›οΈ Look for venue in dispute clause like 'submitted to ... in XYZ'.
53
  """
54
  match = re.search(r"(?i)submitted to.*?in ([\w\s,]+)", text)
55
+ return [match.group(1).strip()] if match else []
 
56
 
57
  def extract_metadata(text):
58
  """
59
+ 📦 Extract full structured metadata using hybrid rule-based + NER.
60
  """
61
  if not text.strip():
62
  return {"error": "No input provided."}
63
 
64
  text = clean_text(text)
65
+
66
+ # NER chunking
67
+ max_chunk_length = 512
68
  words = text.split()
69
  chunks = [" ".join(words[i:i + max_chunk_length]) for i in range(0, len(words), max_chunk_length)]
70
 
71
+ ner_metadata = {
72
  "EFFECTIVE_DATE": [],
73
  "PARTIES": [],
74
  "GOVERNING_LAW": [],
75
  "VENUE": []
76
  }
77
 
 
78
  label_mapping = {
79
  "DATE": "EFFECTIVE_DATE",
80
  "PERSON": "PARTIES",
81
  "ORGANIZATION": "PARTIES",
82
+ "LOCATION": "GOVERNING_LAW"
83
  }
84
 
85
  for chunk in chunks:
 
88
  label = ent["entity_group"]
89
  word = ent["word"]
90
  custom_label = label_mapping.get(label)
91
+ if custom_label and word not in ner_metadata[custom_label]:
92
+ ner_metadata[custom_label].append(word)
93
+
94
+ # 🧠 Replace/enhance with rule-based extraction
95
+ ner_metadata["PARTIES"] = extract_parties(text) or ner_metadata["PARTIES"]
96
+ ner_metadata["EFFECTIVE_DATE"] = extract_effective_date(text) or ner_metadata["EFFECTIVE_DATE"]
97
+ ner_metadata["GOVERNING_LAW"] = extract_governing_law(text) or ner_metadata["GOVERNING_LAW"]
98
+ ner_metadata["VENUE"] = extract_venue(text) or ner_metadata["VENUE"]
99
+
100
+ return ner_metadata