SCBconsulting committed on
Commit
a18ce35
·
verified ·
1 Parent(s): cb4344f

Update utils/metadata.py

Browse files
Files changed (1) hide show
  1. utils/metadata.py +28 -10
utils/metadata.py CHANGED
@@ -1,25 +1,43 @@
1
- from transformers import pipeline
2
 
3
- model = "dslim/bert-base-NER"
4
- ner = pipeline("ner", model=model, tokenizer=model, grouped_entities=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  def extract_metadata(text):
 
 
 
7
  if not text.strip():
8
  return {"error": "No input provided."}
9
 
10
- text = text[:1000]
11
- entities = ner(text)
12
 
13
- result = {
14
  "DATE": [],
15
  "PERSON": [],
16
  "ORGANIZATION": [],
17
  "LOCATION": []
18
  }
19
 
20
- for ent in entities:
21
  label = ent["entity_group"]
22
- if label in result and ent["word"] not in result[label]:
23
- result[label].append(ent["word"])
 
 
24
 
25
- return result
 
1
+ # utils/metadata.py
2
 
3
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
4
+
5
+ # 🧠 More accurate NER model
6
+ model_name = "Jean-Baptiste/roberta-large-ner-english"
7
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
8
+ model = AutoModelForTokenClassification.from_pretrained(model_name)
9
+
10
+ # 🛠️ NER pipeline with aggregation
11
+ ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
12
+
13
def clean_text(text):
    """
    Normalize contract text before it is passed to the NER pipeline.

    Every run of whitespace (newlines, tabs, repeated or non-breaking
    spaces) is collapsed to a single space and both ends are trimmed.

    Args:
        text: Raw contract text.

    Returns:
        The text on a single line with single-space separators.
    """
    # str.split() with no argument splits on any run of Unicode whitespace,
    # so this also removes the double spaces that merely swapping "\n" for
    # " " would leave behind.
    return " ".join(text.split())
18
 
19
def extract_metadata(text):
    """
    Extract named entities from contract text using the NER pipeline.

    Args:
        text: Contract text. None, empty or whitespace-only input is
            rejected.

    Returns:
        {"error": "No input provided."} when there is no usable input,
        otherwise a dict with the keys "DATE", "PERSON", "ORGANIZATION" and
        "LOCATION", each holding the unique entity strings found for that
        category in order of first appearance.
    """
    if not text or not text.strip():
        return {"error": "No input provided."}

    # CoNLL-style NER checkpoints report short tags (PER/ORG/LOC), while this
    # function exposes long category names; without the mapping no entity
    # would ever match a key of the result dict.
    label_aliases = {
        "PER": "PERSON",
        "ORG": "ORGANIZATION",
        "LOC": "LOCATION",
    }

    # NOTE(review): the configured model's card lists only PER/ORG/LOC/MISC,
    # so "DATE" presumably stays empty with this checkpoint - confirm. The
    # key is kept so the returned shape does not change for callers.
    metadata = {
        "DATE": [],
        "PERSON": [],
        "ORGANIZATION": [],
        "LOCATION": [],
    }

    for ent in ner_pipeline(clean_text(text)):
        raw_label = ent["entity_group"]
        label = label_aliases.get(raw_label, raw_label)
        # Aggregated words can carry tokenizer padding (e.g. a leading
        # space); strip it so duplicates compare equal.
        word = ent["word"].strip()

        if label in metadata and word and word not in metadata[label]:
            metadata[label].append(word)

    return metadata