File size: 1,088 Bytes
5207833
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import spacy
from transformers import pipeline
import re
from dateutil.parser import parse

# Entity and date extraction for email text
def extract_entities(email_text, nlp, ner_pipeline, *, min_score=0.75):
    """Extract named entities and calendar dates from an email body.

    Three independent extractors are run over ``email_text`` and their
    results are returned side by side (no merging or de-duplication).

    Args:
        email_text: The text to analyse.
        nlp: Callable that takes the text and returns an object with an
            ``ents`` iterable whose items expose ``text`` and ``label_``
            (presumably a loaded spaCy pipeline).
        ner_pipeline: Callable that takes the text and returns an iterable
            of dicts with ``'word'``, ``'score'`` and either ``'entity'``
            or ``'entity_group'`` (presumably a Hugging Face
            token-classification pipeline).
        min_score: Transformer predictions are kept only when their score
            is strictly greater than this value.

    Returns:
        A dict with three keys:
            ``"spaCy Entities"``: list of ``{"Text", "Type"}`` dicts.
            ``"Transformer Entities"``: list of ``{"Text", "Type", "Score"}``
            dicts that passed the ``min_score`` filter.
            ``"Dates"``: list of ``YYYY-MM-DD`` strings for every
            "Month D, YYYY" style match that is a valid calendar date.

    Raises:
        KeyError: If a transformer prediction has neither an ``'entity'``
            nor an ``'entity_group'`` key, or lacks ``'word'``/``'score'``.
    """
    # Matches e.g. "Jan 5, 2023", "September 21st, 2021"; the comma after
    # the day is required.
    date_pattern = r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:th|st|nd|rd)?,\s+\d{4}\b'

    # Use spaCy for initial extraction
    doc = nlp(email_text)
    spacy_entities = [{"Text": ent.text, "Type": ent.label_} for ent in doc.ents]

    # Use transformer model for refined extraction
    transformer_entities = []
    for ent in ner_pipeline(email_text):
        if not ent['score'] > min_score:
            continue
        # Aggregated pipelines report the label under 'entity_group'
        # instead of 'entity'; accept either.
        label = ent['entity_group'] if 'entity_group' in ent else ent['entity']
        transformer_entities.append({
            "Text": ent['word'],
            "Type": label,
            # Coerce to a built-in float so the result stays
            # JSON-serializable if the pipeline yields NumPy scalars.
            "Score": float(ent['score']),
        })

    # Extract dates using regex
    dates = []
    for raw_date in re.findall(date_pattern, email_text):
        try:
            parsed = parse(raw_date)
        except (ValueError, OverflowError):
            # The regex only checks the shape; strings such as
            # "February 30th, 2023" are not real dates. Skip them rather
            # than failing the whole extraction.
            continue
        dates.append(parsed.strftime('%Y-%m-%d'))

    return {
        "spaCy Entities": spacy_entities,
        "Transformer Entities": transformer_entities,
        "Dates": dates
    }