# email_parser / app.py
# jeanpoll
# first working version of app
# 79e12fd
# raw
# history blame
# 4.39 kB
import logging, regex
import gradio
from email_parser import utils, nlp
from email_parser.doc_email import Email
def print_highlighted_text(text, df_result, offset=0):
    """Split ``text`` on whitespace and pair every word with an entity label.

    A word runs from the end of the previous whitespace character to the
    start of the next one (or to the end of ``text``). The whitespace
    characters themselves are not emitted, and two consecutive whitespace
    characters produce an empty word between them.

    A word is labelled with the ``entity`` value of the first row of
    ``df_result`` whose ``start``/``end`` range fully contains the word;
    when no row matches, the label is ``None``.

    Args:
        text: String to split into words.
        df_result: DataFrame with ``start``, ``end`` and ``entity`` columns.
        offset: Value added to every word position before it is compared
            with ``start``/``end``; used when ``text`` is a slice that does
            not begin at position 0 of the string ``df_result`` refers to.

    Returns:
        A list of ``(word, entity)`` tuples in order of appearance.
    """
    list_values = []
    start_pos = 0
    # Raw string literal: "\s" in a plain literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    for match in regex.finditer(r"\s|$", text):
        end_pos = match.start()
        word = text[start_pos:end_pos]
        # Keep only the first entity whose span covers the whole word.
        df_entity = df_result.query(
            f"{start_pos + offset}>=start & {end_pos + offset}<=end"
        ).head(1)
        entity = df_entity["entity"].values[0] if len(df_entity) == 1 else None
        list_values.append((word, entity))
        # The next word starts right after the separator just consumed.
        start_pos = match.end()
    return list_values
def display_email(text, part=1):
    """Extract one email from a thread and prepare its highlighted parts.

    Args:
        text: Raw text of the email thread, passed to ``Email``.
        part: 1-based number of the email to display within the thread.

    Returns:
        A 5-tuple ``(error, lang, header_words, body_words, signature_words)``.
        On success ``error`` is ``None``, ``lang`` is the value returned by
        ``nlp.f_detect_language`` for the selected body, and the three lists
        contain the ``(word, entity)`` pairs built by
        ``print_highlighted_text`` (an empty list when the email has no
        header or no signature was detected). When ``part`` is outside
        ``1..number of emails``, ``error`` is a message and the four other
        items are ``None``.
    """
    doc = Email(text)
    list_emails = doc.list_emails
    nb_emails = len(list_emails)

    if part > nb_emails:
        return f"Email number {int(part)} was requested but only {nb_emails} emails was found in this thread", \
               None, None, None, None
    if part < 1:
        # Without this guard a value of 0 or below would be used as a
        # negative list index and silently select an email counted from the
        # end of the thread.
        return f"Email number {int(part)} was requested but email numbers start at 1", \
               None, None, None, None

    email = list_emails[int(part - 1)]
    body_text = email["body"]
    header = email["header"]
    lang = nlp.f_detect_language(body_text)

    if len(header) > 0:
        df_results_header = nlp.f_ner(header, lang=lang)
        # NOTE(review): presumably adds person entities found in the header
        # fields to the NER result -- confirm in Email.f_find_person_in_header.
        df_results_header = Email.f_find_person_in_header(header, df_result=df_results_header)
        list_words_headers = print_highlighted_text(header, df_results_header)
    else:
        list_words_headers = []

    df_result = nlp.f_ner(body_text, lang=lang)
    df_signature = nlp.f_detect_email_signature(body_text, df_ner=df_result)
    if df_signature is not None and len(df_signature) > 0:
        # Everything from the first detected signature position onwards is
        # displayed as the signature; the text before it is the body.
        start_signature_position = df_signature["start"].values[0]
        text_body = body_text[:start_signature_position]
        text_signature = body_text[start_signature_position:]
        # The entity positions refer to the full body text, hence the offset.
        list_words_signature = print_highlighted_text(
            text_signature, df_result, offset=start_signature_position
        )
    else:
        text_body = body_text
        list_words_signature = []

    list_words_body = print_highlighted_text(text_body, df_result)
    return None, lang, list_words_headers, list_words_body, list_words_signature
# Set up the project logger before the interface is built and launched.
utils.f_setup_logger(level_sysout=logging.ERROR, level_file=logging.INFO, folder_path="logs")

# Text shown under the title of the web page.
_APP_DESCRIPTION = (
    "Small application that can extract a specific email in a thread of email,"
    " highlights the entities found in the text (person, organization, date,...)"
    " and extract email signature if any."
)

# Single email written in French.
_FRENCH_EXAMPLE = """Bonjour Vincent,
Merci de m’avoir rappelé hier.
Seriez vous disponible pour un rendez vous la semaine prochaine?
Merci,
Jean-Baptiste"""

# Thread of two emails: a reply followed by the quoted original message.
_THREAD_EXAMPLE = """Hello Jack,
I hope you had nice holiday as well.
Please find attached the requested documents,
Best Regards,
George
Vice president of Something
email: [email protected]
tel: 512-222-5555
On Mon, Jan 7, 2022 at 12:39 PM, Jack <[email protected]> wrote:
Hello George,
I wish you a happy new year. I hope you had nice holidays.
Did you see Garry during your vacation?
Do you have the documents I requested earlier?
Thanks,
Jack
"""

# Inputs map to display_email(text, part); outputs map, in order, to the
# 5-tuple it returns (error, language, header, body, signature).
iface = gradio.Interface(
    title="Parser of email",
    description=_APP_DESCRIPTION,
    fn=display_email,
    inputs=[
        "textbox",
        gradio.inputs.Number(default=1, label="Email number in thread"),
    ],
    outputs=[
        gradio.outputs.Textbox(type="str", label="Error"),
        gradio.outputs.Textbox(type="str", label="Language"),
        gradio.outputs.HighlightedText(label="Header"),
        gradio.outputs.HighlightedText(label="Body"),
        gradio.outputs.HighlightedText(label="Signature"),
    ],
    # Each example is [thread text, email number]; the thread example is
    # offered twice to show both of its emails.
    examples=[
        [_FRENCH_EXAMPLE, 1],
        [_THREAD_EXAMPLE, 1],
        [_THREAD_EXAMPLE, 2],
    ],
)
iface.launch()