Spaces: jeanpoll — Running
Commit 79e12fd (committed by jeanpoll) • 1 Parent(s): ba8d0da
first working version of app
Files changed:
- .gitignore +144 -0
- Untitled.ipynb +275 -0
- app.py +123 -4
- email_parser/__init__.py +0 -0
- email_parser/_models_signatures.py +184 -0
- email_parser/config.ini +7 -0
- email_parser/doc_email.py +142 -0
- email_parser/models/model_signature_lstm_v10/keras_metadata.pb +3 -0
- email_parser/models/model_signature_lstm_v10/minmax_scaler.p +0 -0
- email_parser/models/model_signature_lstm_v10/saved_model.pb +3 -0
- email_parser/models/model_signature_lstm_v10/standard_scaler.p +0 -0
- email_parser/models/model_signature_lstm_v10/variables/variables.data-00000-of-00001 +0 -0
- email_parser/models/model_signature_lstm_v10/variables/variables.index +0 -0
- email_parser/nlp.py +322 -0
- email_parser/utils.py +74 -0
- setup.py +26 -0
.gitignore
ADDED
@@ -0,0 +1,144 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Pycharm
.idea/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# additional stuff
logs/
Untitled.ipynb
ADDED
@@ -0,0 +1,275 @@
Notebook contents (code cells with their recorded outputs; kernel: Python 3, Python 3.7.10, nbformat 4.5):

# Cell 1 (not executed)
%config Completer.use_jedi = False
%load_ext autoreload
%autoreload 2

# Cell 2 (execution count 1)
import tensorflow
import regex

# Cell 3 (execution count 2)
from transformers import pipeline

# Cell 4 (execution count 3)
from email_parser import nlp

# Cell 5 (execution count 4)
text = """tel: 512 222 5555"""

# Cell 6 (execution count 5)
lang = nlp.f_detect_language(text)
lang
# Out[5]: 'en'

# Cell 7 (execution count 6)
df_result = nlp.f_ner(text, lang=lang)
df_result
# Out[6]:
#   entity         value  start  end  score
# 0    TEL  512 222 5555      5   17      1

# Cell 8 (empty)

# Cell 9 (execution count 16)
nlp.f_detect_email_signature(text, lang="fr")
# Out[16]:
#       entity value  start  end     score
# 0  SIGNATURE    JB    119  122  0.955208

# Cell 10 (execution count 33)
iter_match = regex.finditer("\s|$", text)
list_values = []
start_pos = 0
for match in iter_match:
    word = match.string[start_pos:match.start()]

    df_entity = df_result.query(f"start>={start_pos} & end<={match.start()}").head(1)
    if len(df_entity)==1:
        entity = df_entity["entity"].values[0]
    else:
        entity = None
# list_values
    list_values.append((word, entity))
    start_pos = match.end()
list_values
# Out[33]: [('je', None), ("m'appelle", None), ('Jean-Baptiste', 'PER')]

# Cell 11 (empty)
app.py
CHANGED
@@ -1,7 +1,126 @@
Removed (the previous Gradio "greet" demo):

import gradio as gr

def greet(name):
    return "Hello " + name + "!!"

iface = gr.Interface(fn=greet, inputs="text", outputs="text")

New content of app.py:

import logging, regex
import gradio
from email_parser import utils, nlp
from email_parser.doc_email import Email

def print_highlighted_text(text, df_result, offset=0):
    iter_match = regex.finditer("\s|$", text)
    start_pos = 0
    list_values = []
    for match in iter_match:
        word = match.string[start_pos:match.start()]

        df_entity = df_result.query(f"{start_pos + offset}>=start & {match.start() + offset}<=end").head(1)
        if len(df_entity) == 1:
            entity = df_entity["entity"].values[0]
        else:
            entity = None
        list_values.append((word, entity))
        # list_values.append((match.string[match.start():match.end()], None))
        start_pos = match.end()
    return list_values


def display_email(text, part=1):
    doc = Email(text)
    list_emails = doc.list_emails
    if part <= len(list_emails):
        text = list_emails[int(part-1)]["body"]
        header = list_emails[int(part-1)]["header"]
        lang = nlp.f_detect_language(text)

        if len(header) > 0:
            df_results_header = nlp.f_ner(header, lang=lang)
            df_results_header = Email.f_find_person_in_header(header, df_result=df_results_header)
            list_words_headers = print_highlighted_text(header, df_results_header)
        else:
            list_words_headers = []

        df_result = nlp.f_ner(text, lang=lang)
        df_signature = nlp.f_detect_email_signature(text, df_ner=df_result)
        if df_signature is not None and len(df_signature) > 0:
            start_signature_position = df_signature["start"].values[0]
            text_body = text[:start_signature_position]
            text_signature = text[start_signature_position:]
            list_words_signature = print_highlighted_text(text_signature, df_result, offset=start_signature_position)
        else:
            text_body = text
            list_words_signature = []
        list_words_body = print_highlighted_text(text_body, df_result)

        return None, lang, list_words_headers, list_words_body, list_words_signature
    else:
        return f"Email number {int(part)} was requested but only {len(list_emails)} emails were found in this thread", \
               None, None, None, None


utils.f_setup_logger(level_sysout=logging.ERROR, level_file=logging.INFO, folder_path="logs")


iface = gradio.Interface(title="Parser of email",
                         description="Small application that can extract a specific email in a thread of emails,"
                                     " highlight the entities found in the text (person, organization, date, ...)"
                                     " and extract the email signature if any.",
                         fn=display_email,
                         inputs=["textbox",
                                 gradio.inputs.Number(default=1, label="Email number in thread")],
                         outputs=[
                             gradio.outputs.Textbox(type="str", label="Error"),
                             gradio.outputs.Textbox(type="str", label="Language"),
                             gradio.outputs.HighlightedText(label="Header"),
                             gradio.outputs.HighlightedText(label="Body"),
                             gradio.outputs.HighlightedText(label="Signature")],
                         examples=[["""Bonjour Vincent,
Merci de m’avoir rappelé hier.
Seriez vous disponible pour un rendez vous la semaine prochaine?
Merci,
Jean-Baptiste""", 1], ["""Hello Jack,

I hope you had nice holiday as well.
Please find attached the requested documents,

Best Regards,
George
Vice president of Something
email: [email protected]
tel: 512-222-5555

On Mon, Jan 7, 2022 at 12:39 PM, Jack <[email protected]> wrote:

Hello George,

I wish you a happy new year. I hope you had nice holidays.
Did you see Garry during your vacation?
Do you have the documents I requested earlier?

Thanks,
Jack


""", 1], ["""Hello Jack,

I hope you had nice holiday as well.
Please find attached the requested documents,

Best Regards,
George
Vice president of Something
email: [email protected]
tel: 512-222-5555

On Mon, Jan 7, 2022 at 12:39 PM, Jack <[email protected]> wrote:

Hello George,

I wish you a happy new year. I hope you had nice holidays.
Did you see Garry during your vacation?
Do you have the documents I requested earlier?

Thanks,
Jack


""", 2]])


iface.launch()
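The (word, entity) pairs built by print_highlighted_text are the format Gradio's HighlightedText output consumes: a list of (token, label) tuples, where a None label is rendered as plain, unhighlighted text. A purely illustrative sketch of that shape (invented values, not part of the commit):

# Illustrative only: shape of the data handed to gradio.outputs.HighlightedText.
# Each element is (word, entity_label_or_None); None renders without highlighting.
list_words_body = [
    ("tel:", None),
    ("512-222-5555", "TEL"),
    ("George", "PER"),
]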
email_parser/__init__.py
ADDED
File without changes
email_parser/_models_signatures.py
ADDED
@@ -0,0 +1,184 @@
import logging
import pandas as pd
import numpy as np
import regex
import os
import configparser
from sentence_transformers import SentenceTransformer
from scipy.spatial import distance
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from tensorflow import keras
import pickle

from . import nlp, utils

config = configparser.ConfigParser()
config.read(os.path.join(os.path.dirname(__file__), 'config.ini'))


model_name = config["DEFAULT"]["name_model_signature"]

model = keras.models.load_model(filepath=utils.get_model_full_path(model_name))
minmax_scaler = pickle.load(open(utils.get_model_full_path(model_name + "/minmax_scaler.p"), "rb"))
standard_scaler = pickle.load(open(utils.get_model_full_path(model_name + "/standard_scaler.p"), "rb"))


list_name_columns_features = ["line_number",
                              "text",
                              "start",
                              "end",
                              "PER", "ORG", "LOC", "DATE", "TEL", "EMAIL", "WEB",
                              "SIGNATURE",
                              "word_count",
                              "inv_distance_to_merci",
                              "inv_distance_to_cordlt",
                              "inv_distance_to_regards",
                              "inv_distance_to_sincerely",
                              "inv_distance_to_sent_from",
                              "start_with_ps", "position_line",
                              "special_characters_count", "empty_chars_with_prev_line"]

list_columns_used_in_model = ["PER", "ORG", "LOC", "DATE", "TEL", "EMAIL",
                              # "WEB",
                              "word_count",
                              "inv_distance_to_merci",
                              "inv_distance_to_cordlt",
                              # "inv_distance_to_regards",
                              "inv_distance_to_sincerely",
                              "inv_distance_to_sent_from",
                              "start_with_ps",
                              "position_line",
                              "special_characters_count",
                              "empty_chars_with_prev_line"]

columns_to_scale_minmax = ["PER", "ORG", "LOC", "DATE", "TEL", "EMAIL", "WEB", "position_line",
                           "empty_chars_with_prev_line",
                           "inv_distance_to_merci",
                           "inv_distance_to_cordlt",
                           "inv_distance_to_regards",
                           "inv_distance_to_sincerely",
                           "inv_distance_to_sent_from",
                           "start_with_ps"
                           ]

columns_to_scale_standard = ["word_count", "special_characters_count"]


def f_retrieve_entities_for_line(df_ner, start=0, end=1e12):
    """Retrieve all entities in the previously computed dataframe for a specific line

    Args:
        df_ner: dataframe containing found entities
        start: start position of the line in original text
        end: end position of the line in original text

    """

    if len(df_ner) > 0:
        df = df_ner.query(f"""(start>= {start} and end <= {end}) or (start<={start} and end>={end})""")
        return df


embedder_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")


def f_create_embedding_inv_dist_feature(text1, text2):
    """ Computing distance between two texts based on their embedding
    provided by the SentenceTransformer above"""
    embedding_merci = embedder_model.encode(text1)
    embedding_line = embedder_model.encode(text2)
    dist = distance.cosine(embedding_merci, embedding_line)
    return 1 / (dist + 0.01)


def f_create_email_lines_features(text, df_ner=None, position_offset=0):
    list_lines = nlp.f_split_text_by_lines(text, position_offset)
    list_features_vectors = []
    if df_ner is None:
        df_ner = nlp.f_ner(text)

    for line_number in range(0, len(list_lines)):
        list_features_vectors.append(f_create_line_features(list_lines, line_number, df_ner))

    df_features = pd.DataFrame(list_features_vectors, columns=list_name_columns_features)

    return df_features


def f_create_line_features(list_lines, line_number, df_ner):
    current_line = list_lines[line_number]
    total_lines = len(list_lines)
    features_vector = [line_number, current_line[2], current_line[0], current_line[1]]
    logging.debug(f"Creating line features for {current_line}")
    df_ner_line = f_retrieve_entities_for_line(df_ner=df_ner, start=current_line[0], end=current_line[1])

    # Adding entity counts to feature vector
    for entity in ["PER", "ORG", "LOC", "DATE", "TEL", "EMAIL", "WEB", "SIGNATURE"]:
        value = len(df_ner_line.query(f"entity=='{entity}'")) if df_ner_line is not None else 0
        features_vector.append(value)
    # Adding word count
    features_vector.append(len(current_line[2].split()))
    # distance to greeting word "merci"
    features_vector.append(f_create_embedding_inv_dist_feature("merci", current_line[2].lower()))

    # distance to greeting word "cordialement"
    features_vector.append(f_create_embedding_inv_dist_feature("cordialement", current_line[2].lower()))

    # distance to greeting word "regards"
    features_vector.append(f_create_embedding_inv_dist_feature("regards", current_line[2].lower()))

    # distance to greeting word "sincerely"
    features_vector.append(f_create_embedding_inv_dist_feature("sincerely", current_line[2].lower()))

    # distance to words "sent from"
    features_vector.append(f_create_embedding_inv_dist_feature("sent from", current_line[2].lower()))

    # Line starts with "ps:"
    features_vector.append(regex.match(r"\s*ps *:", current_line[2], flags=regex.IGNORECASE) is not None)

    # Adding position of line in email
    position_in_email = (line_number + 1) / total_lines
    features_vector.append(position_in_email)
    # Adding special character count
    special_char_count = len(regex.findall(r"[^\p{L}0-9 .,\n]", current_line[2]))
    features_vector.append(special_char_count)
    # Number of empty chars with previous line
    empty_chars_with_prev_line = 0 if line_number == 0 else current_line[0] - list_lines[line_number - 1][1]
    features_vector.append(empty_chars_with_prev_line)
    return features_vector


def generate_x_y(df, minmax_scaler=None, standard_scaler=None, n_last_lines_to_keep=30,
                 list_columns=list_columns_used_in_model):
    df, minmax_scaler, standard_scaler = f_scale_parameters(df, minmax_scaler, standard_scaler)
    x = df[list_columns].to_numpy()[-n_last_lines_to_keep:, :]
    x = np.expand_dims(x, axis=0)
    y = df["is_signature"].to_numpy()[-n_last_lines_to_keep:]
    y = np.expand_dims(y, axis=0)
    return x, y, minmax_scaler, standard_scaler


def f_scale_parameters(df_tagged_data, minmax_scaler=None, standard_scaler=None):
    # df_tagged_data = df_tagged_data.copy(deep=True)
    if minmax_scaler is None:
        logging.debug("fitting new min max scaler")
        minmax_scaler = MinMaxScaler()
        df_tagged_data.loc[:, columns_to_scale_minmax] = minmax_scaler.fit_transform(
            df_tagged_data[columns_to_scale_minmax])
    else:
        logging.debug("using already fitted minmax scaler")
        df_tagged_data.loc[:, columns_to_scale_minmax] = minmax_scaler.transform(
            df_tagged_data[columns_to_scale_minmax])

    if standard_scaler is None:
        logging.debug("fitting new standard scaler")
        standard_scaler = StandardScaler()
        df_tagged_data.loc[:, columns_to_scale_standard] = standard_scaler.fit_transform(
            df_tagged_data[columns_to_scale_standard])
    else:
        logging.debug("using already fitted scaler")
        df_tagged_data.loc[:, columns_to_scale_standard] = standard_scaler.transform(
            df_tagged_data[columns_to_scale_standard])
    return df_tagged_data, minmax_scaler, standard_scaler
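For orientation, a rough usage sketch of the feature pipeline above (hypothetical input text; assumes the package and its bundled model files are installed). generate_x_y scales the per-line features and reshapes the last n_last_lines_to_keep lines into a single batch for the signature model:

# Hypothetical sketch, mirroring what f_detect_email_signature in nlp.py does.
from email_parser import _models_signatures, nlp

text = "Thanks,\nJohn Smith\ntel: 512-222-5555"
df_features = _models_signatures.f_create_email_lines_features(text, df_ner=nlp.f_ner(text))
df_features["is_signature"] = -2   # dummy label so the training-time helper can be reused
x, _, _, _ = _models_signatures.generate_x_y(df_features,
                                             _models_signatures.minmax_scaler,
                                             _models_signatures.standard_scaler)
print(x.shape)   # roughly (1, min(len(df_features), 30), len(_models_signatures.list_columns_used_in_model))
y_predict = _models_signatures.model.predict(x)   # per-line signature probabilities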
email_parser/config.ini
ADDED
@@ -0,0 +1,7 @@
[DEFAULT]
ner_model_fr = Jean-Baptiste/camembert-ner-with-dates
ner_model_en = Jean-Baptiste/roberta-large-ner-english
device = -1
default_lang = en
name_model_signature = model_signature_lstm_v10
path_models = models
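These keys are read with configparser by nlp.py and utils.py below; a minimal sketch of how they are consumed (the relative path assumes you run it from the repository root), with device = -1 keeping the transformers pipeline on CPU:

# Sketch of how config.ini is read elsewhere in this commit (see email_parser/nlp.py and utils.py).
import configparser, os

config = configparser.ConfigParser()
config.read(os.path.join("email_parser", "config.ini"))

device = int(config["DEFAULT"]["device"])        # -1 -> CPU for the transformers pipeline
model_name = config["DEFAULT"]["ner_model_en"]   # Hugging Face model id used for English NER
print(device, model_name)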
email_parser/doc_email.py
ADDED
@@ -0,0 +1,142 @@
import regex
import pandas as pd

from . import nlp


class Email:

    def __init__(self,
                 raw_text):
        """ Constructor for email
        :param raw_text: raw text of email
        """
        self.raw_text = raw_text
        self.list_emails = self.f_split_email_thread()

    def f_split_email_thread(self):
        """ Function to split a thread of emails into a list of individual emails.

        Two main formats of header are recognized:

        1) Multi-line header similar to
           De : sads Cadsfdsf [mailto:[email protected]]
           Envoyé : 30 mars 2015 08:33
           À : asdsad, sadsadasd (CA - asdasd)
           Objet : Re: TR: sadasdasdsad sa dsa
        2) Le 2015-03-30 à 08:25, Luc, Archambault (CA - Drummondville) <[email protected]> a écrit :

        Returns:
            list of dict. Dict contains for each email: (body, header, start, start_header, date, lang)

        """

        pattern = r"(((\n{1}\t*|\n(-{4,}.*-{4,}\s*)|^)(([> *]*(de|from|Exp.diteur|Subject)[\s]*:).*(\n[^A-Z].*)?[\r\n\t\s,]{1,}){1,})(([> *\t]*[\p{L}\p{M}' -]*[\s]*:).*((\n[ ]{3,7}?.*|(\n<.*))*)[\r\n\t\s,]{1,3}?){2,}" \
                  r"|(\s*((((de|from|Exp.diteur|Subject)[\s]*:).{0,200}?[\r\n\t\s,]{1,}){1})(?!de)(((envoy.|.|to|date).?[\s]*:).*?){1,}(((objet|subject)[\s]*:).*?[!?.><,]){1})" \
                  r"|((?<=\n)(([ >\t]*)(le|on|el).{0,30}\d{4,}.{0,100}\n*.{0,100}(wrote|.crit|escribió)\s*:))" \
                  r"|(\b(le|on)\s*((\d{2,4}[- ]){3}|(\d{1,2}.{1,8}\d{4}))[^\n]*?(wrote|.crit)\s*:)" \
                  r"|$)"

        results = regex.finditer(pattern, self.raw_text, flags=regex.IGNORECASE)
        start_of_current_header = 0
        end_of_current_header = 0
        part_email = 1

        if results is not None:
            list_email = []

            for result in results:

                start_of_next_header = result.start()

                # if header_group is not None and full_email[0:header_group.start()].lstrip() == "":
                if start_of_current_header != end_of_current_header:
                    header = self.raw_text[start_of_current_header: end_of_current_header]
                    body = self.raw_text[end_of_current_header:start_of_next_header]

                    start = end_of_current_header
                    start_header = start_of_current_header

                # Case where no header was found (either last email of thread or regex didn't find it)
                else:
                    header = ""
                    body = self.raw_text[end_of_current_header:start_of_next_header]
                    start = end_of_current_header
                    start_header = start_of_current_header

                # we detect language for each email of the thread and default to detected thread language otherwise
                # We detect only on first 150 characters
                lang = nlp.f_detect_language(body[:150])

                if body.strip() != "" or header != "":
                    list_email.append({"body": body,
                                       "header": header,
                                       "start": start,
                                       "start_header": start_header,
                                       "lang": lang,
                                       "part": part_email
                                       })
                    part_email += 1
                # previous_from_tag = current_from_tag
                start_of_current_header = result.start()
                end_of_current_header = result.end()

            return list_email
        # Case where mail is not a thread
        else:
            return [{"body": self.raw_text,
                     "header": "",
                     "start": 0}]

    @staticmethod
    def f_find_person_in_header(header, df_result=pd.DataFrame()):
        results = []
        dict_header = Email.f_split_email_headers(header)
        for key in ["to", "cc", "from"]:
            if key in dict_header.keys():
                line_header = dict_header[key][0]
                start_posit = dict_header[key][1]
                pattern_person = r"(?<=\s|'|^)[\p{L}\p{M}\s,-]{2,}(?=[\s;']|$)"
                list_results = regex.finditer(pattern_person, line_header, flags=regex.IGNORECASE)
                for match in list_results:
                    value = match.group()
                    if value.strip() != "":
                        start = match.start()
                        end = match.end()
                        results.append(["PER",
                                        value,
                                        start_posit + start,
                                        start_posit + end,
                                        1
                                        ])
        df_result = nlp.f_concat_results(df_result, results)
        return df_result

    @staticmethod
    def f_split_email_headers(header):
        """ Split headers into from/to/date, ... in a dictionary

        Args:
            header:

        Returns:

        """
        matching_header_keywords = {"à": "to",
                                    "Destinataire": "to",
                                    "de": "from",
                                    "envoyé": "date",
                                    "sent": "date",
                                    "objet": "subject"}
        dict_results = {}
        pattern = r"((?<=\s|^)(à|À|a\p{M}|Cc|To|De|From|Envoy.|Date|Sent|Objet|Subject|Destinataire)\s?:)[ ]*((.*?)[ ]*((\n[ ]{3,7}?.*)*))(?=[\p{L}\p{M}]*\s{1,}:| > |\n|$)"
        list_results = regex.finditer(pattern, header, flags=regex.IGNORECASE)
        for match in list_results:
            key_word = match.group(2).strip().lower()
            key_word_matched = matching_header_keywords.get(key_word)
            dict_results[key_word_matched if not key_word_matched is None else key_word] = [match.group(3),
                                                                                            match.span(3)[0],
                                                                                            match.span(3)[1]]
        return dict_results
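A hypothetical usage sketch of the thread splitter above (the thread text is invented; it assumes the package and its models are installed, since the constructor calls nlp.f_detect_language on each part):

# Hypothetical example of Email.f_split_email_thread via the constructor.
from email_parser.doc_email import Email

thread = """Thanks, see you tomorrow.
John

On Mon, Jan 7, 2022 at 12:39 PM, Jack <[email protected]> wrote:

Hi John, are we still meeting tomorrow?
Jack
"""

doc = Email(thread)
for part in doc.list_emails:
    # each dict carries body, header, start, start_header, lang and part
    print(part["part"], part["lang"], repr(part["header"][:40]))
    print(part["body"].strip()[:60])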
email_parser/models/model_signature_lstm_v10/keras_metadata.pb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1df1ebcda9b9f2ca0855f67117d5c8b7db0d89c46c346273a536f2eec13c5665
size 22060
email_parser/models/model_signature_lstm_v10/minmax_scaler.p
ADDED
Binary file (1.16 kB)
email_parser/models/model_signature_lstm_v10/saved_model.pb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a28bac82659a6bc1cf949dc04d01a09db681cab64c9388ff1267d53fa3d11fb2
size 5272723
email_parser/models/model_signature_lstm_v10/standard_scaler.p
ADDED
Binary file (584 Bytes)
email_parser/models/model_signature_lstm_v10/variables/variables.data-00000-of-00001
ADDED
Binary file (116 kB)
email_parser/models/model_signature_lstm_v10/variables/variables.index
ADDED
Binary file (3.48 kB)
email_parser/nlp.py
ADDED
@@ -0,0 +1,322 @@
import logging
import os
import regex
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
import pandas as pd
import numpy as np

from . import utils, _models_signatures
from .utils import timing
from langid.langid import LanguageIdentifier
from langid.langid import model as model_langid

# Creating language_identifier object for usage in function f_detect_language
language_identifier = LanguageIdentifier.from_modelstring(model_langid, norm_probs=True)
language_identifier.set_languages(['en', 'fr'])


logging.info(f"Reading config file from folder:{os.path.join(os.path.dirname(__file__))}")

config = utils.f_read_config(os.path.join(os.path.dirname(__file__), 'config.ini'))

device = int(config["DEFAULT"]["device"])
default_lang = config["DEFAULT"]["default_lang"]

tokenizer_dict = {}
models_dict = {}
nlp_dict = {}


dict_regex_pattern = dict(EMAIL=r'[\p{L}\p{M}\-\d._]{1,}@[\p{L}\p{M}\d\-_]{1,}(\.[\p{L}\p{M}]{1,}){1,}',
                          TEL=r'(?<!\d)(\+?\d{1,2}[ -]?)?\(?\d{3}\)?[ .-]?\d{3}[ .-]?\d{4}(?!\d|\p{P}\d)',
                          POST=r'\b([A-z][0-9][A-z][ -]?[0-9][A-z][0-9]|[A-z][0-9][A-z])\b',
                          PRICE=r"(([\s:,]|^){1}\$*(CA|CAD|USD|EUR|GBP|\$|\€|\£|\¢){1}\$*[\d., ]*[\d]{1,}\b)" +
                                "|([\d]{1,}[\d., ]*(CA|CAD|USD|EUR|GBP|\$|\€|\£|k|m|\¢){1,}\$*(?=\s|\p{P}|$))",
                          WEB=r"((www(\.[\p{L}\p{M}\-0-9]]{1,}){2,})" +
                              "|(https?:[^ ]*)" +
                              # r"|(([\p{L}\p{M}\.]{3,}){2,})|"
                              r"|((?<=[\s:]|^)([\p{L}\p{M}\-0-9]{1,}\.){1,}(com|ca|org|fr){1,}\b))")
# WEB=r"(http(s)?:\/\/)?[a-z0-9]{1}[a-z0-9-._~]+[.]{1}(com|ca)(?![\p{L}\p{M}])")


def f_load_tokenizer_and_model_for_nlp(model_name, pipeline_type='ner'):
    """
    Loading model and tokenizer takes a long time.
    We do it once and store the model and tokenizer in global dicts for next usage
    Args:
        model_name: Name of the model that should be loaded and stored
        pipeline_type: type of pipeline that should be initialized

    Returns: tokenizer, model

    """
    global tokenizer_dict, models_dict, nlp_dict
    auto_model = None
    if pipeline_type == "ner":
        auto_model = AutoModelForTokenClassification

    if model_name not in tokenizer_dict.keys() or model_name not in models_dict.keys() or model_name not in nlp_dict.keys():
        logging.info(
            f"Loading tokenizer and model: {model_name}")
        tokenizer_dict[model_name] = AutoTokenizer.from_pretrained(model_name)
        # , add_prefix_space = True
        models_dict[model_name] = auto_model.from_pretrained(model_name)
        if pipeline_type == 'ner':
            nlp_dict[model_name] = pipeline(pipeline_type, model=models_dict[model_name], tokenizer=tokenizer_dict[model_name],
                                            aggregation_strategy="simple", device=device)


def f_ner(text, lang=default_lang):
    df_result = f_ner_regex(text)
    df_result = f_ner_model(text, lang=lang, df_result=df_result)
    return df_result


@timing
def f_ner_model(text, lang=default_lang, df_result=pd.DataFrame()):
    list_result = []
    # We split the text by sentence and run model on each one
    sentence_tokenizer = f_split_text_by_lines(text)
    for start, end, value in sentence_tokenizer:
        if value != "":
            results = f_ner_model_by_sentence(value, lang=lang, pos_offset=start)
            if len(results) != 0:
                list_result += results
    return f_concat_results(df_result, list_result)


@timing
def f_ner_model_by_sentence(sentence, lang=default_lang, df_result=pd.DataFrame(), pos_offset=0):
    """ Run ner algorithm

    Args:
        sentence : sentence on which to run model
        lang : lang to determine which model to use
        df_result : If results of f_ner should be combined with previous values
            (in this case we will keep the previous values if tags overlapped)

    Returns:
        Dataframe with identified entities

    """

    if not config.has_option('DEFAULT', 'ner_model_' + lang):
        raise ValueError(f"No model was defined for ner in {lang}")

    model_name = config['DEFAULT']['ner_model_' + lang]
    f_load_tokenizer_and_model_for_nlp(model_name)
    logging.debug(f"starting {model_name} on sentence:'{sentence}'")

    results = nlp_dict[model_name](sentence)
    list_result = []
    for result in results:
        if result["word"] != "" and result['entity_group'] in ["PER", "LOC", "ORG", "DATE"]:

            # Required because sometimes spaces are included in result["word"] value, but not in start/end position
            value = sentence[result["start"]:result["end"]]

            # We remove any special character at the beginning
            pattern = r"[^.,'’` \":()\n].*"
            result_regex = regex.search(pattern, value, flags=regex.IGNORECASE)

            if result_regex is not None:
                word_raw = result_regex.group()
                word = word_raw
                real_word_start = result["start"] + result_regex.start()
                real_word_end = result["start"] + result_regex.start() + len(word_raw)
                # We check if entity might be inside a longer word, if this is the case we ignore it
                letter_before = sentence[max(0, real_word_start - 1): real_word_start]
                letter_after = sentence[real_word_end: min(len(sentence), real_word_end + 1)]
                if regex.match(r"[A-z]", letter_before) or regex.match(r"[A-z]", letter_after):
                    logging.debug(f"Ignoring entity {value} because letter before is"
                                  f" '{letter_before}' or letter after is '{letter_after}'")
                    continue

                list_result.append(
                    [result["entity_group"],
                     word,
                     real_word_start + pos_offset,
                     real_word_end + pos_offset,
                     result["score"]])

    return list_result


@timing
def f_concat_results(df_result, list_result_new):
    """ Merge results between existing dataframe and a list of new values

    Args:
        df_result: dataframe of entities
        list_result_new: list of new entities to be added in df_result

    Returns:
        Dataframe with all entities. Entities in list_result_new that were overlapping the position of another entity in
        df_result are ignored.

    """
    # If df_result and list_result_new are both empty, we return an empty dataframe
    list_columns_names = ["entity", "value", "start", "end", "score"]
    if (df_result is None or len(df_result) == 0) and (list_result_new is None or len(list_result_new) == 0):
        return pd.DataFrame()
    elif len(list_result_new) > 0:
        if df_result is None or len(df_result) == 0:
            return pd.DataFrame(list_result_new,
                                columns=list_columns_names)
        list_row = []
        for row in list_result_new:
            df_intersect = df_result.query("({1}>=start and {0}<=end)".format(row[2], row[3]))
            if len(df_intersect) == 0:
                list_row.append(row)
        df_final = pd.concat([df_result,
                              pd.DataFrame(list_row,
                                           columns=list_columns_names)],
                             ignore_index=True) \
            .sort_values(by="start")
        return df_final
    else:
        # If list_result_new was empty we just return df_result
        return df_result


@timing
def f_detect_language(text, default=default_lang):
    """ Detect language

    Args:
        text: text on which language should be detected
        default: default value if there is an error or the score of the predicted value is too low (default nlp.default_lang)

    Returns:
        "fr" or "en"

    """
    lang = default
    try:
        if text.strip() != "":
            lang, score = language_identifier.classify(text.strip().replace("\n", " ").lower())
            # If score is not high enough we will take default value instead
            if score < 0.8:
                lang = default_lang
    except Exception as e:
        logging.error("following error occurs when trying to detect language: {}".format(e))
    finally:
        return lang


@timing
def f_find_regex_pattern(text, type_, pattern):
    """ Find all occurrences of a pattern in a text and return a list of results
    Args:
        text: the text to be analyzed
        type_: the entity type (value is added in result)
        pattern: regex pattern to be found

    Returns:
        A list containing type, matched value, position start and end of each result

    """
    list_result = []
    results = regex.finditer(pattern, text, flags=regex.IGNORECASE)
    for match in results:
        value = match.string[match.start(): match.end()].replace("\n", " ").strip()
        list_result.append([type_,
                            value,
                            match.start(),
                            match.end(),
                            1])
    return list_result


@timing
def f_ner_regex(text, dict_pattern=dict_regex_pattern,
                df_result=pd.DataFrame()):
    """Run a series of regex expressions to detect email, tel and postal codes in a full text.

    Args:
        text: the text to be analyzed
        dict_pattern: dictionary of regex expressions to be run successively (default nlp.dict_regex_pattern)
        df_result: results of this function will be merged with values provided here.
            If a value is already found at an overlapping position in df_result, the existing value will be kept

    Returns:
        Dataframe containing results merged with provided argument df_result (if any)
    """
    logging.debug("Starting regex")
    list_result = []

    # we run f_find_regex_pattern for each pattern in dict_regex
    for type_, pattern in dict_pattern.items():
        result = f_find_regex_pattern(text, type_, pattern)
        if len(result) != 0:
            list_result += result

    df_result = f_concat_results(df_result, list_result)
    return df_result


@timing
def f_split_text_by_lines(text, position_offset=0):
    """
    :param text: text that should be split
    :return: list containing for each line: [position start, position end, sentence]
    """
    results = []
    # iter_lines = regex.finditer(".*(?=\n|$)", text)
    iter_lines = regex.finditer("[^>\n]((.*?([!?.>] ){1,})|.*(?=\n|$))", text)
    for line_match in iter_lines:
        start_line = line_match.start()
        end_line = line_match.end()
        line = line_match.group()
        if len(line.strip()) > 1:
            results.append([start_line + position_offset, end_line + position_offset, line])
    return results


def f_detect_email_signature(text, df_ner=None, cut_off_score=0.6, lang=default_lang):
    # with tf.device("/cpu:0"):
    if text.strip() == "":
        return None
    if df_ner is None:
        df_ner = f_ner(text, lang=lang)

    df_features = _models_signatures.f_create_email_lines_features(text, df_ner=df_ner)

    if len(df_features) == 0:
        return None

    # We add a dummy value for signature in order to use the same function as for training of the model
    df_features["is_signature"] = -2

    x, y_out, _, _ = _models_signatures.generate_x_y(df_features, _models_signatures.minmax_scaler,
                                                     _models_signatures.standard_scaler)

    y_predict = _models_signatures.model.predict(x)
    y_predict_value = (y_predict > cut_off_score).reshape([-1])
    y_predict_value = np.pad(y_predict_value, (len(df_features) - len(y_predict_value), 0), constant_values=0)[
                      -len(df_features):]
    y_predict_score = y_predict.reshape([-1])
    y_predict_score = np.pad(y_predict_score, (len(df_features) - len(y_predict_score), 0), constant_values=1)[
                      -len(df_features):]

    # return(y_predict, y_mask)
    df_features["prediction"] = y_predict_value
    df_features["score"] = y_predict_score
    # return df_features
    series_position_body = df_features.query(f"""prediction==0""")['end']
    if len(series_position_body) > 0:
        body_end_pos = max(series_position_body)
    else:
        # In this case everything was detected as a signature
        body_end_pos = 0
    score = df_features.query(f"""prediction==1""")["score"].mean()
    signature_text = text[body_end_pos:].strip().replace("\n", " ")
    if signature_text != "":
        list_result = [
            # ["body", text[:body_end_pos], 0 + pos_start_email, body_end_pos + pos_start_email, 1, ""],
            ["SIGNATURE", signature_text, body_end_pos, len(text), score]]

        df_result = f_concat_results(pd.DataFrame(), list_result)
    else:
        df_result = None

    return df_result
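To make the merge semantics concrete: f_ner first collects regex hits (EMAIL, TEL, POST, PRICE, WEB) and then lets the transformer model add PER/ORG/LOC/DATE entities, with f_concat_results dropping any new span that overlaps an existing one. A hedged sketch (invented text; the first call downloads the model from the Hugging Face Hub):

# Illustrative only: layering regex results with model results, as f_ner does internally.
from email_parser import nlp

text = "Contact John Smith at [email protected] or 514 222 5555."

df_regex = nlp.f_ner_regex(text)                                # EMAIL and TEL from the regex patterns
df_all = nlp.f_ner_model(text, lang="en", df_result=df_regex)   # adds PER etc.; keeps existing spans on overlap
print(df_all[["entity", "value", "start", "end"]])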
email_parser/utils.py
ADDED
@@ -0,0 +1,74 @@
from functools import wraps
import logging
import os
from time import time
import configparser

timer_functions = {}

# Loading configuration from config file
config = configparser.ConfigParser()
config.read(os.path.join(os.path.dirname(__file__), 'config.ini'))


def timing(f):
    @wraps(f)
    def wrap(*args, **kw):
        ts = time()
        result = f(*args, **kw)
        te = time()
        if f.__name__ in timer_functions.keys():
            current_elapsed_time = timer_functions[f.__name__]
        else:
            current_elapsed_time = 0
        timer_functions[f.__name__] = current_elapsed_time + (te - ts)
        logging.debug('func:%r took: %2.4f sec' % \
                      (f.__name__, te - ts))
        return result
    return wrap


def f_read_config(path=None):
    """ read config file from specified file path

    :param path: file path
    :return: configparser object
    """
    # Loading configuration from config file
    config = configparser.ConfigParser()
    if path is None:
        path = os.path.join(os.path.dirname(__file__), 'config.ini')
    config.read(path, encoding='utf-8')
    return config


def f_setup_logger(level_sysout=logging.INFO, level_file=logging.DEBUG, folder_path="logs"):
    """Setup logger

    By default we display only INFO in console, and write everything in file

    Args:
        level_sysout: Level that is displayed in console (default INFO)
        level_file: Level that is written in file (default DEBUG)

    Returns:
        Nothing

    """
    if not os.path.isdir(folder_path):
        os.mkdir(folder_path)

    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)

    file_handler = logging.FileHandler(filename=os.path.join(folder_path, "amf_uce_nlp_{}.log".format(time())),
                                       encoding='utf-8')
    sysout_handler = logging.StreamHandler()
    file_handler.setLevel(level_file)
    sysout_handler.setLevel(level_sysout)
    logging.basicConfig(handlers=[file_handler, sysout_handler], level=logging.DEBUG,
                        format='%(asctime)s (%(levelname)s) %(message)s', datefmt='%m/%d/%y %I:%M:%S %p')


def get_model_full_path(model_name):
    path_models = config["DEFAULT"]["path_models"]
    return os.path.join(os.path.dirname(__file__), path_models, model_name)
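A small, self-contained example of the @timing decorator above (not part of the commit); elapsed times accumulate per function name in timer_functions and are also logged at DEBUG level:

# Hypothetical usage of email_parser.utils.timing.
import logging
from email_parser.utils import timing, timer_functions

logging.basicConfig(level=logging.DEBUG)

@timing
def slow_add(a, b):
    return a + b

slow_add(1, 2)
print(timer_functions)   # e.g. {'slow_add': 1.9e-06} -- cumulative seconds per decorated function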
setup.py
ADDED
@@ -0,0 +1,26 @@
from setuptools import find_packages, setup
from glob import glob
import os


setup(name='email_parser',
      packages=find_packages(include=['email_parser']),
      version='0.0.1',
      description='Email parser',
      author='JB Polle',
      license='MIT',
      install_requires=['langid==1.1.6',
                        'numpy>=1.19.5',
                        'pandas>=1.2.3',
                        'regex',
                        'scikit-learn==0.24.1',
                        'sentence-transformers==1.0.4',
                        'tensorflow==2.6.0',
                        'tensorflow-hub>=0.12.0',
                        'tensorflow-text==2.6.0',
                        'tokenizers==0.10.1',
                        'torch>=1.8.0',
                        'umap-learn==0.5.1',
                        'dateparser==1.0.0',
                        'transformers>=4.3',
                        'gradio>=2.7'])