Spaces: jeanpoll — Running
Commit 79e12fd (committed by jeanpoll) • 1 Parent(s): ba8d0da
first working version of app
Files changed:
- .gitignore +144 -0
- Untitled.ipynb +275 -0
- app.py +123 -4
- email_parser/__init__.py +0 -0
- email_parser/_models_signatures.py +184 -0
- email_parser/config.ini +7 -0
- email_parser/doc_email.py +142 -0
- email_parser/models/model_signature_lstm_v10/keras_metadata.pb +3 -0
- email_parser/models/model_signature_lstm_v10/minmax_scaler.p +0 -0
- email_parser/models/model_signature_lstm_v10/saved_model.pb +3 -0
- email_parser/models/model_signature_lstm_v10/standard_scaler.p +0 -0
- email_parser/models/model_signature_lstm_v10/variables/variables.data-00000-of-00001 +0 -0
- email_parser/models/model_signature_lstm_v10/variables/variables.index +0 -0
- email_parser/nlp.py +322 -0
- email_parser/utils.py +74 -0
- setup.py +26 -0
.gitignore
ADDED
@@ -0,0 +1,144 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Pycharm
.idea/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# additional stuff
logs/
Untitled.ipynb
ADDED
@@ -0,0 +1,275 @@
Notebook contents (code cells with their recorded outputs; kernel: Python 3, Python 3.7.10, nbformat 4.5):

# Cell 1 (not executed)
%config Completer.use_jedi = False
%load_ext autoreload
%autoreload 2

# Cell 2 (execution count 1)
import tensorflow
import regex

# Cell 3 (execution count 2)
from transformers import pipeline

# Cell 4 (execution count 3)
from email_parser import nlp

# Cell 5 (execution count 4)
text = """tel: 512 222 5555"""

# Cell 6 (execution count 5)
lang = nlp.f_detect_language(text)
lang
# Out[5]: 'en'

# Cell 7 (execution count 6)
df_result = nlp.f_ner(text, lang=lang)
df_result
# Out[6]:
#   entity         value  start  end  score
# 0    TEL  512 222 5555      5   17      1

# Cell 8 (empty)

# Cell 9 (execution count 16)
nlp.f_detect_email_signature(text, lang="fr")
# Out[16]:
#       entity value  start  end     score
# 0  SIGNATURE    JB    119  122  0.955208

# Cell 10 (execution count 33)
iter_match = regex.finditer("\s|$", text)
list_values = []
start_pos = 0
for match in iter_match:
    word = match.string[start_pos:match.start()]

    df_entity = df_result.query(f"start>={start_pos} & end<={match.start()}").head(1)
    if len(df_entity)==1:
        entity = df_entity["entity"].values[0]
    else:
        entity = None
# list_values
    list_values.append((word, entity))
    start_pos = match.end()
list_values
# Out[33]: [('je', None), ("m'appelle", None), ('Jean-Baptiste', 'PER')]

# Cell 11 (empty)
app.py
CHANGED
@@ -1,7 +1,126 @@
Removed (the previous Gradio "greet" demo):

import gradio as gr

def greet(name):
    return "Hello " + name + "!!"

iface = gr.Interface(fn=greet, inputs="text", outputs="text")

New content of app.py:

import logging, regex
import gradio
from email_parser import utils, nlp
from email_parser.doc_email import Email

def print_highlighted_text(text, df_result, offset=0):
    iter_match = regex.finditer("\s|$", text)
    start_pos = 0
    list_values = []
    for match in iter_match:
        word = match.string[start_pos:match.start()]

        df_entity = df_result.query(f"{start_pos + offset}>=start & {match.start() + offset}<=end").head(1)
        if len(df_entity) == 1:
            entity = df_entity["entity"].values[0]
        else:
            entity = None
        list_values.append((word, entity))
        # list_values.append((match.string[match.start():match.end()], None))
        start_pos = match.end()
    return list_values


def display_email(text, part=1):
    doc = Email(text)
    list_emails = doc.list_emails
    if part <= len(list_emails):
        text = list_emails[int(part-1)]["body"]
        header = list_emails[int(part-1)]["header"]
        lang = nlp.f_detect_language(text)

        if len(header) > 0:
            df_results_header = nlp.f_ner(header, lang=lang)
            df_results_header = Email.f_find_person_in_header(header, df_result=df_results_header)
            list_words_headers = print_highlighted_text(header, df_results_header)
        else:
            list_words_headers = []

        df_result = nlp.f_ner(text, lang=lang)
        df_signature = nlp.f_detect_email_signature(text, df_ner=df_result)
        if df_signature is not None and len(df_signature) > 0:
            start_signature_position = df_signature["start"].values[0]
            text_body = text[:start_signature_position]
            text_signature = text[start_signature_position:]
            list_words_signature = print_highlighted_text(text_signature, df_result, offset=start_signature_position)
        else:
            text_body = text
            list_words_signature = []
        list_words_body = print_highlighted_text(text_body, df_result)

        return None, lang, list_words_headers, list_words_body, list_words_signature
    else:
        return f"Email number {int(part)} was requested but only {len(list_emails)} emails were found in this thread", \
               None, None, None, None


utils.f_setup_logger(level_sysout=logging.ERROR, level_file=logging.INFO, folder_path="logs")


iface = gradio.Interface(title="Parser of email",
                         description="Small application that can extract a specific email in a thread of emails,"
                                     " highlight the entities found in the text (person, organization, date, ...)"
                                     " and extract the email signature if any.",
                         fn=display_email,
                         inputs=["textbox",
                                 gradio.inputs.Number(default=1, label="Email number in thread")],
                         outputs=[
                             gradio.outputs.Textbox(type="str", label="Error"),
                             gradio.outputs.Textbox(type="str", label="Language"),
                             gradio.outputs.HighlightedText(label="Header"),
                             gradio.outputs.HighlightedText(label="Body"),
                             gradio.outputs.HighlightedText(label="Signature")],
                         examples=[["""Bonjour Vincent,
Merci de m’avoir rappelé hier.
Seriez vous disponible pour un rendez vous la semaine prochaine?
Merci,
Jean-Baptiste""", 1], ["""Hello Jack,

I hope you had nice holiday as well.
Please find attached the requested documents,

Best Regards,
George
Vice president of Something
email: [email protected]
tel: 512-222-5555

On Mon, Jan 7, 2022 at 12:39 PM, Jack <[email protected]> wrote:

Hello George,

I wish you a happy new year. I hope you had nice holidays.
Did you see Garry during your vacation?
Do you have the documents I requested earlier?

Thanks,
Jack


""", 1], ["""Hello Jack,

I hope you had nice holiday as well.
Please find attached the requested documents,

Best Regards,
George
Vice president of Something
email: [email protected]
tel: 512-222-5555

On Mon, Jan 7, 2022 at 12:39 PM, Jack <[email protected]> wrote:

Hello George,

I wish you a happy new year. I hope you had nice holidays.
Did you see Garry during your vacation?
Do you have the documents I requested earlier?

Thanks,
Jack


""", 2]])


iface.launch()
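The (word, entity) pairs built by print_highlighted_text are the format Gradio's HighlightedText output consumes: a list of (token, label) tuples, where a None label is rendered as plain, unhighlighted text. A purely illustrative sketch of that shape (invented values, not part of the commit):

# Illustrative only: shape of the data handed to gradio.outputs.HighlightedText.
# Each element is (word, entity_label_or_None); None renders without highlighting.
list_words_body = [
    ("tel:", None),
    ("512-222-5555", "TEL"),
    ("George", "PER"),
]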
email_parser/__init__.py
ADDED
File without changes
email_parser/_models_signatures.py
ADDED
@@ -0,0 +1,184 @@
import logging
import pandas as pd
import numpy as np
import regex
import os
import configparser
from sentence_transformers import SentenceTransformer
from scipy.spatial import distance
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from tensorflow import keras
import pickle

from . import nlp, utils

config = configparser.ConfigParser()
config.read(os.path.join(os.path.dirname(__file__), 'config.ini'))


model_name = config["DEFAULT"]["name_model_signature"]

model = keras.models.load_model(filepath=utils.get_model_full_path(model_name))
minmax_scaler = pickle.load(open(utils.get_model_full_path(model_name + "/minmax_scaler.p"), "rb"))
standard_scaler = pickle.load(open(utils.get_model_full_path(model_name + "/standard_scaler.p"), "rb"))


list_name_columns_features = ["line_number",
                              "text",
                              "start",
                              "end",
                              "PER", "ORG", "LOC", "DATE", "TEL", "EMAIL", "WEB",
                              "SIGNATURE",
                              "word_count",
                              "inv_distance_to_merci",
                              "inv_distance_to_cordlt",
                              "inv_distance_to_regards",
                              "inv_distance_to_sincerely",
                              "inv_distance_to_sent_from",
                              "start_with_ps", "position_line",
                              "special_characters_count", "empty_chars_with_prev_line"]

list_columns_used_in_model = ["PER", "ORG", "LOC", "DATE", "TEL", "EMAIL",
                              # "WEB",
                              "word_count",
                              "inv_distance_to_merci",
                              "inv_distance_to_cordlt",
                              # "inv_distance_to_regards",
                              "inv_distance_to_sincerely",
                              "inv_distance_to_sent_from",
                              "start_with_ps",
                              "position_line",
                              "special_characters_count",
                              "empty_chars_with_prev_line"]

columns_to_scale_minmax = ["PER", "ORG", "LOC", "DATE", "TEL", "EMAIL", "WEB", "position_line",
                           "empty_chars_with_prev_line",
                           "inv_distance_to_merci",
                           "inv_distance_to_cordlt",
                           "inv_distance_to_regards",
                           "inv_distance_to_sincerely",
                           "inv_distance_to_sent_from",
                           "start_with_ps"
                           ]

columns_to_scale_standard = ["word_count", "special_characters_count"]


def f_retrieve_entities_for_line(df_ner, start=0, end=1e12):
    """Retrieve all entities in the previously computed dataframe for a specific line

    Args:
        df_ner: dataframe containing found entities
        start: start position of the line in original text
        end: end position of the line in original text

    """

    if len(df_ner) > 0:
        df = df_ner.query(f"""(start>= {start} and end <= {end}) or (start<={start} and end>={end})""")
        return df


embedder_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")


def f_create_embedding_inv_dist_feature(text1, text2):
    """ Computing distance between two texts based on their embedding
    provided by the SentenceTransformer above"""
    embedding_merci = embedder_model.encode(text1)
    embedding_line = embedder_model.encode(text2)
    dist = distance.cosine(embedding_merci, embedding_line)
    return 1 / (dist + 0.01)


def f_create_email_lines_features(text, df_ner=None, position_offset=0):
    list_lines = nlp.f_split_text_by_lines(text, position_offset)
    list_features_vectors = []
    if df_ner is None:
        df_ner = nlp.f_ner(text)

    for line_number in range(0, len(list_lines)):
        list_features_vectors.append(f_create_line_features(list_lines, line_number, df_ner))

    df_features = pd.DataFrame(list_features_vectors, columns=list_name_columns_features)

    return df_features


def f_create_line_features(list_lines, line_number, df_ner):
    current_line = list_lines[line_number]
    total_lines = len(list_lines)
    features_vector = [line_number, current_line[2], current_line[0], current_line[1]]
    logging.debug(f"Creating line features for {current_line}")
    df_ner_line = f_retrieve_entities_for_line(df_ner=df_ner, start=current_line[0], end=current_line[1])

    # Adding entity counts to feature vector
    for entity in ["PER", "ORG", "LOC", "DATE", "TEL", "EMAIL", "WEB", "SIGNATURE"]:
        value = len(df_ner_line.query(f"entity=='{entity}'")) if df_ner_line is not None else 0
        features_vector.append(value)
    # Adding word count
    features_vector.append(len(current_line[2].split()))
    # distance to greeting word "merci"
    features_vector.append(f_create_embedding_inv_dist_feature("merci", current_line[2].lower()))

    # distance to greeting word "cordialement"
    features_vector.append(f_create_embedding_inv_dist_feature("cordialement", current_line[2].lower()))

    # distance to greeting word "regards"
    features_vector.append(f_create_embedding_inv_dist_feature("regards", current_line[2].lower()))

    # distance to greeting word "sincerely"
    features_vector.append(f_create_embedding_inv_dist_feature("sincerely", current_line[2].lower()))

    # distance to words "sent from"
    features_vector.append(f_create_embedding_inv_dist_feature("sent from", current_line[2].lower()))

    # Line starts with "ps:"
    features_vector.append(regex.match(r"\s*ps *:", current_line[2], flags=regex.IGNORECASE) is not None)

    # Adding position of line in email
    position_in_email = (line_number + 1) / total_lines
    features_vector.append(position_in_email)
    # Adding special character count
    special_char_count = len(regex.findall(r"[^\p{L}0-9 .,\n]", current_line[2]))
    features_vector.append(special_char_count)
    # Number of empty chars with previous line
    empty_chars_with_prev_line = 0 if line_number == 0 else current_line[0] - list_lines[line_number - 1][1]
    features_vector.append(empty_chars_with_prev_line)
    return features_vector


def generate_x_y(df, minmax_scaler=None, standard_scaler=None, n_last_lines_to_keep=30,
                 list_columns=list_columns_used_in_model):
    df, minmax_scaler, standard_scaler = f_scale_parameters(df, minmax_scaler, standard_scaler)
    x = df[list_columns].to_numpy()[-n_last_lines_to_keep:, :]
    x = np.expand_dims(x, axis=0)
    y = df["is_signature"].to_numpy()[-n_last_lines_to_keep:]
    y = np.expand_dims(y, axis=0)
    return x, y, minmax_scaler, standard_scaler


def f_scale_parameters(df_tagged_data, minmax_scaler=None, standard_scaler=None):
    # df_tagged_data = df_tagged_data.copy(deep=True)
    if minmax_scaler is None:
        logging.debug("fitting new min max scaler")
        minmax_scaler = MinMaxScaler()
        df_tagged_data.loc[:, columns_to_scale_minmax] = minmax_scaler.fit_transform(
            df_tagged_data[columns_to_scale_minmax])
    else:
        logging.debug("using already fitted minmax scaler")
        df_tagged_data.loc[:, columns_to_scale_minmax] = minmax_scaler.transform(
            df_tagged_data[columns_to_scale_minmax])

    if standard_scaler is None:
        logging.debug("fitting new standard scaler")
        standard_scaler = StandardScaler()
        df_tagged_data.loc[:, columns_to_scale_standard] = standard_scaler.fit_transform(
            df_tagged_data[columns_to_scale_standard])
    else:
        logging.debug("using already fitted scaler")
        df_tagged_data.loc[:, columns_to_scale_standard] = standard_scaler.transform(
            df_tagged_data[columns_to_scale_standard])
    return df_tagged_data, minmax_scaler, standard_scaler
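For orientation, a rough usage sketch of the feature pipeline above (hypothetical input text; assumes the package and its bundled model files are installed). generate_x_y scales the per-line features and reshapes the last n_last_lines_to_keep lines into a single batch for the signature model:

# Hypothetical sketch, mirroring what f_detect_email_signature in nlp.py does.
from email_parser import _models_signatures, nlp

text = "Thanks,\nJohn Smith\ntel: 512-222-5555"
df_features = _models_signatures.f_create_email_lines_features(text, df_ner=nlp.f_ner(text))
df_features["is_signature"] = -2   # dummy label so the training-time helper can be reused
x, _, _, _ = _models_signatures.generate_x_y(df_features,
                                             _models_signatures.minmax_scaler,
                                             _models_signatures.standard_scaler)
print(x.shape)   # roughly (1, min(len(df_features), 30), len(_models_signatures.list_columns_used_in_model))
y_predict = _models_signatures.model.predict(x)   # per-line signature probabilities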
email_parser/config.ini
ADDED
@@ -0,0 +1,7 @@
[DEFAULT]
ner_model_fr = Jean-Baptiste/camembert-ner-with-dates
ner_model_en = Jean-Baptiste/roberta-large-ner-english
device = -1
default_lang = en
name_model_signature = model_signature_lstm_v10
path_models = models
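These keys are read with configparser by nlp.py and utils.py below; a minimal sketch of how they are consumed (the relative path assumes you run it from the repository root), with device = -1 keeping the transformers pipeline on CPU:

# Sketch of how config.ini is read elsewhere in this commit (see email_parser/nlp.py and utils.py).
import configparser, os

config = configparser.ConfigParser()
config.read(os.path.join("email_parser", "config.ini"))

device = int(config["DEFAULT"]["device"])        # -1 -> CPU for the transformers pipeline
model_name = config["DEFAULT"]["ner_model_en"]   # Hugging Face model id used for English NER
print(device, model_name)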
email_parser/doc_email.py
ADDED
@@ -0,0 +1,142 @@
import regex
import pandas as pd

from . import nlp


class Email:

    def __init__(self,
                 raw_text):
        """ Constructor for email
        :param raw_text: raw text of email
        """
        self.raw_text = raw_text
        self.list_emails = self.f_split_email_thread()

    def f_split_email_thread(self):
        """ Function to split a thread of emails into a list of individual emails.

        Two main formats of header are recognized:

        1) Multi-line header similar to
           De : sads Cadsfdsf [mailto:[email protected]]
           Envoyé : 30 mars 2015 08:33
           À : asdsad, sadsadasd (CA - asdasd)
           Objet : Re: TR: sadasdasdsad sa dsa
        2) Le 2015-03-30 à 08:25, Luc, Archambault (CA - Drummondville) <[email protected]> a écrit :

        Returns:
            list of dict. Dict contains for each email: (body, header, start, start_header, date, lang)

        """

        pattern = r"(((\n{1}\t*|\n(-{4,}.*-{4,}\s*)|^)(([> *]*(de|from|Exp.diteur|Subject)[\s]*:).*(\n[^A-Z].*)?[\r\n\t\s,]{1,}){1,})(([> *\t]*[\p{L}\p{M}' -]*[\s]*:).*((\n[ ]{3,7}?.*|(\n<.*))*)[\r\n\t\s,]{1,3}?){2,}" \
                  r"|(\s*((((de|from|Exp.diteur|Subject)[\s]*:).{0,200}?[\r\n\t\s,]{1,}){1})(?!de)(((envoy.|.|to|date).?[\s]*:).*?){1,}(((objet|subject)[\s]*:).*?[!?.><,]){1})" \
                  r"|((?<=\n)(([ >\t]*)(le|on|el).{0,30}\d{4,}.{0,100}\n*.{0,100}(wrote|.crit|escribió)\s*:))" \
                  r"|(\b(le|on)\s*((\d{2,4}[- ]){3}|(\d{1,2}.{1,8}\d{4}))[^\n]*?(wrote|.crit)\s*:)" \
                  r"|$)"

        results = regex.finditer(pattern, self.raw_text, flags=regex.IGNORECASE)
        start_of_current_header = 0
        end_of_current_header = 0
        part_email = 1

        if results is not None:
            list_email = []

            for result in results:

                start_of_next_header = result.start()

                # if header_group is not None and full_email[0:header_group.start()].lstrip() == "":
                if start_of_current_header != end_of_current_header:
                    header = self.raw_text[start_of_current_header: end_of_current_header]
                    body = self.raw_text[end_of_current_header:start_of_next_header]

                    start = end_of_current_header
                    start_header = start_of_current_header

                # Case where no header was found (either last email of thread or regex didn't find it)
                else:
                    header = ""
                    body = self.raw_text[end_of_current_header:start_of_next_header]
                    start = end_of_current_header
                    start_header = start_of_current_header

                # we detect language for each email of the thread and default to detected thread language otherwise
                # We detect only on first 150 characters
                lang = nlp.f_detect_language(body[:150])

                if body.strip() != "" or header != "":
                    list_email.append({"body": body,
                                       "header": header,
                                       "start": start,
                                       "start_header": start_header,
                                       "lang": lang,
                                       "part": part_email
                                       })
                    part_email += 1
                # previous_from_tag = current_from_tag
                start_of_current_header = result.start()
                end_of_current_header = result.end()

            return list_email
        # Case where mail is not a thread
        else:
            return [{"body": self.raw_text,
                     "header": "",
                     "start": 0}]

    @staticmethod
    def f_find_person_in_header(header, df_result=pd.DataFrame()):
        results = []
        dict_header = Email.f_split_email_headers(header)
        for key in ["to", "cc", "from"]:
            if key in dict_header.keys():
                line_header = dict_header[key][0]
                start_posit = dict_header[key][1]
                pattern_person = r"(?<=\s|'|^)[\p{L}\p{M}\s,-]{2,}(?=[\s;']|$)"
                list_results = regex.finditer(pattern_person, line_header, flags=regex.IGNORECASE)
                for match in list_results:
                    value = match.group()
                    if value.strip() != "":
                        start = match.start()
                        end = match.end()
                        results.append(["PER",
                                        value,
                                        start_posit + start,
                                        start_posit + end,
                                        1
                                        ])
        df_result = nlp.f_concat_results(df_result, results)
        return df_result

    @staticmethod
    def f_split_email_headers(header):
        """ Split headers into from/to/date, ... in a dictionary

        Args:
            header:

        Returns:

        """
        matching_header_keywords = {"à": "to",
                                    "Destinataire": "to",
                                    "de": "from",
                                    "envoyé": "date",
                                    "sent": "date",
                                    "objet": "subject"}
        dict_results = {}
        pattern = r"((?<=\s|^)(à|À|a\p{M}|Cc|To|De|From|Envoy.|Date|Sent|Objet|Subject|Destinataire)\s?:)[ ]*((.*?)[ ]*((\n[ ]{3,7}?.*)*))(?=[\p{L}\p{M}]*\s{1,}:| > |\n|$)"
        list_results = regex.finditer(pattern, header, flags=regex.IGNORECASE)
        for match in list_results:
            key_word = match.group(2).strip().lower()
            key_word_matched = matching_header_keywords.get(key_word)
            dict_results[key_word_matched if not key_word_matched is None else key_word] = [match.group(3),
                                                                                            match.span(3)[0],
                                                                                            match.span(3)[1]]
        return dict_results
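A hypothetical usage sketch of the thread splitter above (the thread text is invented; it assumes the package and its models are installed, since the constructor calls nlp.f_detect_language on each part):

# Hypothetical example of Email.f_split_email_thread via the constructor.
from email_parser.doc_email import Email

thread = """Thanks, see you tomorrow.
John

On Mon, Jan 7, 2022 at 12:39 PM, Jack <[email protected]> wrote:

Hi John, are we still meeting tomorrow?
Jack
"""

doc = Email(thread)
for part in doc.list_emails:
    # each dict carries body, header, start, start_header, lang and part
    print(part["part"], part["lang"], repr(part["header"][:40]))
    print(part["body"].strip()[:60])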
email_parser/models/model_signature_lstm_v10/keras_metadata.pb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1df1ebcda9b9f2ca0855f67117d5c8b7db0d89c46c346273a536f2eec13c5665
size 22060
email_parser/models/model_signature_lstm_v10/minmax_scaler.p
ADDED
Binary file (1.16 kB)
email_parser/models/model_signature_lstm_v10/saved_model.pb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a28bac82659a6bc1cf949dc04d01a09db681cab64c9388ff1267d53fa3d11fb2
size 5272723
email_parser/models/model_signature_lstm_v10/standard_scaler.p
ADDED
Binary file (584 Bytes)
email_parser/models/model_signature_lstm_v10/variables/variables.data-00000-of-00001
ADDED
Binary file (116 kB)
email_parser/models/model_signature_lstm_v10/variables/variables.index
ADDED
Binary file (3.48 kB)
email_parser/nlp.py
ADDED
@@ -0,0 +1,322 @@
import logging
import os
import regex
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
import pandas as pd
import numpy as np

from . import utils, _models_signatures
from .utils import timing
from langid.langid import LanguageIdentifier
from langid.langid import model as model_langid

# Creating language_identifier object for usage in function f_detect_language
language_identifier = LanguageIdentifier.from_modelstring(model_langid, norm_probs=True)
language_identifier.set_languages(['en', 'fr'])


logging.info(f"Reading config file from folder:{os.path.join(os.path.dirname(__file__))}")

config = utils.f_read_config(os.path.join(os.path.dirname(__file__), 'config.ini'))

device = int(config["DEFAULT"]["device"])
default_lang = config["DEFAULT"]["default_lang"]

tokenizer_dict = {}
models_dict = {}
nlp_dict = {}


dict_regex_pattern = dict(EMAIL=r'[\p{L}\p{M}\-\d._]{1,}@[\p{L}\p{M}\d\-_]{1,}(\.[\p{L}\p{M}]{1,}){1,}',
                          TEL=r'(?<!\d)(\+?\d{1,2}[ -]?)?\(?\d{3}\)?[ .-]?\d{3}[ .-]?\d{4}(?!\d|\p{P}\d)',
                          POST=r'\b([A-z][0-9][A-z][ -]?[0-9][A-z][0-9]|[A-z][0-9][A-z])\b',
                          PRICE=r"(([\s:,]|^){1}\$*(CA|CAD|USD|EUR|GBP|\$|\€|\£|\¢){1}\$*[\d., ]*[\d]{1,}\b)" +
                                "|([\d]{1,}[\d., ]*(CA|CAD|USD|EUR|GBP|\$|\€|\£|k|m|\¢){1,}\$*(?=\s|\p{P}|$))",
                          WEB=r"((www(\.[\p{L}\p{M}\-0-9]]{1,}){2,})" +
                              "|(https?:[^ ]*)" +
                              # r"|(([\p{L}\p{M}\.]{3,}){2,})|"
                              r"|((?<=[\s:]|^)([\p{L}\p{M}\-0-9]{1,}\.){1,}(com|ca|org|fr){1,}\b))")
# WEB=r"(http(s)?:\/\/)?[a-z0-9]{1}[a-z0-9-._~]+[.]{1}(com|ca)(?![\p{L}\p{M}])")


def f_load_tokenizer_and_model_for_nlp(model_name, pipeline_type='ner'):
    """
    Loading model and tokenizer takes a long time.
    We do it once and store the model and tokenizer in global dicts for next usage
    Args:
        model_name: Name of the model that should be loaded and stored
        pipeline_type: type of pipeline that should be initialized

    Returns: tokenizer, model

    """
    global tokenizer_dict, models_dict, nlp_dict
    auto_model = None
    if pipeline_type == "ner":
        auto_model = AutoModelForTokenClassification

    if model_name not in tokenizer_dict.keys() or model_name not in models_dict.keys() or model_name not in nlp_dict.keys():
        logging.info(
            f"Loading tokenizer and model: {model_name}")
        tokenizer_dict[model_name] = AutoTokenizer.from_pretrained(model_name)
        # , add_prefix_space = True
        models_dict[model_name] = auto_model.from_pretrained(model_name)
        if pipeline_type == 'ner':
            nlp_dict[model_name] = pipeline(pipeline_type, model=models_dict[model_name], tokenizer=tokenizer_dict[model_name],
                                            aggregation_strategy="simple", device=device)


def f_ner(text, lang=default_lang):
    df_result = f_ner_regex(text)
    df_result = f_ner_model(text, lang=lang, df_result=df_result)
    return df_result


@timing
def f_ner_model(text, lang=default_lang, df_result=pd.DataFrame()):
    list_result = []
    # We split the text by sentence and run model on each one
    sentence_tokenizer = f_split_text_by_lines(text)
    for start, end, value in sentence_tokenizer:
        if value != "":
            results = f_ner_model_by_sentence(value, lang=lang, pos_offset=start)
            if len(results) != 0:
                list_result += results
    return f_concat_results(df_result, list_result)


@timing
def f_ner_model_by_sentence(sentence, lang=default_lang, df_result=pd.DataFrame(), pos_offset=0):
    """ Run ner algorithm

    Args:
        sentence : sentence on which to run model
        lang : lang to determine which model to use
        df_result : If results of f_ner should be combined with previous values
            (in this case we will keep the previous values if tags overlapped)

    Returns:
        Dataframe with identified entities

    """

    if not config.has_option('DEFAULT', 'ner_model_' + lang):
        raise ValueError(f"No model was defined for ner in {lang}")

    model_name = config['DEFAULT']['ner_model_' + lang]
    f_load_tokenizer_and_model_for_nlp(model_name)
    logging.debug(f"starting {model_name} on sentence:'{sentence}'")

    results = nlp_dict[model_name](sentence)
    list_result = []
    for result in results:
        if result["word"] != "" and result['entity_group'] in ["PER", "LOC", "ORG", "DATE"]:

            # Required because sometimes spaces are included in result["word"] value, but not in start/end position
            value = sentence[result["start"]:result["end"]]

            # We remove any special character at the beginning
            pattern = r"[^.,'’` \":()\n].*"
            result_regex = regex.search(pattern, value, flags=regex.IGNORECASE)

            if result_regex is not None:
                word_raw = result_regex.group()
                word = word_raw
                real_word_start = result["start"] + result_regex.start()
                real_word_end = result["start"] + result_regex.start() + len(word_raw)
                # We check if entity might be inside a longer word, if this is the case we ignore it
                letter_before = sentence[max(0, real_word_start - 1): real_word_start]
                letter_after = sentence[real_word_end: min(len(sentence), real_word_end + 1)]
                if regex.match(r"[A-z]", letter_before) or regex.match(r"[A-z]", letter_after):
                    logging.debug(f"Ignoring entity {value} because letter before is"
                                  f" '{letter_before}' or letter after is '{letter_after}'")
                    continue

                list_result.append(
                    [result["entity_group"],
                     word,
                     real_word_start + pos_offset,
                     real_word_end + pos_offset,
                     result["score"]])

    return list_result


@timing
def f_concat_results(df_result, list_result_new):
    """ Merge results between existing dataframe and a list of new values

    Args:
        df_result: dataframe of entities
        list_result_new: list of new entities to be added in df_result

    Returns:
        Dataframe with all entities. Entities in list_result_new that were overlapping the position of another entity in
        df_result are ignored.

    """
    # If df_result and list_result_new are both empty, we return an empty dataframe
    list_columns_names = ["entity", "value", "start", "end", "score"]
    if (df_result is None or len(df_result) == 0) and (list_result_new is None or len(list_result_new) == 0):
        return pd.DataFrame()
    elif len(list_result_new) > 0:
        if df_result is None or len(df_result) == 0:
            return pd.DataFrame(list_result_new,
                                columns=list_columns_names)
        list_row = []
        for row in list_result_new:
            df_intersect = df_result.query("({1}>=start and {0}<=end)".format(row[2], row[3]))
            if len(df_intersect) == 0:
                list_row.append(row)
        df_final = pd.concat([df_result,
                              pd.DataFrame(list_row,
                                           columns=list_columns_names)],
                             ignore_index=True) \
            .sort_values(by="start")
        return df_final
    else:
        # If list_result_new was empty we just return df_result
        return df_result


@timing
def f_detect_language(text, default=default_lang):
    """ Detect language

    Args:
        text: text on which language should be detected
        default: default value if there is an error or the score of the predicted value is too low (default nlp.default_lang)

    Returns:
        "fr" or "en"

    """
    lang = default
    try:
        if text.strip() != "":
            lang, score = language_identifier.classify(text.strip().replace("\n", " ").lower())
            # If score is not high enough we will take default value instead
            if score < 0.8:
                lang = default_lang
    except Exception as e:
        logging.error("following error occurs when trying to detect language: {}".format(e))
    finally:
        return lang


@timing
def f_find_regex_pattern(text, type_, pattern):
    """ Find all occurrences of a pattern in a text and return a list of results
    Args:
        text: the text to be analyzed
        type_: the entity type (value is added in result)
        pattern: regex pattern to be found

    Returns:
        A list containing type, matched value, position start and end of each result

    """
    list_result = []
    results = regex.finditer(pattern, text, flags=regex.IGNORECASE)
    for match in results:
        value = match.string[match.start(): match.end()].replace("\n", " ").strip()
        list_result.append([type_,
                            value,
                            match.start(),
                            match.end(),
                            1])
    return list_result


@timing
def f_ner_regex(text, dict_pattern=dict_regex_pattern,
                df_result=pd.DataFrame()):
    """Run a series of regex expressions to detect email, tel and postal codes in a full text.

    Args:
        text: the text to be analyzed
        dict_pattern: dictionary of regex expressions to be run successively (default nlp.dict_regex_pattern)
        df_result: results of this function will be merged with values provided here.
            If a value is already found at an overlapping position in df_result, the existing value will be kept

    Returns:
        Dataframe containing results merged with provided argument df_result (if any)
    """
    logging.debug("Starting regex")
    list_result = []

    # we run f_find_regex_pattern for each pattern in dict_regex
    for type_, pattern in dict_pattern.items():
        result = f_find_regex_pattern(text, type_, pattern)
        if len(result) != 0:
            list_result += result

    df_result = f_concat_results(df_result, list_result)
    return df_result


@timing
def f_split_text_by_lines(text, position_offset=0):
    """
    :param text: text that should be split
    :return: list containing for each line: [position start, position end, sentence]
    """
    results = []
    # iter_lines = regex.finditer(".*(?=\n|$)", text)
    iter_lines = regex.finditer("[^>\n]((.*?([!?.>] ){1,})|.*(?=\n|$))", text)
    for line_match in iter_lines:
        start_line = line_match.start()
        end_line = line_match.end()
        line = line_match.group()
        if len(line.strip()) > 1:
            results.append([start_line + position_offset, end_line + position_offset, line])
    return results


def f_detect_email_signature(text, df_ner=None, cut_off_score=0.6, lang=default_lang):
    # with tf.device("/cpu:0"):
    if text.strip() == "":
        return None
    if df_ner is None:
        df_ner = f_ner(text, lang=lang)

    df_features = _models_signatures.f_create_email_lines_features(text, df_ner=df_ner)

    if len(df_features) == 0:
        return None

    # We add a dummy value for signature in order to use the same function as for training of the model
    df_features["is_signature"] = -2

    x, y_out, _, _ = _models_signatures.generate_x_y(df_features, _models_signatures.minmax_scaler,
                                                     _models_signatures.standard_scaler)

    y_predict = _models_signatures.model.predict(x)
    y_predict_value = (y_predict > cut_off_score).reshape([-1])
    y_predict_value = np.pad(y_predict_value, (len(df_features) - len(y_predict_value), 0), constant_values=0)[
                      -len(df_features):]
    y_predict_score = y_predict.reshape([-1])
    y_predict_score = np.pad(y_predict_score, (len(df_features) - len(y_predict_score), 0), constant_values=1)[
                      -len(df_features):]

    # return(y_predict, y_mask)
    df_features["prediction"] = y_predict_value
    df_features["score"] = y_predict_score
    # return df_features
    series_position_body = df_features.query(f"""prediction==0""")['end']
    if len(series_position_body) > 0:
        body_end_pos = max(series_position_body)
    else:
        # In this case everything was detected as a signature
        body_end_pos = 0
    score = df_features.query(f"""prediction==1""")["score"].mean()
    signature_text = text[body_end_pos:].strip().replace("\n", " ")
    if signature_text != "":
        list_result = [
            # ["body", text[:body_end_pos], 0 + pos_start_email, body_end_pos + pos_start_email, 1, ""],
            ["SIGNATURE", signature_text, body_end_pos, len(text), score]]

        df_result = f_concat_results(pd.DataFrame(), list_result)
    else:
        df_result = None

    return df_result
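To make the merge semantics concrete: f_ner first collects regex hits (EMAIL, TEL, POST, PRICE, WEB) and then lets the transformer model add PER/ORG/LOC/DATE entities, with f_concat_results dropping any new span that overlaps an existing one. A hedged sketch (invented text; the first call downloads the model from the Hugging Face Hub):

# Illustrative only: layering regex results with model results, as f_ner does internally.
from email_parser import nlp

text = "Contact John Smith at [email protected] or 514 222 5555."

df_regex = nlp.f_ner_regex(text)                                # EMAIL and TEL from the regex patterns
df_all = nlp.f_ner_model(text, lang="en", df_result=df_regex)   # adds PER etc.; keeps existing spans on overlap
print(df_all[["entity", "value", "start", "end"]])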
email_parser/utils.py
ADDED
@@ -0,0 +1,74 @@
from functools import wraps
import logging
import os
from time import time
import configparser

timer_functions = {}

# Loading configuration from config file
config = configparser.ConfigParser()
config.read(os.path.join(os.path.dirname(__file__), 'config.ini'))


def timing(f):
    @wraps(f)
    def wrap(*args, **kw):
        ts = time()
        result = f(*args, **kw)
        te = time()
        if f.__name__ in timer_functions.keys():
            current_elapsed_time = timer_functions[f.__name__]
        else:
            current_elapsed_time = 0
        timer_functions[f.__name__] = current_elapsed_time + (te - ts)
        logging.debug('func:%r took: %2.4f sec' % \
                      (f.__name__, te - ts))
        return result
    return wrap


def f_read_config(path=None):
    """ read config file from specified file path

    :param path: file path
    :return: configparser object
    """
    # Loading configuration from config file
    config = configparser.ConfigParser()
    if path is None:
        path = os.path.join(os.path.dirname(__file__), 'config.ini')
    config.read(path, encoding='utf-8')
    return config


def f_setup_logger(level_sysout=logging.INFO, level_file=logging.DEBUG, folder_path="logs"):
    """Setup logger

    By default we display only INFO in console, and write everything in file

    Args:
        level_sysout: Level that is displayed in console (default INFO)
        level_file: Level that is written in file (default DEBUG)

    Returns:
        Nothing

    """
    if not os.path.isdir(folder_path):
        os.mkdir(folder_path)

    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)

    file_handler = logging.FileHandler(filename=os.path.join(folder_path, "amf_uce_nlp_{}.log".format(time())),
                                       encoding='utf-8')
    sysout_handler = logging.StreamHandler()
    file_handler.setLevel(level_file)
    sysout_handler.setLevel(level_sysout)
    logging.basicConfig(handlers=[file_handler, sysout_handler], level=logging.DEBUG,
                        format='%(asctime)s (%(levelname)s) %(message)s', datefmt='%m/%d/%y %I:%M:%S %p')


def get_model_full_path(model_name):
    path_models = config["DEFAULT"]["path_models"]
    return os.path.join(os.path.dirname(__file__), path_models, model_name)
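A small, self-contained example of the @timing decorator above (not part of the commit); elapsed times accumulate per function name in timer_functions and are also logged at DEBUG level:

# Hypothetical usage of email_parser.utils.timing.
import logging
from email_parser.utils import timing, timer_functions

logging.basicConfig(level=logging.DEBUG)

@timing
def slow_add(a, b):
    return a + b

slow_add(1, 2)
print(timer_functions)   # e.g. {'slow_add': 1.9e-06} -- cumulative seconds per decorated function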
setup.py
ADDED
@@ -0,0 +1,26 @@
from setuptools import find_packages, setup
from glob import glob
import os


setup(name='email_parser',
      packages=find_packages(include=['email_parser']),
      version='0.0.1',
      description='Email parser',
      author='JB Polle',
      license='MIT',
      install_requires=['langid==1.1.6',
                        'numpy>=1.19.5',
                        'pandas>=1.2.3',
                        'regex',
                        'scikit-learn==0.24.1',
                        'sentence-transformers==1.0.4',
                        'tensorflow==2.6.0',
                        'tensorflow-hub>=0.12.0',
                        'tensorflow-text==2.6.0',
                        'tokenizers==0.10.1',
                        'torch>=1.8.0',
                        'umap-learn==0.5.1',
                        'dateparser==1.0.0',
                        'transformers>=4.3',
                        'gradio>=2.7'])