Spaces:
Runtime error
Runtime error
arogeriogel
committed on
update app
Browse files
app.py
CHANGED
@@ -3,7 +3,7 @@ import streamlit as st
|
|
3 |
import re
|
4 |
import logging
|
5 |
from presidio_anonymizer import AnonymizerEngine
|
6 |
-
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, RecognizerResult
|
7 |
|
8 |
from annotated_text import annotated_text
|
9 |
from flair_recognizer import FlairRecognizer
|
@@ -41,21 +41,21 @@ def analyze(**kwargs):
|
|
41 |
if "entities" not in kwargs or "All" in kwargs["entities"]:
|
42 |
kwargs["entities"] = None
|
43 |
|
44 |
-
if st.session_state.excluded_words:
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
|
|
55 |
|
56 |
results = analyzer_engine().analyze(**kwargs)
|
57 |
-
|
58 |
-
return results
|
59 |
|
60 |
def annotate():
|
61 |
text = st.session_state.text
|
@@ -98,64 +98,76 @@ def analyze_text():
|
|
98 |
logging.info(f"This is the text being analysed: {st.session_state.text}")
|
99 |
st.session_state.text_error = ""
|
100 |
st.session_state.n_requests += 1
|
101 |
-
|
102 |
text=st.session_state.text,
|
103 |
entities=st_entities,
|
104 |
language="en",
|
105 |
return_decision_process=False,
|
106 |
)
|
107 |
|
108 |
-
|
109 |
-
|
110 |
|
111 |
if st.session_state.allowed_words:
|
112 |
-
|
113 |
-
|
114 |
-
st.session_state.analyze_results = analyze_results
|
115 |
|
116 |
logging.info(
|
117 |
f"analyse results: {st.session_state.analyze_results}\n"
|
118 |
)
|
119 |
|
120 |
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
#
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
logging.info(
|
149 |
-
f"analyse results
|
150 |
)
|
151 |
-
|
|
|
|
|
|
|
|
|
152 |
if st.session_state.text[token.start:token.end] not in st.session_state.allowed_words:
|
153 |
analyze_results_fltered.append(token)
|
154 |
logging.info(
|
155 |
f"analyse results after removing allowed words: {analyze_results_fltered}\n"
|
156 |
)
|
157 |
-
|
158 |
-
|
159 |
|
160 |
@st.cache(allow_output_mutation=True)
|
161 |
def anonymizer_engine():
|
@@ -190,8 +202,7 @@ def anonymise_text():
|
|
190 |
def clear_results():
|
191 |
st.session_state.anon_results=""
|
192 |
st.session_state.analyze_results=""
|
193 |
-
#
|
194 |
-
analyzer_engine().registry.remove_recognizer("Excluded words recognizer")
|
195 |
|
196 |
#######################################
|
197 |
#### Initialize "global" variables ####
|
|
|
3 |
import re
|
4 |
import logging
|
5 |
from presidio_anonymizer import AnonymizerEngine
|
6 |
+
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, RecognizerResult, EntityRecognizer
|
7 |
|
8 |
from annotated_text import annotated_text
|
9 |
from flair_recognizer import FlairRecognizer
|
|
|
41 |
if "entities" not in kwargs or "All" in kwargs["entities"]:
|
42 |
kwargs["entities"] = None
|
43 |
|
44 |
+
# if st.session_state.excluded_words:
|
45 |
+
|
46 |
+
# deny_list = [i.strip() for i in st.session_state.excluded_words.split(',')]
|
47 |
+
|
48 |
+
# logging.info(
|
49 |
+
# f"words excluded : {deny_list}\n"
|
50 |
+
# )
|
51 |
+
|
52 |
+
# excluded_words_recognizer = PatternRecognizer(supported_entity="MANUAL ADD",
|
53 |
+
# name="Excluded words recognizer",
|
54 |
+
# deny_list=deny_list)
|
55 |
+
# analyzer_engine().registry.add_recognizer(excluded_words_recognizer)
|
56 |
|
57 |
results = analyzer_engine().analyze(**kwargs)
|
58 |
+
st.session_state.analyze_results = results
|
|
|
59 |
|
60 |
def annotate():
|
61 |
text = st.session_state.text
|
|
|
98 |
logging.info(f"This is the text being analysed: {st.session_state.text}")
|
99 |
st.session_state.text_error = ""
|
100 |
st.session_state.n_requests += 1
|
101 |
+
analyze(
|
102 |
text=st.session_state.text,
|
103 |
entities=st_entities,
|
104 |
language="en",
|
105 |
return_decision_process=False,
|
106 |
)
|
107 |
|
108 |
+
if st.session_state.excluded_words:
|
109 |
+
include_manual_input()
|
110 |
|
111 |
if st.session_state.allowed_words:
|
112 |
+
exclude_manual_input()
|
|
|
|
|
113 |
|
114 |
logging.info(
|
115 |
f"analyse results: {st.session_state.analyze_results}\n"
|
116 |
)
|
117 |
|
118 |
|
119 |
+
def include_manual_input():
|
120 |
+
deny_list = [i.strip() for i in st.session_state.excluded_words.split(',')]
|
121 |
+
def _deny_list_to_regex(deny_list):
|
122 |
+
"""
|
123 |
+
Convert a list of words to a matching regex.
|
124 |
+
To be analyzed by the analyze method as any other regex patterns.
|
125 |
+
:param deny_list: the list of words to detect
|
126 |
+
:return:the regex of the words for detection
|
127 |
+
"""
|
128 |
+
# Escape deny list elements as preparation for regex
|
129 |
+
escaped_deny_list = [re.escape(element) for element in deny_list]
|
130 |
+
regex = r"(?:^|(?<=\W))(" + "|".join(escaped_deny_list) + r")(?:(?=\W)|$)"
|
131 |
+
return regex
|
132 |
+
|
133 |
+
deny_list_pattern = _deny_list_to_regex(deny_list)
|
134 |
+
matches = re.finditer(deny_list_pattern, st.session_state.text)
|
135 |
+
results = []
|
136 |
+
for match in matches:
|
137 |
+
start, end = match.span()
|
138 |
+
current_match = st.session_state.text[start:end]
|
139 |
+
|
140 |
+
# Skip empty results
|
141 |
+
if current_match == "":
|
142 |
+
continue
|
143 |
+
|
144 |
+
pattern_result = RecognizerResult(
|
145 |
+
entity_type='MANUALLY ADDED',
|
146 |
+
start=start,
|
147 |
+
end=end,
|
148 |
+
score=1.0,
|
149 |
+
)
|
150 |
+
|
151 |
+
results.append(pattern_result)
|
152 |
+
|
153 |
+
results = EntityRecognizer.remove_duplicates(results)
|
154 |
+
|
155 |
+
st.session_state.analyze_results = st.session_state.analyze_results.extend(results)
|
156 |
+
|
157 |
logging.info(
|
158 |
+
f"analyse results after adding excluded words: {results}\n"
|
159 |
)
|
160 |
+
|
161 |
+
def exclude_manual_input():
|
162 |
+
analyze_results_fltered=[]
|
163 |
+
|
164 |
+
for token in st.session_state.analyze_results:
|
165 |
if st.session_state.text[token.start:token.end] not in st.session_state.allowed_words:
|
166 |
analyze_results_fltered.append(token)
|
167 |
logging.info(
|
168 |
f"analyse results after removing allowed words: {analyze_results_fltered}\n"
|
169 |
)
|
170 |
+
st.session_state.analyze_results = analyze_results_fltered
|
|
|
171 |
|
172 |
@st.cache(allow_output_mutation=True)
|
173 |
def anonymizer_engine():
|
|
|
202 |
def clear_results():
|
203 |
st.session_state.anon_results=""
|
204 |
st.session_state.analyze_results=""
|
205 |
+
# analyzer_engine().registry.remove_recognizer("Excluded words recognizer")
|
|
|
206 |
|
207 |
#######################################
|
208 |
#### Initialize "global" variables ####
|