arogeriogel committed on
Commit
16e8a0e
·
unverified ·
1 Parent(s): 4439610

update app

Browse files
Files changed (1) hide show
  1. app.py +64 -53
app.py CHANGED
@@ -3,7 +3,7 @@ import streamlit as st
3
  import re
4
  import logging
5
  from presidio_anonymizer import AnonymizerEngine
6
- from presidio_analyzer import AnalyzerEngine, PatternRecognizer, RecognizerResult
7
 
8
  from annotated_text import annotated_text
9
  from flair_recognizer import FlairRecognizer
@@ -41,21 +41,21 @@ def analyze(**kwargs):
41
  if "entities" not in kwargs or "All" in kwargs["entities"]:
42
  kwargs["entities"] = None
43
 
44
- if st.session_state.excluded_words:
45
- logging.info(
46
- f"type of excluded_words_recognizer: {type(st.session_state.excluded_words)}\n"
47
- )
48
- logging.info(
49
- f"excluded words: {st.session_state.excluded_words.split(',')}\n"
50
- )
51
- excluded_words_recognizer = PatternRecognizer(supported_entity="MANUAL ADD",
52
- name="Excluded words recognizer",
53
- deny_list=st.session_state.excluded_words.split(','))
54
- analyzer_engine().registry.add_recognizer(excluded_words_recognizer)
 
55
 
56
  results = analyzer_engine().analyze(**kwargs)
57
-
58
- return results
59
 
60
  def annotate():
61
  text = st.session_state.text
@@ -98,64 +98,76 @@ def analyze_text():
98
  logging.info(f"This is the text being analysed: {st.session_state.text}")
99
  st.session_state.text_error = ""
100
  st.session_state.n_requests += 1
101
- analyze_results = analyze(
102
  text=st.session_state.text,
103
  entities=st_entities,
104
  language="en",
105
  return_decision_process=False,
106
  )
107
 
108
- # if st.session_state.excluded_words:
109
- # analyze_results = include_manual_input(analyze_results)
110
 
111
  if st.session_state.allowed_words:
112
- analyze_results = exclude_manual_input(analyze_results)
113
-
114
- st.session_state.analyze_results = analyze_results
115
 
116
  logging.info(
117
  f"analyse results: {st.session_state.analyze_results}\n"
118
  )
119
 
120
 
121
- # def include_manual_input(analyze_results):
122
- # analyze_results_extended=analyze_results
123
- # logging.info(
124
- # f"analyse results before adding extra words: {analyze_results}\n"
125
- # )
126
- # for word in st.session_state.excluded_words:
127
- # if word in st.session_state.text:
128
- # r = re.compile(word)
129
- # index_entries = [[m.start(),m.end()] for m in r.finditer(st.session_state.text)]
130
- # for entry in index_entries:
131
- # start=entry[0]
132
- # end=entry[1]
133
-
134
- # analyze_results_extended.append("type": "MANUAL ADD", "start": start, "end": end, "score": 1.0})
135
- # logging.info(
136
- # f"analyse results after adding allowed words: {analyze_results_extended}\n"
137
- # )
138
- # logging.info(
139
- # f"type of entries in results: {type(analyze_results[0])}\n"
140
- # )
141
- # return analyze_results_extended
142
-
143
-
144
- ## We might be able to create a new result from json https://github.com/microsoft/presidio/blob/07b854dd7ae247b916aef4d2adbb82f33bba7be8/presidio-analyzer/presidio_analyzer/recognizer_result.py#L72
145
-
146
- def exclude_manual_input(analyze_results):
147
- analyze_results_fltered=[]
 
 
 
 
 
 
 
 
 
 
 
148
  logging.info(
149
- f"analyse results before removing allowed words: {analyze_results}\n"
150
  )
151
- for token in analyze_results:
 
 
 
 
152
  if st.session_state.text[token.start:token.end] not in st.session_state.allowed_words:
153
  analyze_results_fltered.append(token)
154
  logging.info(
155
  f"analyse results after removing allowed words: {analyze_results_fltered}\n"
156
  )
157
- return analyze_results_fltered
158
-
159
 
160
  @st.cache(allow_output_mutation=True)
161
  def anonymizer_engine():
@@ -190,8 +202,7 @@ def anonymise_text():
190
  def clear_results():
191
  st.session_state.anon_results=""
192
  st.session_state.analyze_results=""
193
- # if not st.session_state.excluded_words:
194
- analyzer_engine().registry.remove_recognizer("Excluded words recognizer")
195
 
196
  #######################################
197
  #### Initialize "global" variables ####
 
3
  import re
4
  import logging
5
  from presidio_anonymizer import AnonymizerEngine
6
+ from presidio_analyzer import AnalyzerEngine, PatternRecognizer, RecognizerResult, EntityRecognizer
7
 
8
  from annotated_text import annotated_text
9
  from flair_recognizer import FlairRecognizer
 
41
  if "entities" not in kwargs or "All" in kwargs["entities"]:
42
  kwargs["entities"] = None
43
 
44
+ # if st.session_state.excluded_words:
45
+
46
+ # deny_list = [i.strip() for i in st.session_state.excluded_words.split(',')]
47
+
48
+ # logging.info(
49
+ # f"words excluded : {deny_list}\n"
50
+ # )
51
+
52
+ # excluded_words_recognizer = PatternRecognizer(supported_entity="MANUAL ADD",
53
+ # name="Excluded words recognizer",
54
+ # deny_list=deny_list)
55
+ # analyzer_engine().registry.add_recognizer(excluded_words_recognizer)
56
 
57
  results = analyzer_engine().analyze(**kwargs)
58
+ st.session_state.analyze_results = results
 
59
 
60
  def annotate():
61
  text = st.session_state.text
 
98
  logging.info(f"This is the text being analysed: {st.session_state.text}")
99
  st.session_state.text_error = ""
100
  st.session_state.n_requests += 1
101
+ analyze(
102
  text=st.session_state.text,
103
  entities=st_entities,
104
  language="en",
105
  return_decision_process=False,
106
  )
107
 
108
+ if st.session_state.excluded_words:
109
+ include_manual_input()
110
 
111
  if st.session_state.allowed_words:
112
+ exclude_manual_input()
 
 
113
 
114
  logging.info(
115
  f"analyse results: {st.session_state.analyze_results}\n"
116
  )
117
 
118
 
119
def include_manual_input():
    """Add every occurrence of the user's excluded words to the analysis results.

    Splits ``st.session_state.excluded_words`` on commas, searches
    ``st.session_state.text`` for each word (delimited by non-word characters
    or the text boundaries) and appends one ``RecognizerResult`` per match
    (entity type ``MANUALLY ADDED``, score 1.0) to
    ``st.session_state.analyze_results``.
    """

    def _deny_list_to_regex(deny_list):
        """
        Convert a list of words to a matching regex.

        :param deny_list: the list of words to detect
        :return: the regex of the words for detection
        """
        # Escape deny list elements as preparation for regex
        escaped_deny_list = [re.escape(element) for element in deny_list]
        return r"(?:^|(?<=\W))(" + "|".join(escaped_deny_list) + r")(?:(?=\W)|$)"

    # Ignore blank entries (e.g. from a trailing comma) so the pattern only
    # contains real words.
    deny_list = [
        word.strip()
        for word in st.session_state.excluded_words.split(',')
        if word.strip()
    ]
    if not deny_list:
        return

    deny_list_pattern = _deny_list_to_regex(deny_list)
    results = [
        RecognizerResult(
            entity_type='MANUALLY ADDED',
            start=match.start(),
            end=match.end(),
            score=1.0,
        )
        for match in re.finditer(deny_list_pattern, st.session_state.text)
        if match.end() > match.start()  # skip empty results
    ]
    results = EntityRecognizer.remove_duplicates(results)

    # list.extend() mutates in place and returns None, so its return value
    # must never be assigned back. Build the combined list explicitly; the
    # `or []` also covers the empty-string placeholder used when results
    # have been cleared.
    st.session_state.analyze_results = (
        list(st.session_state.analyze_results or []) + results
    )

    logging.info(
        f"analyse results after adding excluded words: {results}\n"
    )
160
+
161
def exclude_manual_input():
    """Drop analysis results whose matched text is found in the allowed words.

    Replaces ``st.session_state.analyze_results`` with only those results whose
    text span (``text[start:end]``) is not contained in
    ``st.session_state.allowed_words``.
    """
    source_text = st.session_state.text
    # NOTE(review): if allowed_words is a plain string, `not in` is a
    # substring test rather than an exact word match -- confirm intended.
    kept_results = [
        result
        for result in st.session_state.analyze_results
        if source_text[result.start:result.end] not in st.session_state.allowed_words
    ]
    logging.info(
        f"analyse results after removing allowed words: {kept_results}\n"
    )
    st.session_state.analyze_results = kept_results
 
171
 
172
  @st.cache(allow_output_mutation=True)
173
  def anonymizer_engine():
 
202
  def clear_results():
203
  st.session_state.anon_results=""
204
  st.session_state.analyze_results=""
205
+ # analyzer_engine().registry.remove_recognizer("Excluded words recognizer")
 
206
 
207
  #######################################
208
  #### Initialize "global" variables ####