umarigan committed on
Commit
9de7f58
1 Parent(s): 418cafa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -4
app.py CHANGED
@@ -4,6 +4,7 @@ from transformers import pipeline, AutoModelForTokenClassification, AutoTokenize
4
  import PyPDF2
5
  import docx
6
  import io
 
7
 
8
  def chunk_text(text, chunk_size=128):
9
  words = text.split()
@@ -92,9 +93,10 @@ def entity_comb(output):
92
  output_comb.append(entity)
93
  return output_comb
94
 
95
- def create_mask_dict(entities):
96
  mask_dict = {}
97
  entity_counters = {}
 
98
  for entity in entities:
99
  if entity['entity_group'] not in ['CARDINAL', 'EVENT']:
100
  if entity['word'] not in mask_dict:
@@ -103,6 +105,11 @@ def create_mask_dict(entities):
103
  else:
104
  entity_counters[entity['entity_group']] += 1
105
  mask_dict[entity['word']] = f"{entity['entity_group']}_{entity_counters[entity['entity_group']]}"
 
 
 
 
 
106
  return mask_dict
107
 
108
  def replace_words_in_text(input_text, entities):
@@ -111,6 +118,34 @@ def replace_words_in_text(input_text, entities):
111
  input_text = input_text.replace(word, replacement)
112
  return input_text
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  Run_Button = st.button("Run")
115
 
116
  if Run_Button and input_text:
@@ -136,13 +171,16 @@ if Run_Button and input_text:
136
  # Combine entities
137
  output_comb = entity_comb(all_outputs)
138
 
 
 
 
139
  # Create masked text and masking dictionary
140
- masked_text = replace_words_in_text(input_text, output_comb)#create_masked_text(input_text, output_comb)
141
- mask_dict = create_mask_dict(output_comb)
142
 
143
  # Display the masked text and masking dictionary
144
  st.subheader("Masked Text Preview")
145
  st.text(masked_text)
146
 
147
  st.subheader("Masking Dictionary")
148
- st.json(mask_dict)
 
4
  import PyPDF2
5
  import docx
6
  import io
7
+ import re
8
 
9
  def chunk_text(text, chunk_size=128):
10
  words = text.split()
 
93
  output_comb.append(entity)
94
  return output_comb
95
 
96
+ def create_mask_dict(entities, additional_masks=None):
97
  mask_dict = {}
98
  entity_counters = {}
99
+
100
  for entity in entities:
101
  if entity['entity_group'] not in ['CARDINAL', 'EVENT']:
102
  if entity['word'] not in mask_dict:
 
105
  else:
106
  entity_counters[entity['entity_group']] += 1
107
  mask_dict[entity['word']] = f"{entity['entity_group']}_{entity_counters[entity['entity_group']]}"
108
+
109
+ if additional_masks:
110
+ for word, replacement in additional_masks.items():
111
+ mask_dict[word] = replacement
112
+
113
  return mask_dict
114
 
115
  def replace_words_in_text(input_text, entities):
 
118
  input_text = input_text.replace(word, replacement)
119
  return input_text
120
 
121
+ # Function to mask email, phone, and address patterns
122
def mask_patterns(text):
    """Mask e-mail addresses, Turkish phone numbers and simple addresses.

    The patterns are applied one after another, each on the output of the
    previous one, so a span that was already masked (for example a phone
    number) cannot be matched again by the much broader address pattern.

    Args:
        text: The input string to scan.

    Returns:
        A ``(masked_text, masks)`` tuple. ``masked_text`` is ``text`` with
        every match replaced by its placeholder; ``masks`` maps each original
        matched string to the placeholder that replaced it (``"<EMAIL>"``,
        ``"<PHONE>"`` or ``"<ADDRESS>"``).
    """
    masks = {}
    if not text:
        return text, masks

    # Ordered from most specific to least specific. The address pattern is a
    # deliberately simple heuristic (a 1-5 digit number followed by words).
    # Its repetition group is non-capturing: with a capturing group,
    # ``re.findall`` returned only the last captured word (or an empty
    # string), which masked the wrong text and could insert the placeholder
    # between every character of the input.
    patterns = (
        (r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "<EMAIL>"),
        (r"\+90\d{10}|\b\d{3}[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}\b", "<PHONE>"),
        (r"\d{1,5}\s\w+(?:\s\w+)*", "<ADDRESS>"),
    )

    for pattern, placeholder in patterns:
        # Bind the placeholder as a default argument so each callback keeps
        # its own value instead of the loop variable's final one.
        def _record(match, placeholder=placeholder):
            masks[match.group(0)] = placeholder
            return placeholder

        # Substituting through the regex (rather than str.replace on the
        # collected strings) replaces exactly the spans that matched.
        text = re.sub(pattern, _record, text)

    return text, masks
148
+
149
  Run_Button = st.button("Run")
150
 
151
  if Run_Button and input_text:
 
171
  # Combine entities
172
  output_comb = entity_comb(all_outputs)
173
 
174
+ # Mask emails, phone numbers, and addresses
175
+ masked_text, additional_masks = mask_patterns(input_text)
176
+
177
  # Create masked text and masking dictionary
178
+ masked_text = replace_words_in_text(masked_text, output_comb)
179
+ mask_dict = create_mask_dict(output_comb, additional_masks)
180
 
181
  # Display the masked text and masking dictionary
182
  st.subheader("Masked Text Preview")
183
  st.text(masked_text)
184
 
185
  st.subheader("Masking Dictionary")
186
+ st.json(mask_dict)