Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -4,6 +4,7 @@ from transformers import pipeline, AutoModelForTokenClassification, AutoTokenize
|
|
4 |
import PyPDF2
|
5 |
import docx
|
6 |
import io
|
|
|
7 |
|
8 |
def chunk_text(text, chunk_size=128):
|
9 |
words = text.split()
|
@@ -92,9 +93,10 @@ def entity_comb(output):
|
|
92 |
output_comb.append(entity)
|
93 |
return output_comb
|
94 |
|
95 |
-
def create_mask_dict(entities):
|
96 |
mask_dict = {}
|
97 |
entity_counters = {}
|
|
|
98 |
for entity in entities:
|
99 |
if entity['entity_group'] not in ['CARDINAL', 'EVENT']:
|
100 |
if entity['word'] not in mask_dict:
|
@@ -103,6 +105,11 @@ def create_mask_dict(entities):
|
|
103 |
else:
|
104 |
entity_counters[entity['entity_group']] += 1
|
105 |
mask_dict[entity['word']] = f"{entity['entity_group']}_{entity_counters[entity['entity_group']]}"
|
|
|
|
|
|
|
|
|
|
|
106 |
return mask_dict
|
107 |
|
108 |
def replace_words_in_text(input_text, entities):
|
@@ -111,6 +118,34 @@ def replace_words_in_text(input_text, entities):
|
|
111 |
input_text = input_text.replace(word, replacement)
|
112 |
return input_text
|
113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
Run_Button = st.button("Run")
|
115 |
|
116 |
if Run_Button and input_text:
|
@@ -136,13 +171,16 @@ if Run_Button and input_text:
|
|
136 |
# Combine entities
|
137 |
output_comb = entity_comb(all_outputs)
|
138 |
|
|
|
|
|
|
|
139 |
# Create masked text and masking dictionary
|
140 |
-
masked_text = replace_words_in_text(
|
141 |
-
mask_dict = create_mask_dict(output_comb)
|
142 |
|
143 |
# Display the masked text and masking dictionary
|
144 |
st.subheader("Masked Text Preview")
|
145 |
st.text(masked_text)
|
146 |
|
147 |
st.subheader("Masking Dictionary")
|
148 |
-
st.json(mask_dict)
|
|
|
4 |
import PyPDF2
|
5 |
import docx
|
6 |
import io
|
7 |
+
import re
|
8 |
|
9 |
def chunk_text(text, chunk_size=128):
|
10 |
words = text.split()
|
|
|
93 |
output_comb.append(entity)
|
94 |
return output_comb
|
95 |
|
96 |
+
def create_mask_dict(entities, additional_masks=None):
|
97 |
mask_dict = {}
|
98 |
entity_counters = {}
|
99 |
+
|
100 |
for entity in entities:
|
101 |
if entity['entity_group'] not in ['CARDINAL', 'EVENT']:
|
102 |
if entity['word'] not in mask_dict:
|
|
|
105 |
else:
|
106 |
entity_counters[entity['entity_group']] += 1
|
107 |
mask_dict[entity['word']] = f"{entity['entity_group']}_{entity_counters[entity['entity_group']]}"
|
108 |
+
|
109 |
+
if additional_masks:
|
110 |
+
for word, replacement in additional_masks.items():
|
111 |
+
mask_dict[word] = replacement
|
112 |
+
|
113 |
return mask_dict
|
114 |
|
115 |
def replace_words_in_text(input_text, entities):
|
|
|
118 |
input_text = input_text.replace(word, replacement)
|
119 |
return input_text
|
120 |
|
121 |
+
# Function to mask email, phone, and address patterns
|
122 |
+
def mask_patterns(text):
    """Mask e-mail addresses, Turkish phone numbers and simple street addresses.

    The patterns are applied one after another (e-mail, phone, address), each
    on the text already masked by the previous ones. A span that was masked
    as a phone number is therefore never matched again by the much looser
    address pattern.

    Args:
        text: Input string to scan.

    Returns:
        A ``(masked_text, masks)`` tuple. ``masked_text`` is ``text`` with
        every match replaced by its placeholder; ``masks`` maps each original
        matched substring to the placeholder that replaced it.
    """
    # Order matters: the most specific patterns run first.
    rules = (
        # Email pattern
        (r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "<EMAIL>"),
        # Phone pattern (Turkish)
        (r"\+90\d{10}|\b\d{3}[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}\b", "<PHONE>"),
        # Address pattern (basic example, can be enhanced). The group is
        # non-capturing so the whole match is recorded; a capturing group
        # would yield only its last repetition (often the empty string),
        # and replacing "" corrupts the entire text.
        (r"\d{1,5}\s\w+(?:\s\w+)*", "<ADDRESS>"),
    )

    masks = {}
    for pattern, placeholder in rules:
        # Bind the placeholder as a default to avoid late-binding surprises.
        def record(match, placeholder=placeholder):
            masks[match.group(0)] = placeholder
            return placeholder

        text = re.sub(pattern, record, text)

    return text, masks
|
148 |
+
|
149 |
Run_Button = st.button("Run")
|
150 |
|
151 |
if Run_Button and input_text:
|
|
|
171 |
# Combine entities
|
172 |
output_comb = entity_comb(all_outputs)
|
173 |
|
174 |
+
# Mask emails, phone numbers, and addresses
|
175 |
+
masked_text, additional_masks = mask_patterns(input_text)
|
176 |
+
|
177 |
# Create masked text and masking dictionary
|
178 |
+
masked_text = replace_words_in_text(masked_text, output_comb)
|
179 |
+
mask_dict = create_mask_dict(output_comb, additional_masks)
|
180 |
|
181 |
# Display the masked text and masking dictionary
|
182 |
st.subheader("Masked Text Preview")
|
183 |
st.text(masked_text)
|
184 |
|
185 |
st.subheader("Masking Dictionary")
|
186 |
+
st.json(mask_dict)
|