import re

import gradio as gr
from gliner import GLiNER
from cerberus import Validator

# ----------------------------------------------------------------------------
# Load model + labels
# ----------------------------------------------------------------------------
model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")
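
# labels.txt is expected to hold one GLiNER entity label per line
# (e.g. "person", "email", "phone number" -- illustrative values; the real
# list is whatever file ships alongside this script).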
with open("labels.txt", "r", encoding="utf-8") as f:
    labels = [line.strip() for line in f if line.strip()]

# ----------------------------------------------------------------------------
# Simple Cerberus validation for incoming data
# ----------------------------------------------------------------------------
# We expect a dict with at least {"text": "<some string>"}
schema = {
    "text": {
        "type": "string",
        "required": True,
        "empty": False,
    }
}
validator = Validator(schema)
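
# Illustrative behavior of the schema above (not executed here):
#     validator.validate({"text": "hello"})  -> True
#     validator.validate({"text": ""})       -> False ("text" may not be empty)
#     validator.validate({})                 -> False ("text" is required)
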
def validate_input(data: dict) -> str:
    """Validate that data has a non-empty 'text' key."""
    if not validator.validate(data):
        # If invalid, raise an exception. You could handle this more gracefully if you like.
        raise ValueError(f"Invalid input data. Errors: {validator.errors}")
    return data["text"]

# ----------------------------------------------------------------------------
# Core anonymize / de-anonymize logic (same as before)
# ----------------------------------------------------------------------------
def anonymize_text(text):
    """
    1) Detect PII using GLiNER,
    2) Replace each entity with a placeholder (<PII_LABEL_INDEX>),
    3) Return anonymized_text + entity_map.
    """
    entities = model.predict_entities(text, labels=labels, threshold=0.2)

    # Sort by start index to apply placeholders in correct order
    entities.sort(key=lambda e: e['start'])

    entity_map = {}  # e.g. {'PERSON': ['Alice', 'Bob']}
    anonymized_text = ""
    next_start = 0

    for entity in entities:
        label = entity['label'].replace(" ", "_").upper()
        original_text = entity['text']
        start_idx, end_idx = entity['start'], entity['end']

        if label not in entity_map:
            entity_map[label] = [original_text]
            idx = 1
        else:
            # If the same exact string repeats, reuse its earlier index
            if original_text in entity_map[label]:
                idx = entity_map[label].index(original_text) + 1
            else:
                entity_map[label].append(original_text)
                idx = len(entity_map[label])

        # Copy everything before this entity
        anonymized_text += text[next_start:start_idx]
        # Insert placeholder
        anonymized_text += f"<PII_{label}_{idx}>"
        next_start = end_idx

    # Remainder of the text after the last entity
    anonymized_text += text[next_start:]
    return anonymized_text, entity_map
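
# Illustrative example for anonymize_text (actual output depends on the model's
# predictions): if GLiNER tags both names below as "person", then
#     anonymize_text("Alice emailed Bob about the invoice.")
# would return something like
#     ("<PII_PERSON_1> emailed <PII_PERSON_2> about the invoice.",
#      {"PERSON": ["Alice", "Bob"]})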

def deanonymize_text(anonymized_response, entity_map):
    """
    Replace <PII_LABEL_INDEX> placeholders in anonymized_response
    with their original strings from entity_map.
    """
    def replace_match(match):
        label = match.group(1)    # e.g. "PERSON"
        idx_str = match.group(2)  # e.g. "1"
        idx = int(idx_str) - 1    # 1-based index -> 0-based list index
        if label in entity_map and 0 <= idx < len(entity_map[label]):
            return entity_map[label][idx]
        return match.group(0)  # If something is off, return the placeholder as-is

    pattern = r"<PII_(\w+)_(\d+)>"
    return re.sub(pattern, replace_match, anonymized_response)
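
# Illustrative example for deanonymize_text: with entity_map = {"PERSON": ["Alice", "Bob"]},
#     deanonymize_text("Hi <PII_PERSON_2>, <PII_PERSON_1> says hello.", entity_map)
# returns "Hi Bob, Alice says hello."; unrecognized placeholders are left as-is.
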
# ----------------------------------------------------------------------------
# Gradio Interface
# ----------------------------------------------------------------------------
def anonymize_fn(original_text):
    # Wrap the text in a simple dict so we can pass it to our Cerberus validator:
    data = {"text": original_text}
    try:
        user_text = validate_input(data)
    except ValueError as e:
        # If invalid, show the error in the Gradio output
        return "", {}, f"Validation error: {str(e)}"

    anonymized, entities = anonymize_text(user_text)
    return anonymized, entities, "Anonymized successfully!"

def deanonymize_fn(anonymized_llm_response, entity_map):
    if not anonymized_llm_response.strip():
        return "", "Please provide an anonymized LLM response."
    if not entity_map:
        return "", "No entity map found; anonymize some text first."
    result = deanonymize_text(anonymized_llm_response, entity_map)
    return result, "De-anonymized successfully!"

md_text = """# Anonymizing LLM Prompts

Paste text into the "Original Text" box to strip out sensitive information; entities are detected with `gliner_multi_pii-v1`.

This demo is adapted from [Elara](https://github.com/amanvirparhar/elara) by amanvirparhar. If you like this one, give the original a star!
"""

with gr.Blocks() as demo:
    gr.Markdown(md_text)

    with gr.Row():
        with gr.Column():
            original_text = gr.Textbox(
                lines=6, label="Original Text (Anonymize)"
            )
            anonymized_text = gr.Textbox(
                lines=6, label="Anonymized Text", interactive=False
            )
            button_anon = gr.Button("Anonymize")

            # Hidden state to store the entity map
            entity_map_state = gr.State()
            message_out = gr.Textbox(label="Status", interactive=False)

            button_anon.click(
                anonymize_fn,
                inputs=[original_text],
                outputs=[anonymized_text, entity_map_state, message_out]
            )

        with gr.Column():
            anonymized_llm_response = gr.Textbox(
                lines=6, label="Anonymized LLM Response (Paste here)"
            )
            deanonymized_text = gr.Textbox(
                lines=6, label="De-anonymized LLM Response", interactive=False
            )
            button_deanon = gr.Button("De-anonymize")
            message_out_de = gr.Textbox(label="Status", interactive=False)

            button_deanon.click(
                deanonymize_fn,
                inputs=[anonymized_llm_response, entity_map_state],
                outputs=[deanonymized_text, message_out_de]
            )

if __name__ == "__main__":
    demo.launch()