import re

import gradio as gr
from gliner import GLiNER
from cerberus import Validator

# ----------------------------------------------------------------------------
# Load model + labels
# ----------------------------------------------------------------------------
model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")

with open("labels.txt", "r", encoding="utf-8") as f:
    labels = [line.strip() for line in f]
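# labels.txt is expected to hold one entity label per line; the exact label set
# is whatever the file ships with (names like "person" or "email" below are
# only an illustration, not guaranteed).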
# ----------------------------------------------------------------------------
# Simple Cerberus validation for incoming data
# ----------------------------------------------------------------------------
# We expect a dict with at least {"text": "<some string>"}
schema = {
    "text": {
        "type": "string",
        "required": True,   # the key must be present...
        "empty": False      # ...and must not be an empty string
    }
}
validator = Validator(schema)
def validate_input(data: dict) -> str:
    """Validate that data has a non-empty 'text' key."""
    if not validator.validate(data):
        # If invalid, raise an exception. You could handle this more gracefully if you like.
        raise ValueError(f"Invalid input data. Errors: {validator.errors}")
    return data["text"]
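# Usage sketch (not executed at import time):
#   validate_input({"text": "hello"})  # -> "hello"
#   validate_input({"text": ""})       # raises ValueError (empty strings fail the schema)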
# ----------------------------------------------------------------------------
# Core anonymize / de-anonymize logic (same as before)
# ----------------------------------------------------------------------------
def anonymize_text(text):
    """
    1) Detect PII using GLiNER.
    2) Replace each entity with a placeholder (<PII_LABEL_INDEX>).
    3) Return anonymized_text + entity_map.
    """
    entities = model.predict_entities(text, labels=labels, threshold=0.2)

    # Sort by start index to apply placeholders in correct order
    entities.sort(key=lambda e: e['start'])

    entity_map = {}  # e.g. {'PERSON': ['Alice', 'Bob']}
    anonymized_text = ""
    next_start = 0

    for entity in entities:
        label = entity['label'].replace(" ", "_").upper()
        original_text = entity['text']
        start_idx, end_idx = entity['start'], entity['end']

        if label not in entity_map:
            entity_map[label] = [original_text]
            idx = 1
        else:
            # If the same exact string repeats, reuse its earlier index
            if original_text in entity_map[label]:
                idx = entity_map[label].index(original_text) + 1
            else:
                entity_map[label].append(original_text)
                idx = len(entity_map[label])

        # Copy everything before this entity
        anonymized_text += text[next_start:start_idx]
        # Insert placeholder
        anonymized_text += f"<PII_{label}_{idx}>"
        next_start = end_idx

    # Remainder of the text after the last entity
    anonymized_text += text[next_start:]
    return anonymized_text, entity_map
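# Rough sketch of the expected shape of the output (actual entities depend on
# the GLiNER model, the labels in labels.txt, and the 0.2 threshold; "PERSON"
# is an assumed label name here):
#   anonymize_text("Alice met Bob.")
#   -> ("<PII_PERSON_1> met <PII_PERSON_2>.", {"PERSON": ["Alice", "Bob"]})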
def deanonymize_text(anonymized_response, entity_map):
    """
    Replace <PII_LABEL_INDEX> placeholders in anonymized_response
    with their original strings from entity_map.
    """
    def replace_match(match):
        label = match.group(1)    # e.g. "PERSON"
        idx_str = match.group(2)  # e.g. "1"
        idx = int(idx_str) - 1    # 1-based placeholder index -> 0-based list index
        if label in entity_map and 0 <= idx < len(entity_map[label]):
            return entity_map[label][idx]
        return match.group(0)  # If something is off, return the placeholder as-is

    pattern = r"<PII_(\w+)_(\d+)>"
    return re.sub(pattern, replace_match, anonymized_response)
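# Example (fully determined by the regex, no model involved):
#   deanonymize_text("Hi <PII_PERSON_1>, your order shipped.", {"PERSON": ["Alice"]})
#   -> "Hi Alice, your order shipped."
# Placeholders with no matching entry (e.g. <PII_PERSON_9>) are left as-is.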
# ----------------------------------------------------------------------------
# Gradio Interface
# ----------------------------------------------------------------------------
def anonymize_fn(original_text):
    # We'll wrap the text in a simple dict so we can pass it to our Cerberus validator:
    data = {"text": original_text}
    try:
        user_text = validate_input(data)
    except ValueError as e:
        # If invalid, show the error in the Gradio output
        return "", {}, f"Validation error: {str(e)}"

    anonymized, entities = anonymize_text(user_text)
    return anonymized, entities, "Anonymized successfully!"


def deanonymize_fn(anonymized_llm_response, entity_map):
    if not anonymized_llm_response.strip():
        return "", "Please provide an anonymized LLM response."
    if not entity_map:
        return "", "No entity map found; anonymize some text first."

    result = deanonymize_text(anonymized_llm_response, entity_map)
    return result, "De-anonymized successfully!"
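# Intended round trip (a sketch of the workflow, not enforced by the code):
# anonymize_fn stores the entity map in gr.State, the user sends the anonymized
# text to an external LLM, pastes the model's reply into the right-hand column,
# and deanonymize_fn swaps the placeholders back to the original strings.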
md_text = """# Anonymizing LLM Prompts

Paste text into the "Original Text" box to strip sensitive information; PII is detected with `gliner_multi_pii-v1`.

The demo is adapted from [Elara](https://github.com/amanvirparhar/elara) by amanvirparhar. If you like this one, give the original a star!
"""
with gr.Blocks() as demo:
    gr.Markdown(md_text)

    with gr.Row():
        with gr.Column():
            original_text = gr.Textbox(
                lines=6, label="Original Text (Anonymize)"
            )
            anonymized_text = gr.Textbox(
                lines=6, label="Anonymized Text", interactive=False
            )
            button_anon = gr.Button("Anonymize")

            # Hidden state to store the entity map
            entity_map_state = gr.State()

            message_out = gr.Textbox(label="Status", interactive=False)

            button_anon.click(
                anonymize_fn,
                inputs=[original_text],
                outputs=[anonymized_text, entity_map_state, message_out]
            )

        with gr.Column():
            anonymized_llm_response = gr.Textbox(
                lines=6, label="Anonymized LLM Response (Paste here)"
            )
            deanonymized_text = gr.Textbox(
                lines=6, label="De-anonymized LLM Response", interactive=False
            )
            button_deanon = gr.Button("De-anonymize")
            message_out_de = gr.Textbox(label="Status", interactive=False)

            button_deanon.click(
                deanonymize_fn,
                inputs=[anonymized_llm_response, entity_map_state],
                outputs=[deanonymized_text, message_out_de]
            )

if __name__ == "__main__":
    demo.launch()