trojblue's picture
adding space files
337fbc2
raw
history blame
6.18 kB
import re
import gradio as gr
from gliner import GLiNER
from cerberus import Validator
# ----------------------------------------------------------------------------
# Load model + labels
# ----------------------------------------------------------------------------
# Load the GLiNER PII model once at import time; every request reuses this instance.
model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")

# Candidate entity labels, one per line in labels.txt. Blank / whitespace-only
# lines are skipped so the model is never asked to predict an empty-string label
# (which would also produce a placeholder that cannot be mapped back).
with open("labels.txt", "r", encoding="utf-8") as f:
    labels = [line.strip() for line in f if line.strip()]
# ----------------------------------------------------------------------------
# Simple Cerberus validation for incoming data
# ----------------------------------------------------------------------------
# We expect a dict with at least {"text": "<some string>"}
# Cerberus schema for incoming payloads: "text" must be present, must be a
# string, and must not be empty. Cerberus treats keys as optional unless
# "required" is set, so without it an empty dict would pass validation and the
# later data["text"] lookup would fail with KeyError instead of ValueError.
schema = {
    "text": {
        "type": "string",
        "required": True,
        "empty": False,
    }
}
# Module-level validator instance used by validate_input().
validator = Validator(schema)
def validate_input(data: dict) -> str:
    """Check *data* against the module-level Cerberus ``validator``.

    Returns:
        The value stored under ``data["text"]`` when validation passes.

    Raises:
        ValueError: if validation fails; the message embeds ``validator.errors``.
    """
    passed = validator.validate(data)
    if passed:
        return data["text"]
    # Validation failed: surface the validator's error report to the caller.
    raise ValueError(f"Invalid input data. Errors: {validator.errors}")
# ----------------------------------------------------------------------------
# Core anonymize / de-anonymize logic
# ----------------------------------------------------------------------------
def _substitute_entities(text, entities):
    """Replace each entity span in *text* with a ``<PII_LABEL_INDEX>`` placeholder.

    *entities* is an iterable of dicts with the keys ``'start'``, ``'end'``,
    ``'label'`` and ``'text'``. Returns ``(anonymized_text, entity_map)`` where
    ``entity_map`` maps each normalized label to the list of original strings,
    so that placeholder index N refers to list position N - 1.
    """
    entity_map = {}  # e.g. {'PERSON': ['Alice', 'Bob']}
    pieces = []      # output fragments, joined once at the end
    cursor = 0       # first character of `text` not yet copied to the output

    # Process spans left to right; on equal starts the longer span goes first
    # so the larger piece of sensitive text is the one that gets masked.
    for entity in sorted(entities, key=lambda e: (e['start'], -e['end'])):
        start_idx, end_idx = entity['start'], entity['end']
        if start_idx < cursor:
            # Span begins inside text that was already replaced. Emitting it
            # would move the cursor backwards and duplicate characters.
            continue

        # Map every non-word character (spaces, apostrophes, hyphens, ...) to
        # "_" so the label always fits inside a <PII_LABEL_INDEX> placeholder
        # that a word-character regex can parse back.
        label = re.sub(r"\W", "_", entity['label']).upper()
        original_text = entity['text']

        known = entity_map.setdefault(label, [])
        if original_text in known:
            # Same exact string seen before: reuse its 1-based index.
            idx = known.index(original_text) + 1
        else:
            known.append(original_text)
            idx = len(known)

        pieces.append(text[cursor:start_idx])   # untouched text before the entity
        pieces.append(f"<PII_{label}_{idx}>")   # the placeholder itself
        cursor = end_idx

    pieces.append(text[cursor:])  # remainder after the last entity
    return "".join(pieces), entity_map


def anonymize_text(text):
    """
    1) Detect PII in *text* with the module-level GLiNER ``model``
       (using ``labels`` and a score threshold of 0.2),
    2) Replace each detected entity with a placeholder (<PII_LABEL_INDEX>),
    3) Return (anonymized_text, entity_map).

    Labels are upper-cased and every non-word character is turned into "_".
    Identical strings under the same label share one index. Spans that overlap
    an earlier span are skipped.
    """
    entities = model.predict_entities(text, labels=labels, threshold=0.2)
    return _substitute_entities(text, entities)
def deanonymize_text(anonymized_response, entity_map):
    """
    Replace <PII_LABEL_INDEX> placeholders in anonymized_response
    with their original strings from entity_map.

    entity_map maps a label to a list of original strings; INDEX is 1-based
    into that list. Placeholders whose label is unknown or whose index is out
    of range are left in the text unchanged.
    """
    def replace_match(match):
        label = match.group(1)              # e.g. "PERSON"
        idx = int(match.group(2)) - 1       # 1-based index -> 0-based list index
        originals = entity_map.get(label, ())
        if 0 <= idx < len(originals):
            return originals[idx]
        return match.group(0)  # If something is off, return the placeholder as-is

    # The label part accepts anything except angle brackets (not just \w), so
    # labels containing apostrophes or hyphens - e.g. DRIVER'S_LICENSE_NUMBER -
    # are restored too. Excluding "<" and ">" keeps a match inside one placeholder.
    pattern = r"<PII_([^<>]+)_(\d+)>"
    return re.sub(pattern, replace_match, anonymized_response)
# ----------------------------------------------------------------------------
# Gradio Interface
# ----------------------------------------------------------------------------
def anonymize_fn(original_text):
    """Gradio callback for the "Anonymize" button.

    Returns a 3-tuple ``(anonymized_text, entity_map, status_message)``.
    When validation fails the first two items are ``""`` and ``{}`` and the
    status message carries the validation error.
    """
    # validate_input() expects a dict payload, so wrap the raw textbox value.
    payload = {"text": original_text}
    try:
        user_text = validate_input(payload)
    except ValueError as err:
        # Report the problem through the status box instead of raising.
        return "", {}, f"Validation error: {str(err)}"
    else:
        anonymized, entities = anonymize_text(user_text)
        return anonymized, entities, "Anonymized successfully!"
def deanonymize_fn(anonymized_llm_response, entity_map):
    """Gradio callback for the "De-anonymize" button.

    Returns a 2-tuple ``(deanonymized_text, status_message)``. If the response
    is missing (``None``), empty or whitespace-only, or if no entity map is
    available yet, the text is ``""`` and the status message explains why.
    """
    # `not value` covers None as well as "", so a missing value produces the
    # same friendly message instead of an AttributeError on .strip().
    if not anonymized_llm_response or not anonymized_llm_response.strip():
        return "", "Please provide an anonymized LLM response."
    if not entity_map:
        return "", "No entity map found; anonymize some text first."
    result = deanonymize_text(anonymized_llm_response, entity_map)
    return result, "De-anonymized successfully!"
# Markdown shown at the top of the demo page; one list item per rendered line,
# with a trailing empty item so the text ends in a newline.
md_text = "\n".join([
    "# Anonymizing LLM Prompts",
    'Paste text into "Original Text" section to remove sensitive information, using `gliner_multi_pii-v1` for recognition.',
    "The demo is adapted from [Elara](https://github.com/amanvirparhar/elara) by amanvirparhar. If you like this one, give the original a star!",
    "",
])
# Page layout: a Markdown header followed by one row holding two columns,
# one for anonymizing and one for de-anonymizing.
with gr.Blocks() as demo:
    gr.Markdown(md_text)

    with gr.Row():
        # Anonymize column: raw text in, anonymized text out.
        with gr.Column():
            original_text = gr.Textbox(label="Original Text (Anonymize)", lines=6)
            anonymized_text = gr.Textbox(
                label="Anonymized Text", lines=6, interactive=False
            )
            button_anon = gr.Button("Anonymize")

            # Non-visible state component: written by the anonymize step and
            # read back by the de-anonymize step below.
            entity_map_state = gr.State()
            message_out = gr.Textbox(label="Status", interactive=False)

            button_anon.click(
                anonymize_fn,
                inputs=[original_text],
                outputs=[anonymized_text, entity_map_state, message_out],
            )

        # De-anonymize column: pasted LLM answer in, restored text out.
        with gr.Column():
            anonymized_llm_response = gr.Textbox(
                label="Anonymized LLM Response (Paste here)", lines=6
            )
            deanonymized_text = gr.Textbox(
                label="De-anonymized LLM Response", lines=6, interactive=False
            )
            button_deanon = gr.Button("De-anonymize")
            message_out_de = gr.Textbox(label="Status", interactive=False)

            button_deanon.click(
                deanonymize_fn,
                inputs=[anonymized_llm_response, entity_map_state],
                outputs=[deanonymized_text, message_out_de],
            )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()