File size: 6,179 Bytes
337fbc2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import re
import gradio as gr
from gliner import GLiNER
from cerberus import Validator

# ----------------------------------------------------------------------------
# Load model + labels
# ----------------------------------------------------------------------------

# Load the pretrained GLiNER PII model once, at import time.
model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")

# labels.txt holds one entity label per line. Blank (or whitespace-only) lines
# are skipped so that no empty-string label is ever handed to the model.
with open("labels.txt", "r", encoding="utf-8") as f:
    labels = [line.strip() for line in f if line.strip()]

# ----------------------------------------------------------------------------
# Simple Cerberus validation for incoming data
# ----------------------------------------------------------------------------

# Every payload must carry a non-empty string under the "text" key.
# "required" is spelled out because Cerberus treats schema keys as optional
# unless told otherwise: without it an empty dict would pass validation and the
# later data["text"] lookup would fail with KeyError instead of ValueError.
schema = {
    "text": {
        "type": "string",
        "required": True,
        "empty": False,
    }
}

# Shared validator instance used by validate_input().
validator = Validator(schema)


def validate_input(data: dict) -> str:
    """Check ``data`` against the module-level Cerberus validator.

    Args:
        data: Payload to check; on success its ``"text"`` entry is returned.

    Returns:
        The value stored under ``data["text"]``.

    Raises:
        ValueError: If the validator rejects ``data``; the message embeds the
            validator's error report.
    """
    if validator.validate(data):
        return data["text"]
    # Rejected: surface the validator's error report to the caller.
    raise ValueError(f"Invalid input data. Errors: {validator.errors}")

# ----------------------------------------------------------------------------
# Core anonymize / de-anonymize logic (same as before)
# ----------------------------------------------------------------------------


def anonymize_text(text):
    """
    Replace PII found in ``text`` with numbered placeholders.

    1) Detect PII using GLiNER (module-level ``model`` and ``labels``),
    2) Replace each entity with a placeholder (<PII_LABEL_INDEX>); the same
       string under the same label always reuses the same index,
    3) Return anonymized_text + entity_map

    Args:
        text: The raw text to scan.

    Returns:
        A tuple ``(anonymized_text, entity_map)``. ``entity_map`` maps each
        normalised label (spaces -> underscores, upper-cased) to the list of
        original strings; placeholder index ``i`` refers to
        ``entity_map[label][i - 1]``.
    """
    entities = model.predict_entities(text, labels=labels, threshold=0.2)
    # Sort by start index so the text can be rebuilt left to right; on equal
    # starts the longest span comes first so it wins over spans nested in it.
    entities.sort(key=lambda e: (e['start'], -e['end']))

    entity_map = {}  # e.g. {'PERSON': ['Alice', 'Bob']}
    pieces = []      # output fragments, joined once at the end
    next_start = 0   # first character of `text` not yet copied or replaced

    for entity in entities:
        label = entity['label'].replace(" ", "_").upper()
        original_text = entity['text']
        start_idx, end_idx = entity['start'], entity['end']

        # NOTE(review): the model presumably returns non-overlapping spans in
        # its default configuration - confirm. The two guards below keep the
        # cursor from ever moving backwards if it does not, which would
        # otherwise re-emit text that was already replaced.
        if end_idx <= next_start:
            # Entirely inside a span that already has a placeholder.
            continue
        if start_idx < next_start:
            # Partial overlap: only replace the part not covered yet, and
            # remember exactly that part so de-anonymizing restores the text.
            start_idx = next_start
            original_text = text[start_idx:end_idx]

        known = entity_map.setdefault(label, [])
        if original_text in known:
            # If same exact string repeated, use the same index as before
            idx = known.index(original_text) + 1
        else:
            known.append(original_text)
            idx = len(known)

        # Copy everything before this entity, then the placeholder itself
        pieces.append(text[next_start:start_idx])
        pieces.append(f"<PII_{label}_{idx}>")
        next_start = end_idx

    # Remainder of the text after last entity
    pieces.append(text[next_start:])
    return "".join(pieces), entity_map


def deanonymize_text(anonymized_response, entity_map):
    """
    Replace <PII_LABEL_INDEX> placeholders in anonymized_response
    with their original strings from entity_map.

    Args:
        anonymized_response: Text that may contain placeholders.
        entity_map: Mapping of label -> list of original strings, where
            placeholder index ``i`` refers to ``entity_map[label][i - 1]``.

    Returns:
        The text with every resolvable placeholder substituted. Placeholders
        whose label is unknown or whose index is out of range are left as-is.
    """

    def replace_match(match):
        label = match.group(1)  # e.g. "PERSON"
        idx_str = match.group(2)  # e.g. "1"
        idx = int(idx_str) - 1    # 1-based index -> 0-based list index

        if label in entity_map and 0 <= idx < len(entity_map[label]):
            return entity_map[label][idx]
        return match.group(0)  # If something is off, return the placeholder as-is

    # Labels are produced by upper-casing and replacing spaces only, so they
    # can still contain punctuation (e.g. "DRIVER'S_LICENSE_NUMBER", "E-MAIL").
    # Accept anything except angle brackets; the greedy label group backtracks
    # to the last "_<digits>>" so labels with underscores keep working.
    pattern = r"<PII_([^<>]+)_(\d+)>"
    return re.sub(pattern, replace_match, anonymized_response)

# ----------------------------------------------------------------------------
# Gradio Interface
# ----------------------------------------------------------------------------

def anonymize_fn(original_text):
    """Gradio callback: validate the textbox content, then anonymize it.

    Args:
        original_text: Raw value of the "Original Text" textbox.

    Returns:
        A 3-tuple ``(anonymized_text, entity_map, status_message)``. When
        validation fails the first two items are ``""`` and ``{}`` and the
        status message carries the validation error.
    """
    try:
        # validate_input() works on a dict, so wrap the raw value first.
        checked_text = validate_input({"text": original_text})
    except ValueError as err:
        # Report the problem through the status box instead of raising.
        return "", {}, f"Validation error: {str(err)}"
    else:
        redacted, found_entities = anonymize_text(checked_text)
        return redacted, found_entities, "Anonymized successfully!"


def deanonymize_fn(anonymized_llm_response, entity_map):
    """Gradio callback: restore the original strings in an LLM response.

    Args:
        anonymized_llm_response: Text pasted by the user, expected to contain
            <PII_LABEL_INDEX> placeholders. ``None`` is treated like an empty
            string.
        entity_map: The label -> originals mapping stored by a previous
            anonymize step; ``None`` or empty if none has been run yet.

    Returns:
        A 2-tuple ``(deanonymized_text, status_message)``. The text is ``""``
        when either input is missing.
    """
    # Check for None before calling .strip(): a missing value must yield the
    # friendly status message rather than an AttributeError.
    if not anonymized_llm_response or not anonymized_llm_response.strip():
        return "", "Please provide an anonymized LLM response."
    if not entity_map:
        return "", "No entity map found; anonymize some text first."

    result = deanonymize_text(anonymized_llm_response, entity_map)
    return result, "De-anonymized successfully!"


# Markdown shown at the top of the page (rendered via gr.Markdown below).
md_text = """# Anonymizing LLM Prompts

Paste text into "Original Text" section to remove sensitive information, using `gliner_multi_pii-v1` for recognition.

The demo is adapted from [Elara](https://github.com/amanvirparhar/elara) by amanvirparhar. If you like this one, give the original a star! 
"""

# Two-column layout: the left column anonymizes text, the right column
# de-anonymizes an LLM response using the entity map produced on the left.
with gr.Blocks() as demo:
    gr.Markdown(md_text)

    with gr.Row():
        # Left column: original text in, anonymized text + status out.
        with gr.Column():
            original_text = gr.Textbox(
                lines=6, label="Original Text (Anonymize)"
            )
            anonymized_text = gr.Textbox(
                lines=6, label="Anonymized Text", interactive=False
            )
            button_anon = gr.Button("Anonymize")

            # Hidden state to store the entity map
            # (written by anonymize_fn, read back by deanonymize_fn).
            entity_map_state = gr.State()

            message_out = gr.Textbox(label="Status", interactive=False)

            # anonymize_fn returns (anonymized text, entity map, status),
            # matching the three outputs in order.
            button_anon.click(
                anonymize_fn,
                inputs=[original_text],
                outputs=[anonymized_text, entity_map_state, message_out]
            )

        # Right column: pasted LLM response in, de-anonymized text + status out.
        with gr.Column():
            anonymized_llm_response = gr.Textbox(
                lines=6, label="Anonymized LLM Response (Paste here)"
            )
            deanonymized_text = gr.Textbox(
                lines=6, label="De-anonymized LLM Response", interactive=False
            )
            button_deanon = gr.Button("De-anonymize")

            message_out_de = gr.Textbox(label="Status", interactive=False)

            # deanonymize_fn receives the pasted text plus the stored entity
            # map and returns (de-anonymized text, status).
            button_deanon.click(
                deanonymize_fn,
                inputs=[anonymized_llm_response, entity_map_state],
                outputs=[deanonymized_text, message_out_de]
            )

# Start the Gradio server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()