Update app.py
#2
by
lhoestq
HF staff
- opened
- analyze.py +5 -6
- app.py +44 -13
analyze.py
CHANGED
@@ -46,11 +46,10 @@ def batched(
|
|
46 |
|
47 |
|
48 |
def mask(text: str) -> str:
|
49 |
-
return
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
# )
|
54 |
|
55 |
|
56 |
def get_strings(row_content: Any) -> str:
|
@@ -101,7 +100,7 @@ def analyze(
|
|
101 |
]
|
102 |
return [
|
103 |
PresidioEntity(
|
104 |
-
text=
|
105 |
type=recognizer_result.entity_type,
|
106 |
row_idx=row_idx,
|
107 |
column_name=column_name,
|
|
|
46 |
|
47 |
|
48 |
def mask(text: str) -> str:
    """Return *text* with most characters of every word replaced by ``*``.

    The text is split on single spaces. For each word, the leading
    ``min(2, len(word) - 1)`` characters are kept verbatim (so at least one
    character of every non-empty word is always masked) and every remaining
    letter or digit is replaced by ``*``. Punctuation and other
    non-alphanumeric characters are left untouched so the overall shape of
    the value (e.g. ``@`` and ``.`` in an e-mail address) stays recognisable.

    Letters and digits are detected with ``str.isalnum`` so that non-ASCII
    characters (accented letters, non-Latin scripts) are masked as well,
    instead of leaking through an ASCII-only character class.
    """

    def _mask_word(word: str) -> str:
        # Clamp at 0: for an empty word ``len(word) - 1`` is -1, which must
        # not be used as a (negative) slice bound.
        keep = max(0, min(2, len(word) - 1))
        hidden = "".join("*" if char.isalnum() else char for char in word[keep:])
        return word[:keep] + hidden

    # Splitting on " " (not on arbitrary whitespace) preserves runs of
    # spaces exactly when the words are joined back together.
    return " ".join(_mask_word(word) for word in text.split(" "))
|
|
|
53 |
|
54 |
|
55 |
def get_strings(row_content: Any) -> str:
|
|
|
100 |
]
|
101 |
return [
|
102 |
PresidioEntity(
|
103 |
+
text=texts[i * len(scanned_columns) + j][recognizer_result.start : recognizer_result.end],
|
104 |
type=recognizer_result.entity_type,
|
105 |
row_idx=row_idx,
|
106 |
column_name=column_name,
|
app.py
CHANGED
@@ -1,4 +1,6 @@
|
|
1 |
-
from
|
|
|
|
|
2 |
from typing import Any, Iterable, TypeVar
|
3 |
|
4 |
import gradio as gr
|
@@ -7,7 +9,7 @@ import pandas as pd
|
|
7 |
from datasets import Features
|
8 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
9 |
|
10 |
-
from analyze import analyzer, get_column_description, get_columns_with_strings, presidio_scan_entities
|
11 |
|
12 |
MAX_ROWS = 100
|
13 |
T = TypeVar("T")
|
@@ -24,6 +26,22 @@ DEFAULT_PRESIDIO_ENTITIES = sorted([
|
|
24 |
'IBAN_CODE',
|
25 |
'EMAIL',
|
26 |
])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
|
29 |
batch_size = 100
|
@@ -47,7 +65,16 @@ class track_iter:
|
|
47 |
self.next_idx += 1
|
48 |
yield item
|
49 |
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
|
52 |
if "error" in info_resp:
|
53 |
yield "β " + info_resp["error"], pd.DataFrame()
|
@@ -65,10 +92,12 @@ def analyze_dataset(dataset: str, enabled_presidio_entities: str) -> pd.DataFram
|
|
65 |
for presidio_entity in presidio_scan_entities(
|
66 |
rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
|
67 |
):
|
|
|
|
|
68 |
if presidio_entity["type"] in enabled_presidio_entities:
|
69 |
presidio_entities.append(presidio_entity)
|
70 |
-
yield
|
71 |
-
yield
|
72 |
|
73 |
with gr.Blocks() as demo:
|
74 |
gr.Markdown("# Scan datasets using Presidio")
|
@@ -85,26 +114,28 @@ with gr.Blocks() as demo:
|
|
85 |
value=DEFAULT_PRESIDIO_ENTITIES,
|
86 |
interactive=True,
|
87 |
),
|
|
|
88 |
]
|
89 |
button = gr.Button("Run Presidio Scan")
|
90 |
outputs = [
|
91 |
-
gr.
|
92 |
gr.DataFrame(),
|
93 |
]
|
94 |
button.click(analyze_dataset, inputs, outputs)
|
95 |
gr.Examples(
|
96 |
[
|
97 |
-
["microsoft/orca-math-word-problems-200k"
|
98 |
-
["tatsu-lab/alpaca"
|
99 |
-
["Anthropic/hh-rlhf"
|
100 |
-
["OpenAssistant/oasst1"
|
101 |
-
["sidhq/email-thread-summary"
|
102 |
-
["lhoestq/fake_name_and_ssn"
|
103 |
],
|
104 |
inputs,
|
105 |
outputs,
|
106 |
fn=analyze_dataset,
|
107 |
-
run_on_click=True
|
|
|
108 |
)
|
109 |
|
110 |
demo.launch()
|
|
|
1 |
+
from collections import Counter
|
2 |
+
from itertools import count, groupby, islice
|
3 |
+
from operator import itemgetter
|
4 |
from typing import Any, Iterable, TypeVar
|
5 |
|
6 |
import gradio as gr
|
|
|
9 |
from datasets import Features
|
10 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
11 |
|
12 |
+
from analyze import PresidioEntity, analyzer, get_column_description, get_columns_with_strings, mask, presidio_scan_entities
|
13 |
|
14 |
MAX_ROWS = 100
|
15 |
T = TypeVar("T")
|
|
|
26 |
'IBAN_CODE',
|
27 |
'EMAIL',
|
28 |
])
|
29 |
+
WARNING_PRESIDIO_ENTITIES = sorted([
|
30 |
+
'PHONE_NUMBER',
|
31 |
+
'US_PASSPORT',
|
32 |
+
'EMAIL_ADDRESS',
|
33 |
+
'IP_ADDRESS',
|
34 |
+
'US_BANK_NUMBER',
|
35 |
+
'IBAN_CODE',
|
36 |
+
'EMAIL',
|
37 |
+
])
|
38 |
+
ALERT_PRESIDIO_ENTITIES = sorted([
|
39 |
+
'CREDIT_CARD',
|
40 |
+
'US_SSN',
|
41 |
+
'US_PASSPORT',
|
42 |
+
'US_BANK_NUMBER',
|
43 |
+
'IBAN_CODE',
|
44 |
+
])
|
45 |
|
46 |
def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
|
47 |
batch_size = 100
|
|
|
65 |
self.next_idx += 1
|
66 |
yield item
|
67 |
|
68 |
+
|
69 |
+
def presidio_report(presidio_entities: "list[PresidioEntity]", next_row_idx: int, num_rows: int) -> dict[str, float]:
    """Build a ``label -> fraction of num_rows`` summary of the scan.

    Args:
        presidio_entities: Entities found so far; each item is indexed by
            ``"row_idx"`` and ``"type"``.
        next_row_idx: Row counter reported by the caller; it is compared with
            ``num_rows`` to decide whether the scan is finished and is used
            as the count behind the title entry.
        num_rows: Total number of rows; the denominator of every fraction.

    Returns:
        A dict ordered by decreasing count. It contains a title entry
        ("Scan finished: ..." or "Scan in progress...") valued
        ``next_row_idx / num_rows`` (only present when ``next_row_idx`` is
        positive), followed by one ``"% of rows with <type>"`` entry per
        entity type valued as the share of rows containing that type.
    """
    if num_rows == next_row_idx:
        title = f"Scan finished: {len(presidio_entities)} entities found"
    else:
        title = "Scan in progress..."

    counter = Counter()
    if next_row_idx > 0:
        counter[title] = next_row_idx

    # Count each (row, entity type) pair once. Deduplicating with a set
    # instead of itertools.groupby keeps the per-row count correct even when
    # the entities of one row are not contiguous in the list.
    counted_pairs = set()
    for presidio_entity in presidio_entities:
        pair = (presidio_entity["row_idx"], presidio_entity["type"])
        if pair in counted_pairs:
            continue
        counted_pairs.add(pair)
        counter["% of rows with " + presidio_entity["type"]] += 1

    if num_rows <= 0:
        # No rows to relate the counts to: report zero shares rather than
        # dividing by zero.
        return {label: 0.0 for label, _ in counter.most_common()}
    return {label: row_count / num_rows for label, row_count in counter.most_common()}
|
75 |
+
|
76 |
+
|
77 |
+
def analyze_dataset(dataset: str, enabled_presidio_entities: list[str] = DEFAULT_PRESIDIO_ENTITIES, show_texts_without_masks: bool = False) -> pd.DataFrame:
|
78 |
info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
|
79 |
if "error" in info_resp:
|
80 |
yield "β " + info_resp["error"], pd.DataFrame()
|
|
|
92 |
for presidio_entity in presidio_scan_entities(
|
93 |
rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
|
94 |
):
|
95 |
+
if not show_texts_without_masks:
|
96 |
+
presidio_entity["text"] = mask(presidio_entity["text"])
|
97 |
if presidio_entity["type"] in enabled_presidio_entities:
|
98 |
presidio_entities.append(presidio_entity)
|
99 |
+
yield presidio_report(presidio_entities, next_row_idx=rows.next_idx, num_rows=num_rows), pd.DataFrame(presidio_entities)
|
100 |
+
yield presidio_report(presidio_entities, next_row_idx=rows.next_idx, num_rows=num_rows), pd.DataFrame(presidio_entities)
|
101 |
|
102 |
with gr.Blocks() as demo:
|
103 |
gr.Markdown("# Scan datasets using Presidio")
|
|
|
114 |
value=DEFAULT_PRESIDIO_ENTITIES,
|
115 |
interactive=True,
|
116 |
),
|
117 |
+
gr.Checkbox(label="Show texts without masks", value=False),
|
118 |
]
|
119 |
button = gr.Button("Run Presidio Scan")
|
120 |
outputs = [
|
121 |
+
gr.Label(show_label=False),
|
122 |
gr.DataFrame(),
|
123 |
]
|
124 |
button.click(analyze_dataset, inputs, outputs)
|
125 |
gr.Examples(
|
126 |
[
|
127 |
+
["microsoft/orca-math-word-problems-200k"],
|
128 |
+
["tatsu-lab/alpaca"],
|
129 |
+
["Anthropic/hh-rlhf"],
|
130 |
+
["OpenAssistant/oasst1"],
|
131 |
+
["sidhq/email-thread-summary"],
|
132 |
+
["lhoestq/fake_name_and_ssn"]
|
133 |
],
|
134 |
inputs,
|
135 |
outputs,
|
136 |
fn=analyze_dataset,
|
137 |
+
run_on_click=True,
|
138 |
+
cache_examples=False,
|
139 |
)
|
140 |
|
141 |
demo.launch()
|