Spaces:

CarolXia
/

pii-classification-kd

Sleeping

App Files Files Community

pii-classification-kd / app.py

CarolXia

single thread 50

7961c58 27 days ago

raw

history blame contribute delete

5.82 kB

	import streamlit as st
	# from gliner import GLiNER
	from datasets import load_dataset
	from peft import PeftModel, PeftConfig
	import threading
	import time
	import torch
	from torch.profiler import profile, record_function, ProfilerActivity
	from transformers import DebertaV2ForTokenClassification, DebertaV2Tokenizer, pipeline

	# Guards updates to the shared entity-count dict, which several worker
	# threads pass to predict_entities() at the same time.
	_ENTITY_SET_LOCK = threading.Lock()


	def predict_entities(text, labels, entity_set):
	    """Detect entities in *text* and add their counts to *entity_set*.

	    Args:
	        text: The text to analyse.
	        labels: Label names for the GLiNER-style predictor. When empty, the
	            module-level ``recognizer`` pipeline is used and results are
	            keyed by each hit's ``'entity'`` field; otherwise
	            ``model.predict_entities`` is used with a 0.7 threshold and
	            results are keyed by ``'label'``.
	        entity_set: Dict mapping entity name to occurrence count. Updated
	            in place; may be shared between threads.

	    Returns:
	        None. The result is the mutation of *entity_set*.
	    """
	    if not labels:
	        entities = recognizer(text)
	        key = 'entity'
	    else:
	        # Use Gliner labels.
	        # NOTE(review): this branch needs `model` to expose
	        # predict_entities() (as a GLiNER model does); the model loaded in
	        # this file is a DebertaV2ForTokenClassification -- confirm before
	        # passing non-empty labels.
	        entities = model.predict_entities(text, labels, threshold=0.7)
	        key = 'label'

	    # Inference runs outside the lock; only the read-modify-write of the
	    # shared counters is serialized so concurrent callers lose no updates.
	    with _ENTITY_SET_LOCK:
	        for entity in entities:
	            name = entity[key]
	            entity_set[name] = entity_set.get(name, 0) + 1

	def process_datasets(start, end, unmasked_text, sizes, index, entity_set, labels):
	    """Run entity prediction over rows ``start`` .. ``end - 1`` in chunks.

	    Rows are concatenated until the buffer reaches at least 700 characters;
	    each full buffer is passed to ``predict_entities`` and its length is
	    added to the running character count.

	    Args:
	        start: Index of the first row to process (inclusive).
	        end: Index one past the last row to process (exclusive).
	        unmasked_text: Indexable collection of text rows.
	        sizes: Shared list; ``sizes[index]`` receives the number of
	            characters this call sent to the predictor.
	        index: Slot of *sizes* owned by this call.
	        entity_set: Dict of entity counts, forwarded to ``predict_entities``.
	        labels: Label list forwarded to ``predict_entities``.
	    """
	    size = 0
	    text = ""
	    for i in range(start, end):
	        if len(text) < 700:
	            text = text + " " + unmasked_text[i]
	        else:
	            size += len(text)
	            predict_entities(text, labels, entity_set)
	            # Start the next chunk with the current row.
	            text = unmasked_text[i]

	    # Flush whatever is still buffered; without this the last rows of the
	    # range would never be analysed nor counted.
	    if text:
	        size += len(text)
	        predict_entities(text, labels, entity_set)

	    sizes[index] = size

	# Log whether a CUDA device is visible and, if so, which one is active.
	cuda_available = torch.cuda.is_available()
	print(f"Is CUDA available: {cuda_available}")
	if cuda_available:
	    active_device = torch.cuda.current_device()
	    print(f"CUDA device: {torch.cuda.get_device_name(active_device)}")


	# Load the PII token-classification checkpoint and wrap it in an NER pipeline.
	st.write('Loading the pretrained model ...')
	model_name = "CarolXia/pii-kd-deberta-v2"
	hf_token = st.secrets["HUGGINGFACE_TOKEN"]
	# config = PeftConfig.from_pretrained(model_name)
	model = DebertaV2ForTokenClassification.from_pretrained(model_name, token=hf_token)
	# Move the model to the GPU when one is available; otherwise it stays on CPU.
	target_device = "cuda" if torch.cuda.is_available() else "cpu"
	model = model.to(target_device)
	# Try quantization instead
	# model = AutoModelForTokenClassification.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
	tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/mdeberta-v3-base", token=hf_token)
	recognizer = pipeline("ner", model=model, tokenizer=tokenizer)
	# GLiNER alternative:
	# model_name = "urchade/gliner_multi_pii-v1"
	# model = GLiNER.from_pretrained(model_name)

	# Report the model size: all parameters versus those that require gradients.
	pytorch_total_params = 0
	torch_total_params = 0
	for param in model.parameters():
	    param_count = param.numel()
	    pytorch_total_params += param_count
	    if param.requires_grad:
	        torch_total_params += param_count
	print(f'total params: {pytorch_total_params}. tunable params: {torch_total_params}')



	# Sample text containing PII/PHI entities (name, card number, bank details,
	# postal address, phone numbers and an e-mail address). It is shown in the UI
	# and run through the recognizer as a smoke test before the full dataset.
	text = """
	Hello Jane Doe. Your AnyCompany Financial Services, LLC credit card account
	4111-0000-1111-0000 has a minimum payment of $24.53 that is due by July 31st.
	Based on your autopay settings, we will withdraw your payment on the due date from
	your bank account XXXXXX1111 with the routing number XXXXX0000.

	Your latest statement was mailed to 100 Main Street, Anytown, WA 98121.
	After your payment is received, you will receive a confirmation text message
	at 206-555-0100.

	If you have questions about your bill, AnyCompany Customer Service is available by
	phone at 206-555-0199 or email at support@anycompany.com.
	"""

	# Label names for PII/PHI entities. The order is kept exactly as authored;
	# the list is the `labels` argument for the GLiNER-style prediction path.
	labels = [
	    "medical_record_number",
	    "date_of_birth",
	    "ssn",
	    "date",
	    "first_name",
	    "email",
	    "last_name",
	    "customer_id",
	    "employee_id",
	    "name",
	    "street_address",
	    "phone_number",
	    "ipv4",
	    "credit_card_number",
	    "license_plate",
	    "address",
	    "user_name",
	    "device_identifier",
	    "bank_routing_number",
	    "date_time",
	    "company_name",
	    "unique_identifier",
	    "biometric_identifier",
	    "account_number",
	    "city",
	    "certificate_license_number",
	    "time",
	    "postcode",
	    "vehicle_identifier",
	    "coordinate",
	    "country",
	    "api_key",
	    "ipv6",
	    "password",
	    "health_plan_beneficiary_number",
	    "national_id",
	    "tax_id",
	    "url",
	    "state",
	    "swift_bic",
	    "cvv",
	    "pin",
	]

	# Smoke test: echo the sample text, run it through the NER pipeline and
	# show every detection the pipeline returns.
	st.write('Trying a sample first')
	st.write(text)
	# GLiNER alternative (confidence threshold of 0.7):
	# entities = model.predict_entities(text, labels, threshold=0.7)
	entities = recognizer(text)

	for detected in entities:
	    st.write(detected)

	# Run the recognizer over the first rows of the dataset on several threads,
	# then report throughput and per-entity counts.
	st.write('Processing the full dataset now ...')
	entity_set = dict()
	dataset = load_dataset("Isotonic/pii-masking-200k", split="train")
	unmasked_text = dataset['unmasked_text']  # This will load the entire column in memory. Must do this to avoid I/O delay later

	st.write('Number of rows in the dataset ', dataset.num_rows)

	# Work split: each thread handles a contiguous slice of rows and records the
	# number of characters it processed in its own slot of `sizes`.
	num_threads = 5
	rows_per_thread = 10
	sizes = [0] * num_threads

	start = time.time()
	workers = [
	    threading.Thread(
	        target=process_datasets,
	        args=(
	            slot * rows_per_thread,
	            (slot + 1) * rows_per_thread,
	            unmasked_text,
	            sizes,
	            slot,
	            entity_set,
	            [],  # empty labels -> use the NER pipeline
	        ),
	    )
	    for slot in range(num_threads)
	]
	for worker in workers:
	    worker.start()
	for worker in workers:
	    worker.join()

	end = time.time()
	length = end - start

	# Show the results : this can be altered however you like
	st.write('Bytes processed ', sum(sizes))
	st.write("It took", length, "seconds!")

	# Display the summary
	st.write('Total entities found')
	for key in entity_set:
	    st.write(key, ' => ', entity_set[key])

	# Profiling is disabled. The original script ended by writing
	# prof.key_averages().table(...), but `prof` only exists inside a
	# torch.profiler.profile(...) context, so that call raised NameError.
	# To profile, replace the threads above with:
	#   with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
	#                profile_memory=True, record_shapes=True) as prof:
	#       process_datasets(0, 50, unmasked_text, sizes, 0, entity_set, [])
	#   st.write(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))