# Hugging Face Space: Ammiya (Colloquial Arabic) Tokenizer and Labeler
# (scraped Spaces page header "Spaces: Sleeping" removed)
# Standard library
import json

# Third-party
import gradio as gr
import requests
from transformers import pipeline
# JavaScript injected into the Gradio page on load: defines the hover-card
# show/hide handlers referenced by the inline onmouseover/onmouseout/onclick
# attributes that `predict` emits.
js = """
async () => {
    function showCard(event, title, content) {
        document.getElementById('hovercard').style.visibility = 'visible';
        document.getElementById('card_title').innerText = title;
        document.getElementById('card_content').innerText = content;
    }
    function hideCard(event) {
        document.getElementById('hovercard').style.visibility = 'hidden';
    }
    globalThis.showCard = showCard;
    globalThis.hideCard = hideCard;
}
"""
def get_matches(text):
    """Run the token-labelling pipeline on `text` and align its output
    back onto the original string.

    The model emits a string of "surface:lexicon" pairs joined by "+"
    and " = " separators; each pair is matched greedily left-to-right
    against `text`.

    Returns:
        (text, pred, matches) — the input, the raw model output, and a
        list of dicts with keys "start", "end", "match", "lexicon".
    """
    # NOTE(review): relies on the module-level `pipe` pipeline.
    pred = pipe(text, max_length=5000)[0]["translation_text"]

    def get_mapping(pred):
        # Split "a:X+b:Y = c:Z" into [["a", "X"], ["b", "Y"], ["c", "Z"]].
        segments = pred.split(" = ")
        segments = [seg.split("+") for seg in segments]
        flat = [token for seg in segments for token in seg]
        return [token.split(":") for token in flat]

    # Keep only well-formed "surface:lexicon" pairs.
    mapping = [pair for pair in get_mapping(pred) if len(pair) == 2]

    matches = []
    # Guard: with no parseable pairs, pop(0) below would raise IndexError.
    if not mapping:
        return (text, pred, matches)

    cur = mapping.pop(0)
    i = 0
    done = False
    while i < len(text) and not done:
        if text[i:].startswith(cur[0]):
            matches.append({"start": i, "end": i + len(cur[0]),
                            "match": cur[0], "lexicon": cur[1]})
            i += len(cur[0])
            if len(mapping) == 0:
                done = True
            else:
                cur = mapping.pop(0)
        else:
            # Surface form doesn't start here; advance one character.
            i += 1
    return (text, pred, matches)
# Load the token-labelling model and the Playaling lexicon it references.
pipe = pipeline("translation", "guymorlan/TokenizerLabeller")
r = requests.get(
    "https://huggingface.co/guymorlan/TokenizerLabeller/raw/main/playaling_words.json",
    timeout=30,  # don't let Space startup hang indefinitely on a slow fetch
)
data = json.loads(r.text)
def predict(input):
    """Render the model's analysis of `input` as an HTML fragment.

    Tokens whose (normalized) lexicon id is found in `data` become green
    spans wired to the hover-card JS handlers; all other text is echoed
    verbatim in a right-to-left container. A hidden hover-card div is
    appended at the end.
    """
    text, pred, matches = get_matches(input)
    # Index matches by start offset for O(1) lookup while scanning `text`.
    matches = {x["start"]: x for x in matches}
    output = """
<div style='direction: rtl; text-align: right; font-size: 18px; font-family: Arial, sans-serif; line-height: 1.5'>"""
    i = 0
    while i < len(text):
        if i in matches:
            match = matches[i]["lexicon"]
            # Lexicon ids may carry an "_R" suffix; strip it before lookup.
            if match.endswith("_R"):
                match = match[:-2]
            if match in data:
                # NOTE(review): translation/features are interpolated into
                # HTML attributes unescaped — acceptable for trusted lexicon
                # data, but an injection vector if entries ever contain
                # quotes or markup.
                output += f"""
<span style='background-color: #4CAF50; color: #FFFFFF; border: 1px solid #4CAF50; border-radius: 5px; font-family: "Courier New", Courier, monospace;'
onmouseover='showCard(event, "{data[match]['translation']}", "{data[match]['features']}")'
onmouseout='hideCard(event)' onclick='showCard(event, "{data[match]['translation']}", "{data[match]['features']}")'>{matches[i]['match']}</span>
"""
            else:
                # Known token but no lexicon entry: emit plain text.
                output += matches[i]["match"]
            i = matches[i]["end"]
        else:
            # NOTE(review): both branches append the character; the space
            # branch presumably emitted "&nbsp;" before the source was
            # mangled by scraping — confirm against the original file.
            if text[i] == " ":
                output += " "
            else:
                output += text[i]
            i += 1
    output += "</div>"
    output += """
<div id='hovercard' style='position: absolute; visibility: hidden; background: #FFFFFF; padding: 10px;
border: 1px solid #9E9E9E; border-radius: 5px; font-family: Arial, sans-serif;'>
<h3 id='card_title' style='color: #000000;'></h3>
<p id='card_content' style='color: #000000;'></p>
</div>
"""
    return output
# Build the Gradio UI: text input with examples on the left, rendered HTML
# analysis on the right.
with gr.Blocks(theme=gr.themes.Soft(), title="Ammiya Tokenizer and Labeler") as demo:
    gr.HTML("<h2><span style='color: #2563eb'>Colloquial Arabic</span></h2> Tokenizer and Annotator")
    with gr.Row():
        with gr.Column():
            input = gr.Textbox(label="Input", placeholder="Enter English Text", lines=1)
            gr.Examples(["بديش اروح معك", "معملتش اشي"], input)
            btn = gr.Button(label="Analyze")
        with gr.Column():
            with gr.Box():
                html = gr.HTML()
    # Both the button and pressing Enter in the textbox trigger analysis.
    btn.click(predict, inputs=[input], outputs=[html])
    input.submit(predict, inputs=[input], outputs=[html])
    # Register the hover-card JS to run once when the page loads.
    demo.load(_js=js)

demo.launch()