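# Gradio demo: tokenizes and labels colloquial (Ammiya) Arabic text with the
# guymorlan/TokenizerLabeller model and renders the result as annotated HTML.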
from transformers import pipeline
import requests
import json
import gradio as gr
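
# JavaScript injected on page load; it exposes showCard/hideCard globally so the
# inline onmouseover/onmouseout handlers in the generated HTML can call them.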
js = """
async () => {
    function showCard(event, title, content) {
        document.getElementById('hovercard').style.visibility = 'visible';
        document.getElementById('card_title').innerText = title;
        document.getElementById('card_content').innerText = content;
    }
    function hideCard(event) {
        document.getElementById('hovercard').style.visibility = 'hidden';
    }
    globalThis.showCard = showCard;
    globalThis.hideCard = hideCard;
}
"""
def get_matches(text):
    pred = pipe(text, max_length=5000)[0]["translation_text"]

    def get_mapping(pred):
        # The model output is assumed to look like "tok:LEX+tok:LEX = tok:LEX+...";
        # split it into a flat list of [surface_form, lexicon_id] pairs.
        pred = pred.split(" = ")
        pred = [x.split("+") for x in pred]
        flat = [x for y in pred for x in y]
        flat = [x.split(":") for x in flat]
        return flat

    mapping = get_mapping(pred)
    # only keep tuples with length 2
    mapping = [x for x in mapping if len(x) == 2]
    matches = []
    if not mapping:
        return (text, pred, matches)
    cur = mapping.pop(0)
    i = 0
    done = False
    # Walk through the text and record the character span of each predicted token.
    while i < len(text) and not done:
        if text[i:].startswith(cur[0]):
            matches.append({"start": i, "end": i + len(cur[0]), "match": cur[0], "lexicon": cur[1]})
            i += len(cur[0])
            if len(mapping) == 0:
                done = True
            else:
                cur = mapping.pop(0)
        else:
            i += 1
    return (text, pred, matches)
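
# Load the tagging pipeline and the Playaling lexicon used for the hover-card glosses.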
pipe = pipeline("translation", "guymorlan/TokenizerLabeller")
r = requests.get("https://huggingface.co/guymorlan/TokenizerLabeller/raw/main/playaling_words.json")
data = json.loads(r.text)
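
# Build the annotated HTML: tokens found in the lexicon become highlighted spans
# that show a hover card with their translation and grammatical features.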
def predict(input):
    text, pred, matches = get_matches(input)
    matches = {x["start"]: x for x in matches}
    output = f"""
    <div style='direction: rtl; text-align: right; font-size: 18px; font-family: Arial, sans-serif; line-height: 1.5'>"""
    i = 0
    while i < len(text):
        if i in matches:
            match = matches[i]["lexicon"]
            # if match ends with _R, remove _R suffix
            if match.endswith("_R"):
                match = match[:-2]
            if match in data:
                output += f"""
                <span style='background-color: #4CAF50; color: #FFFFFF; border: 1px solid #4CAF50; border-radius: 5px; font-family: "Courier New", Courier, monospace;'
                onmouseover='showCard(event, "{data[match]['translation']}", "{data[match]['features']}")'
                onmouseout='hideCard(event)' onclick='showCard(event, "{data[match]['translation']}", "{data[match]['features']}")'>{matches[i]['match']}</span>
                """
            else:
                output += matches[i]["match"]
            i = matches[i]["end"]
        else:
            if text[i] == " ":
                output += "&nbsp;"
            else:
                output += text[i]
            i += 1
    output += "</div>"
    output += """
    <div id='hovercard' style='position: absolute; visibility: hidden; background: #FFFFFF; padding: 10px;
    border: 1px solid #9E9E9E; border-radius: 5px; font-family: Arial, sans-serif;'>
        <h3 id='card_title' style='color: #000000;'></h3>
        <p id='card_content' style='color: #000000;'></p>
    </div>
    """
    return output
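
# Gradio UI: Arabic text in, annotated HTML out.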
with gr.Blocks(theme=gr.themes.Soft(), title="Ammiya Tokenizer and Labeler") as demo:
    gr.HTML("<h2><span style='color: #2563eb'>Colloquial Arabic</span></h2> Tokenizer and Annotator")
    with gr.Row():
        with gr.Column():
            input = gr.Textbox(label="Input", placeholder="Enter Arabic text", lines=1)
            gr.Examples(["بديش اروح معك", "معملتش اشي"], input)
            btn = gr.Button("Analyze")
        with gr.Column():
            with gr.Box():
                html = gr.HTML()

    btn.click(predict, inputs=[input], outputs=[html])
    input.submit(predict, inputs=[input], outputs=[html])
    demo.load(_js=js)

demo.launch()