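"""Gradio Space: tokenizes colloquial (Ammiya) Arabic text with the
guymorlan/TokenizerLabeller model and renders each recognized token as a
highlighted span with a hover card showing its Playaling translation and
grammatical features."""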
from transformers import pipeline
import requests
import json
import gradio as gr
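
# JavaScript injected once on page load (via demo.load below): registers the
# showCard/hideCard handlers that toggle the #hovercard div emitted by predict().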
js = """
async () => {
function showCard(event, title, content) {
document.getElementById('hovercard').style.visibility = 'visible';
document.getElementById('card_title').innerText = title;
document.getElementById('card_content').innerText = content;
}
function hideCard(event) {
document.getElementById('hovercard').style.visibility = 'hidden';
}
globalThis.showCard = showCard;
globalThis.hideCard = hideCard;
}
"""
def get_matches(text):
    # `pipe` is created at module load time, below.
    pred = pipe(text, max_length=5000)[0]["translation_text"]

    def get_mapping(pred):
        pred = pred.split(" = ")
        pred = [x.split("+") for x in pred]
        flat = [x for y in pred for x in y]
        flat = [x.split(":") for x in flat]
        return flat

    mapping = get_mapping(pred)
    # only keep (surface, label) pairs; drop malformed fragments
    mapping = [x for x in mapping if len(x) == 2]
    if not mapping:  # model output was unparsable; nothing to align
        return (text, pred, [])
    matches = []
    cur = mapping.pop(0)
    i = 0
    done = False
    while i < len(text) and not done:
        if text[i:].startswith(cur[0]):
            matches.append({"start": i, "end": i + len(cur[0]),
                            "match": cur[0], "lexicon": cur[1]})
            i += len(cur[0])
            if len(mapping) == 0:
                done = True
            else:
                cur = mapping.pop(0)
        else:
            i += 1
    return (text, pred, matches)
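
# Load the labelling model and the Playaling lexicon it annotates against
# (maps lexicon ids to {"translation": ..., "features": ...}).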
pipe = pipeline("translation", "guymorlan/TokenizerLabeller")
r = requests.get("https://huggingface.co/guymorlan/TokenizerLabeller/raw/main/playaling_words.json")
data = json.loads(r.text)
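
# Render the input as RTL HTML: recognized tokens become highlighted <span>s
# wired to the hover card; the hidden #hovercard element is appended at the end.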
def predict(input):
    text, pred, matches = get_matches(input)
    matches = {x["start"]: x for x in matches}
    output = """
    <div style='direction: rtl; text-align: right; font-size: 18px; font-family: Arial, sans-serif; line-height: 1.5'>"""
    i = 0
    while i < len(text):
        if i in matches:
            match = matches[i]["lexicon"]
            # lexicon ids may carry an "_R" suffix; strip it before lookup
            if match.endswith("_R"):
                match = match[:-2]
            if match in data:
                output += f"""
                <span style='background-color: #4CAF50; color: #FFFFFF; border: 1px solid #4CAF50; border-radius: 5px; font-family: "Courier New", Courier, monospace;'
                onmouseover='showCard(event, "{data[match]['translation']}", "{data[match]['features']}")'
                onmouseout='hideCard(event)' onclick='showCard(event, "{data[match]['translation']}", "{data[match]['features']}")'>{matches[i]['match']}</span>
                """
            else:
                # token recognized by the model but absent from the lexicon
                output += matches[i]["match"]
            i = matches[i]["end"]
        else:
            print(f"'{text[i]}'")  # debug: log characters not covered by any match
            if text[i] == " ":
                output += "&nbsp;"  # non-breaking space so whitespace survives HTML rendering
            else:
                output += text[i]
            i += 1
    output += "</div>"
    # hidden hover card; populated and shown by the injected JS handlers
    output += """
    <div id='hovercard' style='position: absolute; visibility: hidden; background: #FFFFFF; padding: 10px;
    border: 1px solid #9E9E9E; border-radius: 5px; font-family: Arial, sans-serif;'>
    <h3 id='card_title' style='color: #000000;'></h3>
    <p id='card_content' style='color: #000000;'></p>
    </div>
    """
    return output
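
# Gradio UI: text input and examples on the left, rendered HTML analysis on the right.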
with gr.Blocks(theme=gr.themes.Soft(), title="Ammiya Tokenizer and Labeler") as demo:
    gr.HTML("<h2><span style='color: #2563eb'>Colloquial Arabic</span> Tokenizer and Annotator</h2>")
    with gr.Row():
        with gr.Column():
            input = gr.Textbox(label="Input", placeholder="Enter Arabic text", lines=1)
            gr.Examples(["بديش اروح معك", "معملتش اشي"], input)
            btn = gr.Button("Analyze")  # Button takes its caption as `value`, not `label`
        with gr.Column():
            with gr.Box():
                html = gr.HTML()
    btn.click(predict, inputs=[input], outputs=[html])
    input.submit(predict, inputs=[input], outputs=[html])
    demo.load(_js=js)  # register the hover-card JS handlers on page load

demo.launch()