File size: 4,100 Bytes
ce54c6a
 
 
6db2364
ce54c6a
0a66247
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9927ce5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce54c6a
 
 
 
 
 
9927ce5
 
 
ce54c6a
0555443
9927ce5
 
 
 
 
 
 
 
 
 
0555443
9927ce5
 
0a66247
9927ce5
 
 
 
6db2364
9927ce5
 
 
 
 
 
 
 
 
 
0555443
 
 
0a66247
 
 
 
 
0555443
 
ce54c6a
9927ce5
0a66247
 
 
 
396e40e
0a66247
 
 
 
 
 
 
 
9927ce5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from transformers import pipeline
import requests
import json
import gradio as gr

# JavaScript snippet handed to ``demo.load(_js=js)`` at the bottom of this file.
# It defines two helpers and publishes them on ``globalThis`` so that the inline
# ``onmouseover`` / ``onmouseout`` / ``onclick`` handlers emitted by ``predict``
# can call them by name:
#   * showCard(event, title, content) - makes the ``#hovercard`` element visible
#     and fills ``#card_title`` / ``#card_content`` with the given text.
#   * hideCard(event) - hides the ``#hovercard`` element again.
# The text is assigned via ``innerText``, i.e. as plain text rather than markup.
js = """
async () => {
    function showCard(event, title, content) {
        document.getElementById('hovercard').style.visibility = 'visible';
        document.getElementById('card_title').innerText = title;
        document.getElementById('card_content').innerText = content;
    }

    function hideCard(event) {
        document.getElementById('hovercard').style.visibility = 'hidden';
    }

    globalThis.showCard = showCard;
    globalThis.hideCard = hideCard;
}
"""

def get_matches(text):
    """Locate the model's predicted tokens inside *text*.

    The module-level ``pipe`` is run on *text* and its ``translation_text``
    output is parsed as groups separated by ``" = "``, tokens within a group
    separated by ``"+"``, and each token written as ``surface:lexicon``.
    Tokens that do not split into exactly two parts on ``":"`` are ignored,
    as are tokens whose surface form is empty (they could only ever yield a
    zero-length match).

    The surface forms are then searched for in *text* strictly left to right:
    each search starts where the previous match ended. As soon as one surface
    form cannot be found, alignment stops and the remaining tokens are dropped.

    Args:
        text: The raw input string to analyse.

    Returns:
        A tuple ``(text, pred, matches)`` where ``pred`` is the raw model
        output and ``matches`` is a list of dicts with the keys ``"start"``,
        ``"end"`` (exclusive), ``"match"`` (the surface form) and
        ``"lexicon"`` (the label predicted for it). ``matches`` is empty when
        the prediction contains no usable token.
    """
    pred = pipe(text, max_length=5000)[0]["translation_text"]

    # Flatten "a:x+b:y = c:z" into [["a", "x"], ["b", "y"], ["c", "z"]].
    pairs = [
        token.split(":")
        for group in pred.split(" = ")
        for token in group.split("+")
    ]
    # Keep only well-formed pairs with a non-empty surface form.
    pairs = [pair for pair in pairs if len(pair) == 2 and pair[0]]

    matches = []
    cursor = 0
    for surface, lexicon in pairs:
        start = text.find(surface, cursor)
        if start == -1:
            # This token does not occur in the rest of the text; later tokens
            # are not attempted because alignment is strictly sequential.
            break
        end = start + len(surface)
        matches.append(
            {"start": start, "end": end, "match": surface, "lexicon": lexicon}
        )
        cursor = end

    return (text, pred, matches)

# Model pipeline used by ``get_matches``; created once at import time.
pipe = pipeline("translation", "guymorlan/TokenizerLabeller")

# Lexicon used by ``predict``: it is indexed by lexicon key and each entry is
# read for its "translation" and "features" fields.
# A timeout keeps start-up from hanging forever on a stalled connection, and
# ``raise_for_status`` turns an HTTP error into a clear exception instead of a
# JSON decoding failure on an error page.
r = requests.get(
    "https://huggingface.co/guymorlan/TokenizerLabeller/raw/main/playaling_words.json",
    timeout=30,
)
r.raise_for_status()
data = json.loads(r.text)

def predict(input):
    """Render *input* as right-to-left HTML with hoverable token annotations.

    The text is passed to ``get_matches``. Every matched token whose lexicon
    key (with a trailing ``"_R"`` removed) is present in the module-level
    ``data`` mapping is wrapped in a highlighted ``<span>`` whose mouse
    handlers call the page-level ``showCard`` / ``hideCard`` functions with
    the entry's ``"translation"`` and ``"features"`` values. Matched tokens
    without an entry, and all unmatched characters, are emitted as plain
    text; spaces become non-breaking spaces.

    All text taken from the user or from the lexicon is escaped before being
    placed into the markup, so quotes or angle brackets in either cannot break
    the HTML or the inline JavaScript.

    Args:
        input: The raw text to analyse (name kept for interface stability
            although it shadows the builtin).

    Returns:
        An HTML string: the annotated text followed by the hidden
        ``#hovercard`` element that ``showCard`` fills in.
    """
    # Imported locally on purpose: the module-level name ``html`` is rebound
    # to a Gradio component further down this file.
    from html import escape

    text, pred, matches = get_matches(input)

    # Index matches by start offset. Zero-length matches are dropped because
    # they would never advance the cursor in the loop below.
    by_start = {m["start"]: m for m in matches if m["end"] > m["start"]}

    parts = [
        """
        <div style='direction: rtl; text-align: right; font-size: 18px; font-family: Arial, sans-serif; line-height: 1.5'>"""
    ]

    i = 0
    while i < len(text):
        found = by_start.get(i)
        if found is None:
            char = text[i]
            parts.append("&nbsp;" if char == " " else escape(char))
            i += 1
            continue

        key = found["lexicon"]
        # Lexicon keys may carry an "_R" suffix that is not part of the
        # key used in ``data``.
        if key.endswith("_R"):
            key = key[:-2]

        surface = escape(found["match"])
        if key in data:
            entry = data[key]
            # json.dumps yields valid JS string literals; escaping the whole
            # call afterwards makes it safe inside the quoted HTML attribute.
            handler = escape(
                "showCard(event, {}, {})".format(
                    json.dumps(str(entry["translation"])),
                    json.dumps(str(entry["features"])),
                ),
                quote=True,
            )
            parts.append(
                f"""
                        <span style='background-color: #4CAF50; color: #FFFFFF; border: 1px solid #4CAF50; border-radius: 5px; font-family: "Courier New", Courier, monospace;'
                        onmouseover='{handler}'
                        onmouseout='hideCard(event)' onclick='{handler}'>{surface}</span>
                        """
            )
        else:
            parts.append(surface)
        i = found["end"]

    parts.append("</div>")

    # Hidden card that showCard()/hideCard() toggle and populate.
    parts.append(
        """
    <div id='hovercard' style='position: absolute; visibility: hidden; background: #FFFFFF; padding: 10px;
    border: 1px solid #9E9E9E; border-radius: 5px; font-family: Arial, sans-serif;'>
        <h3 id='card_title' style='color: #000000;'></h3>
        <p id='card_content' style='color: #000000;'></p>
    </div>
    """
    )
    return "".join(parts)

# Gradio UI: a text box with examples and a button on the left, the rendered
# annotation HTML on the right. Both the button click and pressing Enter in
# the text box run ``predict``.
with gr.Blocks(theme=gr.themes.Soft(), title="Ammiya Tokenizer and Labeler") as demo:
    gr.HTML("<h2><span style='color: #2563eb'>Colloquial Arabic</span></h2> Tokenizer and Annotator")
    with gr.Row():
        with gr.Column():
            # The app analyses Arabic text (see the examples), so the
            # placeholder asks for Arabic rather than English.
            # NOTE: ``input`` and ``html`` shadow a builtin / stdlib module
            # name; they are kept because they are module-level names.
            input = gr.Textbox(label="Input", placeholder="Enter Arabic Text", lines=1)
            gr.Examples(["بديش اروح معك", "معملتش اشي"], input)
            # The caption of a button is its ``value`` (first positional
            # argument), not ``label``.
            btn = gr.Button("Analyze")
        with gr.Column():
            with gr.Box():
                html = gr.HTML()
    btn.click(predict, inputs=[input], outputs=[html])
    input.submit(predict, inputs=[input], outputs=[html])

    # Install the showCard/hideCard helpers in the page when it loads.
    demo.load(_js=js)
    demo.launch()