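"""Gradio app combining ELYZA-tasks-100(-TV) self-evaluation reporting with a
tokenizer visualization tab."""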
import csv
import json
import os
import tempfile

import gradio as gr

from utils import evaluate, report
from transformers import AutoTokenizer


# https://x.com/abidlabs/status/1721548226250371264/photo/1
# https://github.com/gradio-app/gradio/issues/5954
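# The <script> tag below is injected into the page <head> via Blocks(head=...),
# and the gtag config function is executed on page load via Blocks.load(js=...).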
ga_script = """
<script async src="https://www.googletagmanager.com/gtag/js?id=G-0SHLFV3PV0"></script>
"""
ga_load = """
function() {
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());

  gtag('config', 'G-0SHLFV3PV0');
}
"""

def process_jsonl_file(jsonl_file_path: str, api_key: str):
    """Build an HTML report and a CSV export from a JSONL file of task records.

    If a Gemini API key is provided, records are first scored via utils.evaluate.
    """
    try:
        with open(jsonl_file_path, "r", encoding="utf-8") as f:
            json_data = [json.loads(line) for line in f]

        # Run self-evaluation only when an API key was actually provided.
        if api_key:
            json_data = evaluate(json_data, api_key)

        html_content = report(tasks=json_data)

        file_name_with_ext = os.path.basename(jsonl_file_path)
        file_name, _ = os.path.splitext(file_name_with_ext)

        # Write the HTML report to a named temp file that outlives this scope.
        with tempfile.NamedTemporaryFile(
            delete=False,
            prefix=f"{file_name}-report-",
            suffix=".html",
            mode="w",
            encoding="utf-8",
        ) as temp_file:
            temp_file.write(html_content)
            output_file = temp_file.name

        # Export the (possibly score-augmented) records as CSV, using the
        # first record's keys as the header.
        keys = json_data[0].keys()
        with tempfile.NamedTemporaryFile(
            delete=False,
            prefix=f"{file_name}-report-",
            suffix=".csv",
            mode="w",
            encoding="utf-8",
        ) as temp_file:
            dict_writer = csv.DictWriter(temp_file, fieldnames=keys)
            dict_writer.writeheader()
            dict_writer.writerows(json_data)
            output_csv = temp_file.name

        return output_file, output_csv, ""

    except Exception as e:
        # Surface the error message in the system-message textbox.
        return None, None, str(e)
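
# A minimal offline usage sketch (hypothetical file path; evaluation is skipped
# because no API key is passed):
#
#   html_path, csv_path, err = process_jsonl_file("predictions.jsonl", "")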


with gr.Blocks(head=ga_script) as reporting:
    jsonl_input = gr.File(label="Upload a JSONL file")
    api_key_input = gr.Textbox(
        label="Gemini API key (only needed for self-evaluation scoring)",
        type="password",
    )
    gr.Markdown("Get an API key [here](https://aistudio.google.com/app/apikey)")
    process_button = gr.Button("Generate report")

    output_file = gr.File(label="Self-evaluation report (HTML)")
    output_csv = gr.File(label="Self-evaluation report (CSV)")
    output_text = gr.Textbox(label="System message")

    process_button.click(
        process_jsonl_file,
        inputs=[jsonl_input, api_key_input],
        outputs=[output_file, output_csv, output_text],
    )

    # Run the GA config snippet (ga_load) once the page has loaded.
    reporting.load(None, js=ga_load)

llm_jp_3 = "llm-jp/llm-jp-3-1.8b"
gemma_2 = "google/gemma-2-2b"

# Tokenizers selectable in the visualization tab.
llm_jp_3_tokenizer = AutoTokenizer.from_pretrained(llm_jp_3, trust_remote_code=True)
tokenizers = {
    "LLM-JP-3": llm_jp_3_tokenizer,
}

# google/gemma-2-2b is gated; loading raises OSError without accepted terms
# and Hugging Face credentials, so it is registered only when available.
try:
    gemma_2_tokenizer = AutoTokenizer.from_pretrained(gemma_2, trust_remote_code=True)
    tokenizers["Gemma-2"] = gemma_2_tokenizer
except OSError as e:
    print(e)
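
# More tokenizers could be registered the same way before tokenizer_names is
# built, e.g. (hypothetical checkpoint):
#
#   tokenizers["MyModel"] = AutoTokenizer.from_pretrained("org/my-model")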

tokenizer_names = list(tokenizers.keys())


def tokenize_text(text: str, tokenizer_name: str):
    """Render each token as a colored HTML span, cycling through a palette."""
    tokenizer = tokenizers[tokenizer_name]
    tokens = tokenizer.tokenize(text)
    colors = ["#FFCCCC", "#CCFFCC", "#CCCCFF", "#FFFFCC", "#CCFFFF", "#FFCCFF"]
    tokenized_text = "".join(
        [
            f'<span style="background-color:{colors[i % len(colors)]}">{token}</span> '
            for i, token in enumerate(tokens)
        ]
    )
    token_count = len(tokens)
    return f"<p>{tokenized_text}</p><p>Token Count: {token_count}</p>"


with gr.Blocks() as tokenization:
    with gr.Row():
        tokenizer_dropdown = gr.Dropdown(
            label="Select a tokenizer", choices=tokenizer_names, value=tokenizer_names[0]
        )
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Input Text")
        with gr.Column():
            tokenized_output = gr.HTML(
                tokenize_text("", tokenizer_names[0]), label="Tokenized Output"
            )

    # Re-tokenize whenever the selected tokenizer or the input text changes.
    tokenizer_dropdown.change(
        tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output
    )
    text_input.change(
        tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output
    )

tabbed = gr.TabbedInterface(
    [reporting, tokenization],
    tab_names=["ELYZA-tasks-100(-TV) Self-Evaluation", "Token Visualization"],
    title="LLM Development Support Tools",
)

if __name__ == "__main__":
    tabbed.launch()
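    # launch() also accepts e.g. share=True (temporary public URL) or
    # server_port=... if the defaults don't fit the deployment.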