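"""Gradio app for LLM development support: generates self-evaluation reports for
ELYZA-tasks-100(-TV) outputs and visualizes tokenization for selected tokenizers."""
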
import json
import os
import tempfile

import gradio as gr
from transformers import AutoTokenizer

from utils import evaluate, report


def process_jsonl_file(jsonl_file_path: str, api_key: str):
    """Read a JSONL file, optionally run self-evaluation, and write an HTML report."""
    try:
        with open(jsonl_file_path, "r", encoding="utf-8") as f:
            json_data = [json.loads(line) for line in f if line.strip()]

        # Run the Gemini-based self-evaluation only when an API key is provided.
        if api_key:
            json_data = evaluate(json_data, api_key)

        html_content = report(tasks=json_data)

        file_name_with_ext = os.path.basename(jsonl_file_path)
        file_name, _ = os.path.splitext(file_name_with_ext)

        # Write the report to a named temporary file so Gradio can offer it for download.
        with tempfile.NamedTemporaryFile(
            delete=False, prefix=f"{file_name}-report-", suffix=".html", mode="w", encoding="utf-8"
        ) as temp_file:
            temp_file.write(html_content)
            output_file = temp_file.name
        return output_file, ""

    except Exception as e:
        # Return the error message as a string so the Textbox can display it.
        return None, str(e)


# Gradio demo: report-generation tab
with gr.Blocks() as reporting:
    jsonl_input = gr.File(label="Upload a JSONL file")
    api_key_input = gr.Textbox(label="Gemini API key (for self-evaluating scores)", type="password")
    gr.Markdown("Get an API key [here](https://aistudio.google.com/app/apikey)")
    process_button = gr.Button("Create report")

    output_file = gr.File(label="Self-evaluation report")
    output_text = gr.Textbox(label="System message")

    process_button.click(
        process_jsonl_file, inputs=[jsonl_input, api_key_input], outputs=[output_file, output_text]
    )

# Tokenizers for the visualization tab, loaded once at startup.
llm_jp_3 = "llm-jp/llm-jp-3-1.8b"
gemma_2 = "google/gemma-2-2b"

llm_jp_3_tokenizer = AutoTokenizer.from_pretrained(llm_jp_3, trust_remote_code=True)
gemma_2_tokenizer = AutoTokenizer.from_pretrained(gemma_2, trust_remote_code=True)

# Map display names to tokenizer instances for the dropdown.
tokenizers = {
    "LLM-JP-3": llm_jp_3_tokenizer,
    "Gemma-2": gemma_2_tokenizer,
}

def tokenize_text(text: str, tokenizer_name: str):
    """Tokenize the input text and render each token with a cycling background color."""
    tokenizer = tokenizers[tokenizer_name]
    tokens = tokenizer.tokenize(text)
    colors = ['#FFCCCC', '#CCFFCC', '#CCCCFF', '#FFFFCC', '#CCFFFF', '#FFCCFF']
    tokenized_text = ''.join(
        f'<span style="background-color:{colors[i % len(colors)]}">{token}</span> '
        for i, token in enumerate(tokens)
    )
    return f"<p>{tokenized_text}</p><p>Token Count: {len(tokens)}</p>"
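
# Example: tokenize_text("こんにちは、世界", "LLM-JP-3") returns the tokens as
# color-coded HTML <span>s followed by the token count.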

with gr.Blocks() as tokenization:
    with gr.Row():
        tokenizer_dropdown = gr.Dropdown(label="Select a tokenizer", choices=["LLM-JP-3", "Gemma-2"], value="LLM-JP-3")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Input Text")
        with gr.Column():
            tokenized_output = gr.HTML(label="Tokenized Output")

    # Re-tokenize whenever either the input text or the selected tokenizer changes.
    tokenizer_dropdown.change(tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output)
    text_input.change(tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output)

tabbed = gr.TabbedInterface(
    [reporting, tokenization],
    tab_names=["ELYZA-tasks-100(-TV) Self-Evaluation", "Tokenization Visualization"],
    title="LLM Development Support Tools"
)
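
# `launch()` serves the app locally; passing `share=True` (a standard Gradio option)
# would also create a temporary public URL.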

if __name__ == "__main__":
    tabbed.launch()