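"""Gradio app for LLM development support: generates self-evaluation reports for
ELYZA-tasks-100(-TV) outputs and visualizes tokenization for selected tokenizers."""
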
import json
import os
import tempfile

import gradio as gr
from transformers import AutoTokenizer

from utils import evaluate, report


def process_jsonl_file(jsonl_file_path: str, api_key: str):
    """Read a JSONL file, optionally run self-evaluation, and write an HTML report."""
    try:
        with open(jsonl_file_path, "r", encoding="utf-8") as f:
            json_data = [json.loads(line) for line in f if line.strip()]

        # Run the Gemini-based self-evaluation only when an API key is provided.
        if api_key:
            json_data = evaluate(json_data, api_key)

        html_content = report(tasks=json_data)

        file_name_with_ext = os.path.basename(jsonl_file_path)
        file_name, _ = os.path.splitext(file_name_with_ext)

        # Write the report to a named temporary file so Gradio can offer it for download.
        with tempfile.NamedTemporaryFile(
            delete=False, prefix=f"{file_name}-report-", suffix=".html", mode="w", encoding="utf-8"
        ) as temp_file:
            temp_file.write(html_content)
            output_file = temp_file.name
        return output_file, ""

    except Exception as e:
        # Return the error message as a string so the Textbox can display it.
        return None, str(e)


# Gradio demo: report-generation tab
with gr.Blocks() as reporting:
    jsonl_input = gr.File(label="Upload a JSONL file")
    api_key_input = gr.Textbox(label="Gemini API key (for self-evaluating scores)", type="password")
    gr.Markdown("Get an API key [here](https://aistudio.google.com/app/apikey)")
    process_button = gr.Button("Create report")

    output_file = gr.File(label="Self-evaluation report")
    output_text = gr.Textbox(label="System message")

    process_button.click(
        process_jsonl_file, inputs=[jsonl_input, api_key_input], outputs=[output_file, output_text]
    )

# Tokenizers for the visualization tab, loaded once at startup.
llm_jp_3 = "llm-jp/llm-jp-3-1.8b"
gemma_2 = "google/gemma-2-2b"

llm_jp_3_tokenizer = AutoTokenizer.from_pretrained(llm_jp_3, trust_remote_code=True)
gemma_2_tokenizer = AutoTokenizer.from_pretrained(gemma_2, trust_remote_code=True)

# Map display names to tokenizer instances for the dropdown.
tokenizers = {
    "LLM-JP-3": llm_jp_3_tokenizer,
    "Gemma-2": gemma_2_tokenizer,
}

def tokenize_text(text: str, tokenizer_name: str):
    """Tokenize the input text and render each token with a cycling background color."""
    tokenizer = tokenizers[tokenizer_name]
    tokens = tokenizer.tokenize(text)
    colors = ['#FFCCCC', '#CCFFCC', '#CCCCFF', '#FFFFCC', '#CCFFFF', '#FFCCFF']
    tokenized_text = ''.join(
        f'<span style="background-color:{colors[i % len(colors)]}">{token}</span> '
        for i, token in enumerate(tokens)
    )
    return f"<p>{tokenized_text}</p><p>Token Count: {len(tokens)}</p>"
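
# Example: tokenize_text("こんにちは、世界", "LLM-JP-3") returns the tokens as
# color-coded HTML <span>s followed by the token count.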

with gr.Blocks() as tokenization:
    with gr.Row():
        tokenizer_dropdown = gr.Dropdown(label="Select a tokenizer", choices=["LLM-JP-3", "Gemma-2"], value="LLM-JP-3")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Input Text")
        with gr.Column():
            tokenized_output = gr.HTML(label="Tokenized Output")

    # Re-tokenize whenever either the input text or the selected tokenizer changes.
    tokenizer_dropdown.change(tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output)
    text_input.change(tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output)

tabbed = gr.TabbedInterface(
    [reporting, tokenization],
    tab_names=["ELYZA-tasks-100(-TV) Self-Evaluation", "Tokenization Visualization"],
    title="LLM Development Support Tools"
)
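
# `launch()` serves the app locally; passing `share=True` (a standard Gradio option)
# would also create a temporary public URL.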

if __name__ == "__main__":
    tabbed.launch()