import csv
import json
import os
import tempfile

import gradio as gr
from utils import evaluate, report
from transformers import AutoTokenizer
# https://x.com/abidlabs/status/1721548226250371264/photo/1
# https://github.com/gradio-app/gradio/issues/5954
ga_script = """
<script async src="https://www.googletagmanager.com/gtag/js?id=G-0SHLFV3PV0"></script>
"""
ga_load = """
function() {
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());
  gtag('config', 'G-0SHLFV3PV0');
}
"""
def process_jsonl_file(jsonl_file_path: str, api_key: str):
    try:
        # Parse one JSON object per line; the context manager closes the file.
        with open(jsonl_file_path, "r", encoding="utf-8") as f:
            json_data = [json.loads(line) for line in f]
        if api_key:
            json_data = evaluate(json_data, api_key)
        html_content = report(tasks=json_data)
        file_name_with_ext = os.path.basename(jsonl_file_path)
        file_name, _ = os.path.splitext(file_name_with_ext)
        output_file = None
        with tempfile.NamedTemporaryFile(
            delete=False,
            prefix=f"{file_name}-report-",
            suffix=".html",
            mode="w",
            encoding="utf-8",
        ) as temp_file:
            temp_file.write(html_content)
            output_file = temp_file.name
        output_csv = None
        keys = json_data[0].keys()
        with tempfile.NamedTemporaryFile(
            delete=False,
            prefix=f"{file_name}-report-",
            suffix=".csv",
            mode="w",
            encoding="utf-8",
            newline="",  # let the csv module control line endings
        ) as temp_file:
            dict_writer = csv.DictWriter(temp_file, fieldnames=keys)
            dict_writer.writeheader()
            dict_writer.writerows(json_data)
            output_csv = temp_file.name
        return output_file, output_csv, ""
    except Exception as e:
        # Surface the error message in the UI instead of crashing.
        return None, None, str(e)
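

# Reporting tab: upload a JSONL of model outputs, optionally self-evaluate
# with Gemini, and download the rendered reports.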
with gr.Blocks(head=ga_script) as reporting:
    jsonl_input = gr.File(label="Upload a JSONL file")
    api_key_input = gr.Textbox(
        label="Gemini API key (only needed for self-evaluation scoring)",
        type="password",
    )
    gr.Markdown("Get an API key [here](https://aistudio.google.com/app/apikey)")
    process_button = gr.Button("Generate report")
    output_file = gr.File(label="Self-evaluation report (HTML)")
    output_csv = gr.File(label="Self-evaluation report (CSV)")
    output_text = gr.Textbox(label="System message")
    process_button.click(
        process_jsonl_file,
        inputs=[jsonl_input, api_key_input],
        outputs=[output_file, output_csv, output_text],
    )
    reporting.load(None, js=ga_load)
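

# Tokenizers available in the visualization tab.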
llm_jp_3 = "llm-jp/llm-jp-3-1.8b"
gemma_2 = "google/gemma-2-2b"
llm_jp_3_tokenizer = AutoTokenizer.from_pretrained(llm_jp_3, trust_remote_code=True)
tokenizers = {
"LLM-JP-3": llm_jp_3_tokenizer,
}
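
# google/gemma-2-2b is a gated repo on the Hugging Face Hub; loading can fail
# without accepted terms and a valid token, so skip it gracefully if it does.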
try:
    gemma_2_tokenizer = AutoTokenizer.from_pretrained(gemma_2, trust_remote_code=True)
    tokenizers["Gemma-2"] = gemma_2_tokenizer
except OSError as e:
    print(e)
tokenizer_names = list(tokenizers.keys())
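

# Wrap each token in a colored <span>, cycling through a small palette, and
# append the token count below the highlighted text.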
def tokenize_text(text: str, tokenizer_name: str):
    tokenizer = tokenizers[tokenizer_name]
    tokens = tokenizer.tokenize(text)
    colors = ["#FFCCCC", "#CCFFCC", "#CCCCFF", "#FFFFCC", "#CCFFFF", "#FFCCFF"]
    tokenized_text = "".join(
        f'<span style="background-color:{colors[i % len(colors)]}">{token}</span> '
        for i, token in enumerate(tokens)
    )
    token_count = len(tokens)
    return f"<p>{tokenized_text}</p><p>Token Count: {token_count}</p>"
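

# Tokenization tab: pick a tokenizer and watch the input text split into
# color-coded tokens as you type.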
with gr.Blocks() as tokenization:
    with gr.Row():
        tokenizer_dropdown = gr.Dropdown(
            label="Select a tokenizer", choices=tokenizer_names, value=tokenizer_names[0]
        )
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Input Text")
        with gr.Column():
            tokenized_output = gr.HTML(
                tokenize_text("", tokenizer_names[0]), label="Tokenized Output"
            )
    tokenizer_dropdown.change(
        tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output
    )
    text_input.change(
        tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output
    )
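
# Combine both apps into a single tabbed interface.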
tabbed = gr.TabbedInterface(
    [reporting, tokenization],
    tab_names=["ELYZA-tasks-100(-TV) Self-Evaluation", "Token Visualization"],
    title="LLM Development Support Tools",
)
if __name__ == "__main__":
    tabbed.launch()