Spaces:
Sleeping
Sleeping
| import csv | |
| import json | |
| import os | |
| import tempfile | |
| import gradio as gr | |
| from utils import evaluate, report | |
| from transformers import AutoTokenizer | |
# Google Analytics wiring for the Gradio app. Background on this approach:
#   https://x.com/abidlabs/status/1721548226250371264/photo/1
#   https://github.com/gradio-app/gradio/issues/5954

# Markup passed to gr.Blocks(head=...): pulls in the gtag.js library.
ga_script = """
<script async src="https://www.googletagmanager.com/gtag/js?id=G-0SHLFV3PV0"></script>
"""

# JavaScript passed to Blocks.load(js=...): initialises the data layer and
# registers the same measurement ID that ga_script loads.
ga_load = """
function() {
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-0SHLFV3PV0');
}
"""
def _read_jsonl(jsonl_file_path):
    """Parse a JSONL file into a list of records, ignoring blank lines."""
    # The context manager guarantees the handle is closed even if a line
    # fails to parse.
    with open(jsonl_file_path, "r", encoding="utf-8") as jsonl_file:
        return [json.loads(line) for line in jsonl_file if line.strip()]


def _report_tempfile(file_name, suffix, newline=None):
    """Open a persistent UTF-8 temp file named ``<file_name>-report-*<suffix>``."""
    # delete=False: the path is handed back to the caller after the file is
    # closed, so the file must outlive the ``with`` block.
    return tempfile.NamedTemporaryFile(
        delete=False,
        prefix=f"{file_name}-report-",
        suffix=suffix,
        mode="w",
        encoding="utf-8",
        newline=newline,
    )


def process_jsonl_file(jsonl_file_path: str, api_key: str):
    """Build HTML and CSV reports from an uploaded JSONL file.

    Each non-blank line of the file is parsed as one JSON record. When
    ``api_key`` is non-empty the records are first passed through
    ``evaluate(records, api_key)``; the (possibly updated) records are then
    rendered with ``report(tasks=records)`` and also dumped to CSV.

    Args:
        jsonl_file_path: Path of the JSONL file to read.
        api_key: API key forwarded to ``evaluate``; ``None`` or ``""`` skips
            the evaluation step.

    Returns:
        A ``(html_path, csv_path, message)`` tuple. On success both paths
        point to newly created temporary files and ``message`` is ``""``.
        On any failure both paths are ``None`` and ``message`` is a string
        describing the error; exceptions are never propagated because the
        result feeds UI output components directly.
    """
    try:
        json_data = _read_jsonl(jsonl_file_path)
        if not json_data:
            # Fail before any temp file is created instead of tripping over
            # an IndexError further down.
            raise ValueError("The JSONL file contains no records.")

        if api_key:
            json_data = evaluate(json_data, api_key)
        html_content = report(tasks=json_data)

        file_name, _ = os.path.splitext(os.path.basename(jsonl_file_path))

        with _report_tempfile(file_name, ".html") as html_file:
            html_file.write(html_content)
            output_file = html_file.name

        # Union of keys in first-seen order: DictWriter raises ValueError on
        # a row carrying a key that is missing from fieldnames, so relying on
        # the first record alone breaks on heterogeneous input.
        fieldnames = list(dict.fromkeys(key for row in json_data for key in row))
        # newline="" is what the csv module requires of its output file.
        with _report_tempfile(file_name, ".csv", newline="") as csv_file:
            dict_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            dict_writer.writeheader()
            dict_writer.writerows(json_data)
            output_csv = csv_file.name

        return output_file, output_csv, ""
    except Exception as e:
        # Top-level UI boundary: report the failure as text. Fall back to
        # repr() for exceptions whose str() is empty so the message box never
        # looks like a silent success.
        return None, None, str(e) or repr(e)
# Tab 1: upload a JSONL file, optionally supply an API key, and receive the
# generated HTML / CSV reports plus a status message.
with gr.Blocks(head=ga_script) as reporting:
    # --- inputs ---
    jsonl_input = gr.File(label="JSONLファイルをアップロード")
    api_key_input = gr.Textbox(
        type="password",
        label="GeminiのAPIキー(スコアのセルフ評価を行う場合)",
    )
    gr.Markdown("APIキーの発行は[こちら](https://aistudio.google.com/app/apikey)")
    process_button = gr.Button("レポートを作成")

    # --- outputs ---
    output_file = gr.File(label="セルフ評価レポート(HTML)")
    output_csv = gr.File(label="セルフ評価レポート(CSV)")
    output_text = gr.Textbox(label="システムメッセージ")

    # The three return values of process_jsonl_file map, in order, onto the
    # three output components.
    process_button.click(
        fn=process_jsonl_file,
        inputs=[jsonl_input, api_key_input],
        outputs=[output_file, output_csv, output_text],
    )

    # Run the analytics bootstrap snippet when the page loads.
    reporting.load(None, js=ga_load)
# Hugging Face model IDs whose tokenizers are offered in the visualisation tab.
llm_jp_3 = "llm-jp/llm-jp-3-1.8b"
gemma_2 = "google/gemma-2-2b"

# The LLM-JP tokenizer is loaded unconditionally; a failure here propagates
# and aborts module import.
llm_jp_3_tokenizer = AutoTokenizer.from_pretrained(llm_jp_3, trust_remote_code=True)
tokenizers = {"LLM-JP-3": llm_jp_3_tokenizer}

# The Gemma tokenizer is optional: an OSError while loading it is printed and
# the entry is simply left out.
# NOTE(review): presumably this covers the repository being gated or
# unreachable in the deployment environment - confirm.
try:
    gemma_2_tokenizer = AutoTokenizer.from_pretrained(gemma_2, trust_remote_code=True)
except OSError as e:
    print(e)
else:
    tokenizers["Gemma-2"] = gemma_2_tokenizer

# Display names in insertion order; the first one is the UI default.
tokenizer_names = list(tokenizers)
def tokenize_text(text: str, tokenizer_name: str):
    """Render ``text`` as colour-coded tokens in an HTML fragment.

    The tokenizer registered under ``tokenizer_name`` in the module-level
    ``tokenizers`` mapping splits the text; each token is wrapped in a
    ``<span>`` whose background colour cycles through a fixed palette so
    that adjacent tokens are visually distinguishable.

    Args:
        text: The text to tokenize.
        tokenizer_name: Key into ``tokenizers``.

    Returns:
        An HTML string: one paragraph holding the coloured tokens followed
        by a paragraph with the token count.

    Raises:
        KeyError: If ``tokenizer_name`` is not a key of ``tokenizers``.
    """
    from html import escape

    tokenizer = tokenizers[tokenizer_name]
    tokens = tokenizer.tokenize(text)
    colors = ("#FFCCCC", "#CCFFCC", "#CCCCFF", "#FFFFCC", "#CCFFFF", "#FFCCFF")
    # Tokens come from user-typed text and are shown in an HTML component, so
    # they are escaped: otherwise input such as "<b>" or "<script>" would be
    # interpreted as markup instead of being displayed.
    tokenized_text = "".join(
        f'<span style="background-color:{colors[i % len(colors)]}">{escape(token)}</span> '
        for i, token in enumerate(tokens)
    )
    token_count = len(tokens)
    return f"<p>{tokenized_text}</p><p>Token Count: {token_count}</p>"
# Tab 2: live token visualisation. The output is recomputed whenever either
# the selected tokenizer or the input text changes.
with gr.Blocks() as tokenization:
    with gr.Row():
        tokenizer_dropdown = gr.Dropdown(
            choices=tokenizer_names,
            value=tokenizer_names[0],
            label="Tokenizerを選択",
        )
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Input Text")
        with gr.Column():
            # Seed the panel with the rendering of an empty string so the
            # "Token Count" line is visible before the user types anything.
            tokenized_output = gr.HTML(
                value=tokenize_text("", tokenizer_names[0]),
                label="Tokenized Output",
            )

    # Both listeners share exactly the same handler, inputs and output.
    _retokenize = dict(
        fn=tokenize_text,
        inputs=[text_input, tokenizer_dropdown],
        outputs=tokenized_output,
    )
    tokenizer_dropdown.change(**_retokenize)
    text_input.change(**_retokenize)
# Top-level app: one tab per Blocks page, listed as (tab title, page) pairs so
# each title stays next to the page it labels.
_tabs = [
    ("ELYZA-tasks-100(-TV) セルフ評価", reporting),
    ("トークンの可視化", tokenization),
]
tabbed = gr.TabbedInterface(
    [page for _, page in _tabs],
    tab_names=[tab_title for tab_title, _ in _tabs],
    title="LLM開発支援ツール",
)

# Start the server only when executed as a script, not when imported.
if __name__ == "__main__":
    tabbed.launch()