Spaces:

xhiroga
/

llm-exercise-report

Running

Hiroaki Ogasawara

chore: separate load script

e642ef1 18 days ago

4.6 kB

	import csv
	import html
	import json
	import os
	import tempfile

	import gradio as gr
	from transformers import AutoTokenizer

	from utils import evaluate, report


	# Google Analytics wiring for Gradio: the gtag loader goes into the page
	# <head>, and the initialisation snippet is run as a JS load handler.
	# Background on this approach:
	# https://x.com/abidlabs/status/1721548226250371264/photo/1
	# https://github.com/gradio-app/gradio/issues/5954
	ga_script = """
	<script async src="https://www.googletagmanager.com/gtag/js?id=G-0SHLFV3PV0"></script>
	"""
	# The snippet must use a plain `||`: a backslash-escaped `\|\|` would reach
	# the browser verbatim and is a JavaScript syntax error.
	ga_load = """
	function() {
	    window.dataLayer = window.dataLayer || [];
	    function gtag(){dataLayer.push(arguments);}
	    gtag('js', new Date());

	    gtag('config', 'G-0SHLFV3PV0');
	}
	"""

	def process_jsonl_file(jsonl_file_path: str, api_key: str):
	    """Build HTML and CSV reports from a JSONL file.

	    Every non-blank line of the file is parsed as one JSON record. When an
	    API key is given, the records are first passed through ``evaluate``;
	    ``report`` then renders them to HTML. Both reports are written to
	    temporary files that are not deleted automatically, so the caller can
	    serve them.

	    Args:
	        jsonl_file_path: Path of the JSONL file to read (UTF-8).
	        api_key: Key forwarded to ``evaluate``. ``None`` or ``""`` skips the
	            evaluation step.

	    Returns:
	        A ``(html_path, csv_path, message)`` tuple. On success ``message`` is
	        ``""``. On any failure both paths are ``None`` and ``message`` is a
	        ``"<ExceptionType>: <detail>"`` string.
	    """
	    try:
	        # Close the upload promptly and tolerate blank/trailing lines.
	        with open(jsonl_file_path, "r", encoding="utf-8") as jsonl_file:
	            json_data = [json.loads(line) for line in jsonl_file if line.strip()]

	        # Fail early, before any temp file is created or any API call made.
	        if not json_data:
	            raise ValueError("JSONL file contains no records")

	        if api_key is not None and api_key != "":
	            json_data = evaluate(json_data, api_key)

	        html_content = report(tasks=json_data)

	        file_name_with_ext = os.path.basename(jsonl_file_path)
	        file_name, _ = os.path.splitext(file_name_with_ext)
	        report_prefix = f"{file_name}-report-"

	        with tempfile.NamedTemporaryFile(
	            delete=False,
	            prefix=report_prefix,
	            suffix=".html",
	            mode="w",
	            encoding="utf-8",
	        ) as temp_file:
	            temp_file.write(html_content)
	            output_file = temp_file.name

	        # Use the union of keys (first-seen order) so a record with an extra
	        # field does not make DictWriter raise; missing fields become "".
	        fieldnames = list(dict.fromkeys(key for row in json_data for key in row))
	        with tempfile.NamedTemporaryFile(
	            delete=False,
	            prefix=report_prefix,
	            suffix=".csv",
	            mode="w",
	            encoding="utf-8",
	            newline="",  # the csv module does its own newline handling
	        ) as temp_file:
	            dict_writer = csv.DictWriter(temp_file, fieldnames=fieldnames)
	            dict_writer.writeheader()
	            dict_writer.writerows(json_data)
	            output_csv = temp_file.name

	        return output_file, output_csv, ""

	    except Exception as e:
	        # Boundary handler: report the failure as text instead of raising.
	        return None, None, f"{type(e).__name__}: {e}"


	# "Self-evaluation report" tab: takes a JSONL upload plus an optional API key
	# and returns the HTML report, the CSV report and a status message.
	with gr.Blocks(head=ga_script) as reporting:
	    # Inputs
	    jsonl_input = gr.File(label="JSONLファイルをアップロード")
	    api_key_input = gr.Textbox(
	        type="password",
	        label="GeminiのAPIキー（スコアのセルフ評価を行う場合）",
	    )
	    gr.Markdown("APIキーの発行は[こちら](https://aistudio.google.com/app/apikey)")
	    process_button = gr.Button("レポートを作成")

	    # Outputs, in the same order as the tuple returned by process_jsonl_file
	    output_file = gr.File(label="セルフ評価レポート(HTML)")
	    output_csv = gr.File(label="セルフ評価レポート(CSV)")
	    output_text = gr.Textbox(label="システムメッセージ")

	    process_button.click(
	        fn=process_jsonl_file,
	        inputs=[jsonl_input, api_key_input],
	        outputs=[output_file, output_csv, output_text],
	    )

	    # Run the Google Analytics initialisation snippet when the page loads.
	    reporting.load(fn=None, js=ga_load)

	# Model ids whose tokenizers are offered in the token-visualisation tab.
	llm_jp_3 = "llm-jp/llm-jp-3-1.8b"
	gemma_2 = "google/gemma-2-2b"

	# LLM-JP-3 is loaded unconditionally; a failure here propagates.
	llm_jp_3_tokenizer = AutoTokenizer.from_pretrained(
	    llm_jp_3,
	    trust_remote_code=True,
	)
	tokenizers = {"LLM-JP-3": llm_jp_3_tokenizer}

	# Gemma-2 is optional: if loading raises OSError the error is printed and
	# the tokenizer is left out of the registry.
	try:
	    gemma_2_tokenizer = AutoTokenizer.from_pretrained(
	        gemma_2,
	        trust_remote_code=True,
	    )
	except OSError as load_error:
	    print(load_error)
	else:
	    tokenizers["Gemma-2"] = gemma_2_tokenizer

	# Display names in insertion order; the first one is the default choice.
	tokenizer_names = list(tokenizers)


	def tokenize_text(text: str, tokenizer_name: str):
	    """Render the tokenization of ``text`` as colour-coded HTML.

	    Args:
	        text: The text to tokenize.
	        tokenizer_name: Key into the module-level ``tokenizers`` mapping.

	    Returns:
	        An HTML string: one ``<p>`` holding a ``<span>`` per token, with the
	        background colour cycling through a fixed palette, followed by a
	        ``<p>`` with the token count.

	    Raises:
	        KeyError: If ``tokenizer_name`` is not a key of ``tokenizers``.
	    """
	    tokenizer = tokenizers[tokenizer_name]
	    tokens = tokenizer.tokenize(text)
	    colors = ["#FFCCCC", "#CCFFCC", "#CCCCFF", "#FFFFCC", "#CCFFFF", "#FFCCFF"]
	    # Tokens come from user input, so escape them: a raw "<", ">" or "&"
	    # would otherwise be interpreted as markup instead of being displayed.
	    tokenized_text = "".join(
	        f'<span style="background-color:{colors[i % len(colors)]}">'
	        f"{html.escape(token)}</span> "
	        for i, token in enumerate(tokens)
	    )
	    token_count = len(tokens)
	    return f"<p>{tokenized_text}</p><p>Token Count: {token_count}</p>"


	# "Token visualisation" tab: a tokenizer picker on top, then the input text
	# and the rendered tokens. NOTE(review): the Row/Column nesting below is
	# reconstructed from statement order — confirm it matches the intended layout.
	with gr.Blocks() as tokenization:
	    with gr.Row():
	        tokenizer_dropdown = gr.Dropdown(
	            choices=tokenizer_names,
	            value=tokenizer_names[0],
	            label="Tokenizerを選択",
	        )
	    with gr.Row():
	        with gr.Column():
	            text_input = gr.Textbox(label="Input Text")
	        with gr.Column():
	            # Initial content: the rendering of an empty string with the
	            # default tokenizer.
	            tokenized_output = gr.HTML(
	                value=tokenize_text("", tokenizer_names[0]),
	                label="Tokenized Output",
	            )

	    # Re-render whenever either the tokenizer choice or the text changes.
	    tokenizer_dropdown.change(
	        fn=tokenize_text,
	        inputs=[text_input, tokenizer_dropdown],
	        outputs=tokenized_output,
	    )
	    text_input.change(
	        fn=tokenize_text,
	        inputs=[text_input, tokenizer_dropdown],
	        outputs=tokenized_output,
	    )

	# Top-level app: the two Blocks UIs combined as tabs, in this order.
	tabbed = gr.TabbedInterface(
	    interface_list=[reporting, tokenization],
	    tab_names=[
	        "ELYZA-tasks-100(-TV) セルフ評価",
	        "トークンの可視化",
	    ],
	    title="LLM開発支援ツール",
	)

	# Start the server only when this file is run as a script.
	if __name__ == "__main__":
	    tabbed.launch()