Hiroaki Ogasawara committed on
Commit
31fce62
1 Parent(s): 2638c2c

chore: Gemma2 as option, refactor

Browse files
Files changed (1) hide show
  1. app.py +42 -14
app.py CHANGED
@@ -22,7 +22,11 @@ def process_jsonl_file(jsonl_file_path: str, api_key: str):
22
  file_name, _ = os.path.splitext(file_name_with_ext)
23
 
24
  with tempfile.NamedTemporaryFile(
25
- delete=False, prefix=f"{file_name}-report-", suffix=".html", mode="w", encoding="utf-8"
 
 
 
 
26
  ) as temp_file:
27
  temp_file.write(html_content)
28
  output_file = temp_file.name
@@ -35,7 +39,9 @@ def process_jsonl_file(jsonl_file_path: str, api_key: str):
35
  # Gradioデモ
36
  with gr.Blocks() as reporting:
37
  jsonl_input = gr.File(label="JSONLファイルをアップロード")
38
- api_key_input = gr.Textbox(label="GeminiのAPIキー(スコアのセルフ評価を行う場合)", type="password")
 
 
39
  gr.Markdown("APIキーの発行は[こちら](https://aistudio.google.com/app/apikey)")
40
  process_button = gr.Button("レポートを作成")
41
 
@@ -43,44 +49,66 @@ with gr.Blocks() as reporting:
43
  output_text = gr.Textbox(label="システムメッセージ")
44
 
45
  process_button.click(
46
- process_jsonl_file, inputs=[jsonl_input, api_key_input], outputs=[output_file, output_text]
 
 
47
  )
48
 
49
  llm_jp_3 = "llm-jp/llm-jp-3-1.8b"
50
  gemma_2 = "google/gemma-2-2b"
51
 
52
  llm_jp_3_tokenizer = AutoTokenizer.from_pretrained(llm_jp_3, trust_remote_code=True)
53
- gemma_2_tokenizer = AutoTokenizer.from_pretrained(gemma_2, trust_remote_code=True)
54
-
55
  tokenizers = {
56
  "LLM-JP-3": llm_jp_3_tokenizer,
57
- "Gemma-2": gemma_2_tokenizer
58
  }
59
 
 
 
 
 
 
 
 
 
 
60
  def tokenize_text(text: str, tokenizer_name: str):
61
  tokenizer = tokenizers[tokenizer_name]
62
  tokens = tokenizer.tokenize(text)
63
- colors = ['#FFCCCC', '#CCFFCC', '#CCCCFF', '#FFFFCC', '#CCFFFF', '#FFCCFF']
64
- tokenized_text = ''.join([f'<span style="background-color:{colors[i % len(colors)]}">{token}</span> ' for i, token in enumerate(tokens)])
 
 
 
 
 
65
  token_count = len(tokens)
66
  return f"<p>{tokenized_text}</p><p>Token Count: {token_count}</p>"
67
 
 
68
  with gr.Blocks() as tokenization:
69
  with gr.Row():
70
- tokenizer_dropdown = gr.Dropdown(label="Tokenizerを選択", choices=["LLM-JP-3", "Gemma-2"], value="LLM-JP-3")
 
 
71
  with gr.Row():
72
  with gr.Column():
73
  text_input = gr.Textbox(label="Input Text")
74
  with gr.Column():
75
- tokenized_output = gr.HTML(label="Tokenized Output")
 
 
76
 
77
- tokenizer_dropdown.change(tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output)
78
- text_input.change(tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output)
 
 
 
 
79
 
80
  tabbed = gr.TabbedInterface(
81
- [reporting, tokenization],
82
  tab_names=["ELYZA-tasks-100(-TV) セルフ評価", "トークン化の可視化"],
83
- title="LLM開発支援ツール"
84
  )
85
 
86
  if __name__ == "__main__":
 
22
  file_name, _ = os.path.splitext(file_name_with_ext)
23
 
24
  with tempfile.NamedTemporaryFile(
25
+ delete=False,
26
+ prefix=f"{file_name}-report-",
27
+ suffix=".html",
28
+ mode="w",
29
+ encoding="utf-8",
30
  ) as temp_file:
31
  temp_file.write(html_content)
32
  output_file = temp_file.name
 
39
  # Gradioデモ
40
  with gr.Blocks() as reporting:
41
  jsonl_input = gr.File(label="JSONLファイルをアップロード")
42
+ api_key_input = gr.Textbox(
43
+ label="GeminiのAPIキー(スコアのセルフ評価を行う場合)", type="password"
44
+ )
45
  gr.Markdown("APIキーの発行は[こちら](https://aistudio.google.com/app/apikey)")
46
  process_button = gr.Button("レポートを作成")
47
 
 
49
  output_text = gr.Textbox(label="システムメッセージ")
50
 
51
  process_button.click(
52
+ process_jsonl_file,
53
+ inputs=[jsonl_input, api_key_input],
54
+ outputs=[output_file, output_text],
55
  )
56
 
57
  llm_jp_3 = "llm-jp/llm-jp-3-1.8b"
58
  gemma_2 = "google/gemma-2-2b"
59
 
60
  llm_jp_3_tokenizer = AutoTokenizer.from_pretrained(llm_jp_3, trust_remote_code=True)
 
 
61
  tokenizers = {
62
  "LLM-JP-3": llm_jp_3_tokenizer,
 
63
  }
64
 
65
+ try:
66
+ gemma_2_tokenizer = AutoTokenizer.from_pretrained(gemma_2, trust_remote_code=True)
67
+ tokenizers["Gemma-2"] = gemma_2_tokenizer
68
+ except OSError as e:
69
+ print(e)
70
+
71
+ tokenizer_names = list(tokenizers.keys())
72
+
73
+
74
  def tokenize_text(text: str, tokenizer_name: str):
75
  tokenizer = tokenizers[tokenizer_name]
76
  tokens = tokenizer.tokenize(text)
77
+ colors = ["#FFCCCC", "#CCFFCC", "#CCCCFF", "#FFFFCC", "#CCFFFF", "#FFCCFF"]
78
+ tokenized_text = "".join(
79
+ [
80
+ f'<span style="background-color:{colors[i % len(colors)]}">{token}</span> '
81
+ for i, token in enumerate(tokens)
82
+ ]
83
+ )
84
  token_count = len(tokens)
85
  return f"<p>{tokenized_text}</p><p>Token Count: {token_count}</p>"
86
 
87
+
88
  with gr.Blocks() as tokenization:
89
  with gr.Row():
90
+ tokenizer_dropdown = gr.Dropdown(
91
+ label="Tokenizerを選択", choices=tokenizer_names, value=tokenizer_names[0]
92
+ )
93
  with gr.Row():
94
  with gr.Column():
95
  text_input = gr.Textbox(label="Input Text")
96
  with gr.Column():
97
+ tokenized_output = gr.HTML(
98
+ tokenize_text("", tokenizer_names[0]), label="Tokenized Output"
99
+ )
100
 
101
+ tokenizer_dropdown.change(
102
+ tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output
103
+ )
104
+ text_input.change(
105
+ tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output
106
+ )
107
 
108
  tabbed = gr.TabbedInterface(
109
+ [reporting, tokenization],
110
  tab_names=["ELYZA-tasks-100(-TV) セルフ評価", "トークン化の可視化"],
111
+ title="LLM開発支援ツール",
112
  )
113
 
114
  if __name__ == "__main__":