Jordan Legg committed
Commit ed8e391 • 1 Parent(s): a71870f
Files changed (2):
  1. app.py +16 -11
  2. test.py +18 -7
app.py CHANGED
@@ -1,29 +1,34 @@
 import gradio as gr
 from transformers import T5TokenizerFast, CLIPTokenizer
 
+
 def count_tokens(text):
+
     # Load the common tokenizers
     t5_tokenizer = T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl", legacy=False)
     clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
 
     # Get tokens and their IDs
-    t5_tokens = t5_tokenizer.encode(text, return_tensors="pt")[0].tolist()
-    clip_tokens = clip_tokenizer.encode(text)
+    t5_tokens = t5_tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)[0].tolist()
+    clip_tokens = clip_tokenizer.encode(text, add_special_tokens=True)
 
-    # Decode individual tokens for display, replacing whitespace with visible characters
+    # Decode individual tokens for display, explicitly setting skip_special_tokens=False
     t5_decoded = []
     for token in t5_tokens:
-        decoded = t5_tokenizer.decode([token])
-        # Replace whitespace with visible characters and empty strings with special markers
+        decoded = t5_tokenizer.decode([token], skip_special_tokens=False)
         if decoded.isspace():
-            decoded = "␣"  # visible space marker
+            decoded = "␣"
         elif decoded == "":
-            decoded = "∅"  # empty token marker
+            # Handle special tokens explicitly for T5
+            if token == 3:
+                decoded = "▁"  # Represent token ID 3 as ▁
+            else:
+                decoded = "∅"  # Default for other empty tokens
         t5_decoded.append(decoded)
 
     clip_decoded = []
     for token in clip_tokens:
-        decoded = clip_tokenizer.decode([token])
+        decoded = clip_tokenizer.decode([token], skip_special_tokens=False)
         if decoded.isspace():
             decoded = "␣"
         elif decoded == "":
@@ -31,8 +36,8 @@ def count_tokens(text):
         clip_decoded.append(decoded)
 
     # Create highlighted text tuples (text, label)
-    t5_highlights = [(token, f"Token {i}") for i, token in enumerate(t5_decoded)]
-    clip_highlights = [(token, f"Token {i}") for i, token in enumerate(clip_decoded)]
+    t5_highlights = [(token, f"{i + 1}") for i, token in enumerate(t5_decoded)]
+    clip_highlights = [(token, f"{i + 1}") for i, token in enumerate(clip_decoded)]
 
     return (
         # T5 outputs
@@ -75,4 +80,4 @@ with gr.Blocks(title="Common Diffusion Model Token Counter") as iface:
 )
 
 # Launch the app
-iface.launch(show_error=True)
+iface.launch(show_error=True, ssr_mode=False)
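
The core of this change is how count_tokens() labels T5 tokens that decode to an empty string: decoded one at a time with skip_special_tokens=False, the bare SentencePiece word-boundary piece (token ID 3 in this vocabulary, per the commit and test.py below) comes back as "", so it is now rendered as "▁" instead of the generic "∅" marker. Below is a minimal sketch of that per-token loop outside Gradio, assuming the same google/t5-v1_1-xxl checkpoint and an arbitrary sample prompt; it is not part of the commit. The CLIP branch works the same way, minus the token-ID-3 special case.

# Minimal sketch of the per-token decoding that count_tokens() now performs
# (assumes the same "google/t5-v1_1-xxl" checkpoint; the prompt is arbitrary).
from transformers import T5TokenizerFast

tokenizer = T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl", legacy=False)

# Encode a sample prompt, keeping special tokens such as the trailing </s>
token_ids = tokenizer.encode("a photo of an astronaut", add_special_tokens=True)

labels = []
for token_id in token_ids:
    decoded = tokenizer.decode([token_id], skip_special_tokens=False)
    if decoded.isspace():
        decoded = "␣"  # visible space marker
    elif decoded == "":
        decoded = "▁" if token_id == 3 else "∅"  # same fallback as app.py
    labels.append(decoded)

print(list(zip(token_ids, labels)))
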
test.py CHANGED
@@ -1,10 +1,21 @@
-from huggingface_hub import hf_hub_download
+from transformers import T5TokenizerFast
 
-# Replace "model_name" with the actual model name
-model_info_path = hf_hub_download("shuttleai/shuttle-3-diffusion", filename="model_index.json")
+# Initialize the tokenizer
+tokenizer = T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl", legacy=False)
 
-# Now you can read the contents of the file
-with open(model_info_path, "r") as f:
-    model_info_content = f.read()
+# Your specific token IDs
+token_ids = [3, 23, 31, 51, 3, 12775, 3768, 5, 1]
 
-print(model_info_content)
+# Decode the full sequence
+full_text = tokenizer.decode(token_ids, skip_special_tokens=True)
+print("\nFull decoded text:", full_text)
+
+# Decode each token individually and print its text value
+for token_id in token_ids:
+    # Decode each token without skipping special tokens
+    token_text = tokenizer.decode([token_id], skip_special_tokens=False)
+    print(f"Decoded token {token_id}: {token_text}")
+
+# Convert token ID 3 to its token string
+token_3_name = tokenizer.convert_ids_to_tokens(3)
+print(f"Token ID 3 corresponds to: {token_3_name}")