Jordan Legg committed
Commit • ed8e391
Parent(s): a71870f
working
app.py
CHANGED
@@ -1,29 +1,34 @@
 import gradio as gr
 from transformers import T5TokenizerFast, CLIPTokenizer
 
+
 def count_tokens(text):
+
     # Load the common tokenizers
     t5_tokenizer = T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl", legacy=False)
     clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
 
     # Get tokens and their IDs
-    t5_tokens = t5_tokenizer.encode(text, return_tensors="pt")[0].tolist()
-    clip_tokens = clip_tokenizer.encode(text)
+    t5_tokens = t5_tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)[0].tolist()
+    clip_tokens = clip_tokenizer.encode(text, add_special_tokens=True)
 
-    # Decode individual tokens for display,
+    # Decode individual tokens for display, explicitly setting skip_special_tokens=False
     t5_decoded = []
     for token in t5_tokens:
-        decoded = t5_tokenizer.decode([token])
-        # Replace whitespace with visible characters and empty strings with special markers
+        decoded = t5_tokenizer.decode([token], skip_special_tokens=False)
         if decoded.isspace():
-            decoded = "␣"
+            decoded = "␣"
         elif decoded == "":
-            …
+            # Handle special tokens explicitly for T5
+            if token == 3:
+                decoded = "▁"  # Represent token ID 3 as ▁
+            else:
+                decoded = "∅"  # Default for other empty tokens
         t5_decoded.append(decoded)
 
     clip_decoded = []
     for token in clip_tokens:
-        decoded = clip_tokenizer.decode([token])
+        decoded = clip_tokenizer.decode([token], skip_special_tokens=False)
         if decoded.isspace():
             decoded = "␣"
         elif decoded == "":
@@ -31,8 +36,8 @@ def count_tokens(text):
         clip_decoded.append(decoded)
 
     # Create highlighted text tuples (text, label)
-    t5_highlights = [(token, f"…
-    clip_highlights = [(token, f"…
+    t5_highlights = [(token, f"{i + 1}") for i, token in enumerate(t5_decoded)]
+    clip_highlights = [(token, f"{i + 1}") for i, token in enumerate(clip_decoded)]
 
     return (
         # T5 outputs
@@ -75,4 +80,4 @@ with gr.Blocks(title="Common Diffusion Model Token Counter") as iface:
     )
 
 # Launch the app
-iface.launch(show_error=True)
+iface.launch(show_error=True, ssr_mode = False)
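The special-token handling added above rests on one observation: decoding T5 token ID 3 on its own yields an empty string, even though the ID is meaningful (it is the bare SentencePiece word-boundary piece "▁" in this vocabulary). A minimal sketch of that check, assuming the same transformers setup as app.py; exact behaviour can vary between transformers versions:

    from transformers import T5TokenizerFast

    # Hypothetical standalone check, not part of the committed app
    tokenizer = T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl", legacy=False)
    print(tokenizer.convert_ids_to_tokens(3))   # the raw piece, expected "▁"
    print(repr(tokenizer.decode([3])))          # typically '' when decoded in isolation,
                                                # which is why app.py maps ID 3 to a visible marker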
test.py
CHANGED
@@ -1,10 +1,21 @@
|
|
1 |
-
from
|
2 |
|
3 |
-
#
|
4 |
-
|
5 |
|
6 |
-
#
|
7 |
-
|
8 |
-
model_info_content = f.read()
|
9 |
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import T5TokenizerFast
|
2 |
|
3 |
+
# Initialize the tokenizer
|
4 |
+
tokenizer = T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl", legacy=False)
|
5 |
|
6 |
+
# Your specific token IDs
|
7 |
+
token_ids = [3, 23, 31, 51, 3, 12775, 3768, 5, 1]
|
|
|
8 |
|
9 |
+
# Decode the full sequence
|
10 |
+
full_text = tokenizer.decode(token_ids, skip_special_tokens=True)
|
11 |
+
print("\nFull decoded text:", full_text)
|
12 |
+
|
13 |
+
# Decode each token individually and print its text value
|
14 |
+
for token_id in token_ids:
|
15 |
+
# Decode each token without skipping special tokens
|
16 |
+
token_text = tokenizer.decode([token_id], skip_special_tokens=False)
|
17 |
+
print(f"Decoded token {token_id}: {token_text}")
|
18 |
+
|
19 |
+
# Convert token ID 3 to its token string
|
20 |
+
token_3_name = tokenizer.convert_ids_to_tokens(3)
|
21 |
+
print(f"Token ID 3 corresponds to: {token_3_name}")
|
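For context on the IDs in token_ids: in this T5 checkpoint ID 1 is the </s> end-of-sequence token and ID 0 is <pad>, so skip_special_tokens=True drops the trailing 1 from the full decode while the per-token loop (which keeps special tokens) still prints it. A small, optional sanity check under the same assumptions as test.py:

    from transformers import T5TokenizerFast

    tokenizer = T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl", legacy=False)
    print(tokenizer.eos_token, tokenizer.eos_token_id)   # expected: </s> 1
    print(tokenizer.pad_token, tokenizer.pad_token_id)   # expected: <pad> 0
    print(tokenizer.convert_ids_to_tokens([3, 23, 31, 51, 3, 12775, 3768, 5, 1]))  # raw SentencePiece pieces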