Error when using newly pretrained FAST
#14
by
pengzhenghao97
- opened
Hi, how can I use the newly pretrained FAST? Thanks!
import numpy as np
from transformers import AutoProcessor

# Download the FAST processor from the Hugging Face model hub.
# The pre-trained tokenizer weights are not used below — only the source
# code is needed so a fresh tokenizer can be fit on our own data.
tokenizer = AutoProcessor.from_pretrained(
    "physical-intelligence/fast", trust_remote_code=True
)

# Dummy action chunks for tokenizer training.
# Chunks do not need to share a common length.
action_data = np.random.rand(4000, 50, 14)

# Fit the new tokenizer; depending on dataset size this can take minutes.
tokenizer = tokenizer.fit(action_data)

# Persist the fitted tokenizer (optionally push it to the hub), then reload it.
tokenizer.save_pretrained("debug")
tokenizer = AutoProcessor.from_pretrained("debug", trust_remote_code=True)

# Encode the action data and decode the tokens back into actions.
tokens = tokenizer(action_data)
decoded_actions = tokenizer.decode(tokens)
Error message:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[6], line 22
17 tokenizer.save_pretrained("debug")
21 tokenizer = AutoProcessor.from_pretrained("debug", trust_remote_code=True)
---> 22 tokens = tokenizer(action_data)
23 decoded_actions = tokenizer.decode(tokens)
File ~/anaconda3/envs/go/envs/infgen/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2877, in PreTrainedTokenizerBase.__call__(self, text, text_pair, text_target, text_pair_target, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, padding_side, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2875 if not self._in_target_context_manager:
2876 self._switch_to_input_mode()
-> 2877 encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
2878 if text_target is not None:
2879 self._switch_to_target_mode()
File ~/anaconda3/envs/go/envs/infgen/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2937, in PreTrainedTokenizerBase._call_one(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, padding_side, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, split_special_tokens, **kwargs)
2934 return False
2936 if not _is_valid_text_input(text):
-> 2937 raise ValueError(
2938 "text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) "
2939 "or `List[List[str]]` (batch of pretokenized examples)."
2940 )
2942 if text_pair is not None and not _is_valid_text_input(text_pair):
2943 raise ValueError(
2944 "text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) "
2945 "or `List[List[str]]` (batch of pretokenized examples)."
2946 )
ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).