Upload tokenization_deepseek_fast.py with huggingface_hub
Browse files
tokenization_deepseek_fast.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Optional, Union
|
2 |
+
|
3 |
+
|
4 |
+
from transformers.models.llama import LlamaTokenizerFast
|
5 |
+
|
6 |
+
|
7 |
+
class DeepseekTokenizerFast(LlamaTokenizerFast):
    """Fast tokenizer for Deepseek models.

    Thin wrapper around `LlamaTokenizerFast` that overrides id->token
    conversion so ids with no vocabulary entry map to the empty string
    instead of `None`, guaranteeing callers always receive `str` tokens.
    """

    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `List[str]`: The decoded token(s). Unknown ids become `""`.
        """
        if isinstance(ids, int):
            return self._convert_id_to_token(ids)
        tokens = []
        for index in ids:
            # int() accepts integer-like scalars (e.g. numpy/torch ids)
            # — TODO confirm callers actually pass those.
            index = int(index)
            if skip_special_tokens and index in self.all_special_ids:
                continue
            token = self._tokenizer.id_to_token(index)
            # id_to_token returns None for out-of-vocab ids; substitute ""
            # so the returned list contains only strings.
            tokens.append(token if token is not None else "")
        return tokens

    def _convert_id_to_token(self, index: int) -> str:
        """Convert a single id to its token, returning `""` for unknown ids.

        Note: annotated `str` (not `Optional[str]`) because the `None`
        case from `id_to_token` is explicitly mapped to the empty string.
        """
        token = self._tokenizer.id_to_token(int(index))
        return token if token is not None else ""
|