Rocketknight1 HF staff committed on
Commit
e3c4338
·
1 Parent(s): 343a9de

Upload tokenizer

Browse files
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "[BOS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "[CLS]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "[SEP]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "[MASK]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "[SEP]",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenization_hyena.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PreTrainedTokenizer, AddedToken
2
+ from typing import List, Optional, Union, Dict, Sequence, Tuple
3
+ from pathlib import Path
4
+ import json
5
+ import os
6
+
7
+
8
class HyenaDNATokenizer(PreTrainedTokenizer):
    """Character-level tokenizer with a fixed, built-in DNA vocabulary.

    Every character of the input text becomes one token. The vocabulary is
    hard-coded (seven special/reserved entries followed by the nucleotide
    characters ``A``, ``C``, ``G``, ``T``, ``N``), so no vocabulary file is
    read or written.
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(self,
                 model_max_length: int,
                 bos_token="[BOS]",
                 eos_token="[SEP]",
                 sep_token="[SEP]",
                 cls_token="[CLS]",
                 pad_token="[PAD]",
                 mask_token="[MASK]",
                 unk_token="[UNK]",
                 **kwargs):
        """Build the fixed vocabulary and initialise the base tokenizer.

        The vocabulary is not configurable. The ids are:
            "[CLS]": 0
            "[SEP]": 1
            "[BOS]": 2
            "[MASK]": 3
            "[PAD]": 4
            "[RESERVED]": 5
            "[UNK]": 6
            "A": 7, "C": 8, "G": 9, "T": 10, "N": 11
        Any other character is mapped to the "[UNK]" id (6) when tokens are
        converted to ids.

        Args:
            model_max_length (int): Model maximum sequence length.
            bos_token, eos_token, sep_token, cls_token, pad_token,
            mask_token, unk_token: Special-token strings forwarded to
                the base class.
            **kwargs: Forwarded to the base class. ``add_prefix_space``
                defaults to ``False`` and ``padding_side`` defaults to
                ``"left"`` when not supplied.
        """
        self.characters = ('A', 'C', 'G', 'T', 'N')
        self.model_max_length = model_max_length

        # The lookup tables must exist before super().__init__() runs.
        # NOTE(review): presumably because the base constructor may query
        # the vocabulary while registering special tokens — confirm
        # against the installed transformers version.
        self._vocab_str_to_int = {
            "[CLS]": 0,
            "[SEP]": 1,
            "[BOS]": 2,
            "[MASK]": 3,
            "[PAD]": 4,
            "[RESERVED]": 5,
            "[UNK]": 6,
            **{ch: i + 7 for i, ch in enumerate(self.characters)},
        }
        self._vocab_int_to_str = {v: k for k, v in self._vocab_str_to_int.items()}
        add_prefix_space = kwargs.pop("add_prefix_space", False)
        padding_side = kwargs.pop("padding_side", "left")

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            unk_token=unk_token,
            add_prefix_space=add_prefix_space,
            model_max_length=model_max_length,
            padding_side=padding_side,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Number of entries in the fixed vocabulary."""
        return len(self._vocab_str_to_int)

    def _tokenize(self, text: str) -> List[str]:
        """Split ``text`` into single-character tokens."""
        return list(text)

    def _convert_token_to_id(self, token: str) -> int:
        """Return the id for ``token``, or the "[UNK]" id if it is unknown."""
        return self._vocab_str_to_int.get(token, self._vocab_str_to_int["[UNK]"])

    def _convert_id_to_token(self, index: int) -> str:
        """Return the token for ``index``.

        Raises:
            KeyError: If ``index`` is not an id of the fixed vocabulary.
        """
        return self._vocab_int_to_str[index]

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Concatenate ``tokens`` with no separator."""
        return "".join(tokens)

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """Return a mask with 1 at special-token positions and 0 elsewhere.

        When ``already_has_special_tokens`` is true the base-class
        implementation is used. Otherwise the mask mirrors the layout
        produced by ``build_inputs_with_special_tokens``: each sequence is
        followed by a single separator and nothing is prepended.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )

        # No leading 1: build_inputs_with_special_tokens does not prepend
        # [CLS], so a leading special position would make the mask one
        # element longer than the sequence it describes.
        result = ([0] * len(token_ids_0)) + [1]
        if token_ids_1 is not None:
            result += ([0] * len(token_ids_1)) + [1]
        return result

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Append the separator id after each sequence.

        Returns ``token_ids_0 + [sep]`` for a single sequence and
        ``token_ids_0 + [sep] + token_ids_1 + [sep]`` for a pair. The input
        lists are not modified.
        """
        sep = [self.sep_token_id]
        result = token_ids_0 + sep
        if token_ids_1 is not None:
            result += token_ids_1 + sep
        return result

    def get_vocab(self) -> Dict[str, int]:
        """Return a copy of the token-to-id mapping.

        A copy is returned so that callers mutating the result cannot
        desynchronise the internal forward and reverse lookup tables.
        """
        return dict(self._vocab_str_to_int)

    # HyenaDNA has a fixed vocabulary with no vocab file
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple:
        """Write nothing and return an empty tuple (there is no vocab file)."""
        return ()
tokenizer_config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "[CLS]",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "[SEP]",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "[BOS]",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "[MASK]",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "[PAD]",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "6": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ }
52
+ },
53
+ "auto_map": {
54
+ "AutoTokenizer": [
55
+ "tokenization_hyena.HyenaDNATokenizer",
56
+ null
57
+ ]
58
+ },
59
+ "bos_token": "[BOS]",
60
+ "clean_up_tokenization_spaces": true,
61
+ "cls_token": "[CLS]",
62
+ "eos_token": "[SEP]",
63
+ "mask_token": "[MASK]",
64
+ "model_max_length": 450002,
65
+ "pad_token": "[PAD]",
66
+ "padding_side": "left",
67
+ "sep_token": "[SEP]",
68
+ "tokenizer_class": "HyenaDNATokenizer",
69
+ "unk_token": "[UNK]"
70
+ }