speecht5_tts_jvs_ver1_e20_openjtalk_longer_20230809-031157_tokenizer
Changed files: speecht5_openjtalk_tokenizer.py (+43 -21)

speecht5_openjtalk_tokenizer.py
CHANGED
@@ -1,4 +1,6 @@
 import json
+import logging
+import os
 from pathlib import Path
 import re
 from transformers import SpeechT5Tokenizer
@@ -6,9 +8,11 @@ from transformers.models.speecht5.tokenization_speecht5 import (
     PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
 )
 from itertools import chain
-from typing import List, Optional
+from typing import List, Optional, Tuple


+logger = logging.getLogger(__name__)
+
 NP_CHARCTERS = " !\"#$%&'()=~|`{+*}<>?_-^\\@[;:],./ !”#$%&’()=~|`{+*}<>?_ー^¥@「;:」、。・`"

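The module-level logger introduced in this hunk is what the reworked save_vocabulary below uses to report a bad save path. A minimal sketch of making those records visible from a calling script; the handler configuration is illustrative and not part of this commit:

import logging

# Illustrative only: route records from this module's logger (and any other
# stdlib logger) to stderr with timestamps, so the save_vocabulary error
# "Vocabulary path (...) should be a directory" is easy to spot.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(name)s %(levelname)s: %(message)s",
)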
@@ -28,9 +32,21 @@ def _g2p_with_np(text: str, np_lsit: str) -> List[str]:
     )


+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "tokenizer_file": "tokenizer.json",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "esnya/japanese_speecht5_tts": "https://huggingface.co/esnya/japanese_speecht5_tts/resolve/main/vocab.json",
+    },
+}
+
+
 class SpeechT5OpenjtalkTokenizer(SpeechT5Tokenizer):
-    vocab_files_names =
-    pretrained_vocab_files_map =
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     model_input_names = ["input_ids", "attention_mask"]

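With VOCAB_FILES_NAMES and PRETRAINED_VOCAB_FILES_MAP now defined in the module and wired into the class attributes, from_pretrained can resolve vocab.json for the mapped checkpoint on its own. A minimal usage sketch, assuming this file is importable, the repo's custom loading works as intended, and the OpenJTalk bindings used by the g2p helper are installed (the repo id comes from the map above; the sample sentence is arbitrary):

from speecht5_openjtalk_tokenizer import SpeechT5OpenjtalkTokenizer

# vocab.json is located through pretrained_vocab_files_map / vocab_files_names,
# so no local vocabulary file has to be passed in explicitly.
tokenizer = SpeechT5OpenjtalkTokenizer.from_pretrained("esnya/japanese_speecht5_tts")

# _tokenize feeds text through the OpenJTalk-based g2p with NP_CHARCTERS as the
# pass-through characters, so plain Japanese text can be tokenized directly.
ids = tokenizer("こんにちは、世界。").input_ids
print(ids)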
@@ -41,7 +57,6 @@ class SpeechT5OpenjtalkTokenizer(SpeechT5Tokenizer):
         eos_token: str = "</s>",
         unk_token: str = "<unk>",
         pad_token: str = "<pad>",
-        mask_token: str = "<mask>",
         non_phenome_characters: str = NP_CHARCTERS,
         **kwargs,
     ):
@@ -93,28 +108,35 @@ class SpeechT5OpenjtalkTokenizer(SpeechT5Tokenizer):
     ):
         if filename_prefix is None:
             filename_prefix = ".json"
+
+        save_path = Path(save_directory)
+        if not save_path.is_dir():
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+
         vocab_path = Path(save_directory) / Path(f"vocab{filename_prefix}")
         vocab_path.parent.mkdir(parents=True, exist_ok=True)
         with open(vocab_path, "w", encoding="utf-8") as f:
             json.dump(self.label2id, f, ensure_ascii=False, indent=2)

-        special_tokens_path = Path(save_directory) / Path(
-            f"special_tokens_map{filename_prefix}"
-        )
-        with open(special_tokens_path, "w", encoding="utf-8") as f:
-            json.dump(
-                {
-                    "bos_token": self.bos_token,
-                    "eos_token": self.eos_token,
-                    "unk_token": self.unk_token,
-                    "pad_token": self.pad_token,
-                    "mask_token": self.mask_token,
-                },
-                f,
-                ensure_ascii=False,
-                indent=2,
-            )
+        # special_tokens_path = Path(save_directory) / Path(
+        #     f"special_tokens_map{filename_prefix}"
+        # )
+        # with open(special_tokens_path, "w", encoding="utf-8") as f:
+        #     json.dump(
+        #         {
+        #             "bos_token": self.bos_token,
+        #             "eos_token": self.eos_token,
+        #             "unk_token": self.unk_token,
+        #             "pad_token": self.pad_token,
+        #             "mask_token": self.mask_token,
+        #         },
+        #         f,
+        #         ensure_ascii=False,
+        #         indent=2,
+        #     )
+
+        return str(vocab_path), None  # str(special_tokens_path)

     def _tokenize(self, text: str) -> List[str]:
         return _g2p_with_np(text, self.non_phenome_characters)
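In the transformers API, save_vocabulary is expected to return the paths of the files it wrote, which the new return str(vocab_path), None provides; the manual special_tokens_map write is presumably commented out because save_pretrained already emits special_tokens_map.json itself. A round-trip sketch under those assumptions (the local directory name is illustrative):

from speecht5_openjtalk_tokenizer import SpeechT5OpenjtalkTokenizer

tokenizer = SpeechT5OpenjtalkTokenizer.from_pretrained("esnya/japanese_speecht5_tts")

# save_pretrained calls save_vocabulary internally; after this commit that call
# checks the target directory, writes vocab.json, and returns its path, while
# special_tokens_map.json and tokenizer_config.json come from the base class.
save_dir = "./japanese_speecht5_tokenizer"  # illustrative path
tokenizer.save_pretrained(save_dir)

# Reload from the local copy to confirm the round trip.
reloaded = SpeechT5OpenjtalkTokenizer.from_pretrained(save_dir)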