esnya committed
Commit ab5108b · 1 Parent(s): 12d4898

speecht5_tts_jvs_ver1_e20_openjtalk_longer_20230809-031157_tokenizer

Files changed (1)
  1. speecht5_openjtalk_tokenizer.py +43 -21
speecht5_openjtalk_tokenizer.py CHANGED
@@ -1,4 +1,6 @@
 import json
+import logging
+import os
 from pathlib import Path
 import re
 from transformers import SpeechT5Tokenizer
@@ -6,9 +8,11 @@ from transformers.models.speecht5.tokenization_speecht5 import (
     PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
 )
 from itertools import chain
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 
+logger = logging.getLogger(__name__)
+
 NP_CHARCTERS = " !\"#$%&'()=~|`{+*}<>?_-^\\@[;:],./ !”#$%&’()=~|`{+*}<>?_ー^¥@「;:」、。・`"
 
 
@@ -28,9 +32,21 @@ def _g2p_with_np(text: str, np_lsit: str) -> List[str]:
     )
 
 
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "tokenizer_file": "tokenizer.json",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "esnya/japanese_speecht5_tts": "https://huggingface.co/esnya/japanese_speecht5_tts/resolve/main/vocab.json",
+    },
+}
+
+
 class SpeechT5OpenjtalkTokenizer(SpeechT5Tokenizer):
-    vocab_files_names = {"vocab_file": "vocab.json"}
-    pretrained_vocab_files_map = {}
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     model_input_names = ["input_ids", "attention_mask"]
 
@@ -41,7 +57,6 @@ class SpeechT5OpenjtalkTokenizer(SpeechT5Tokenizer):
         eos_token: str = "</s>",
         unk_token: str = "<unk>",
         pad_token: str = "<pad>",
-        mask_token: str = "<mask>",
         non_phenome_characters: str = NP_CHARCTERS,
         **kwargs,
     ):
@@ -93,28 +108,35 @@ class SpeechT5OpenjtalkTokenizer(SpeechT5Tokenizer):
     ):
         if filename_prefix is None:
             filename_prefix = ".json"
+
+        save_path = Path(save_directory)
+        if not save_path.is_dir():
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+
         vocab_path = Path(save_directory) / Path(f"vocab{filename_prefix}")
         vocab_path.parent.mkdir(parents=True, exist_ok=True)
         with open(vocab_path, "w", encoding="utf-8") as f:
             json.dump(self.label2id, f, ensure_ascii=False, indent=2)
 
-        special_tokens_path = Path(save_directory) / Path(
-            f"special_tokens_map{filename_prefix}"
-        )
-        with open(special_tokens_path, "w", encoding="utf-8") as f:
-            json.dump(
-                {
-                    "bos_token": self.bos_token,
-                    "eos_token": self.eos_token,
-                    "unk_token": self.unk_token,
-                    "pad_token": self.pad_token,
-                    "mask_token": self.mask_token,
-                },
-                f,
-                ensure_ascii=False,
-                indent=2,
-            )
-        return str(vocab_path), str(special_tokens_path)
+        # special_tokens_path = Path(save_directory) / Path(
+        #     f"special_tokens_map{filename_prefix}"
+        # )
+        # with open(special_tokens_path, "w", encoding="utf-8") as f:
+        #     json.dump(
+        #         {
+        #             "bos_token": self.bos_token,
+        #             "eos_token": self.eos_token,
+        #             "unk_token": self.unk_token,
+        #             "pad_token": self.pad_token,
+        #             "mask_token": self.mask_token,
+        #         },
+        #         f,
+        #         ensure_ascii=False,
+        #         indent=2,
+        #     )
+
+        return str(vocab_path), None  # str(special_tokens_path)
 
     def _tokenize(self, text: str) -> List[str]:
         return _g2p_with_np(text, self.non_phenome_characters)
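
For context, here is a minimal usage sketch of the behaviour this commit changes in save_vocabulary: the method now validates that the target path is an existing directory (logging an error and returning None otherwise) and returns (vocab_path, None) instead of also writing special_tokens_map.json. The sketch assumes the custom class is importable from a local speecht5_openjtalk_tokenizer.py and that the esnya/japanese_speecht5_tts repo named in PRETRAINED_VOCAB_FILES_MAP can be loaded from; neither assumption is part of this commit.

# Hedged sketch, not part of the commit: exercising the updated save_vocabulary.
from pathlib import Path

# Assumes the module from this repo is on the Python path.
from speecht5_openjtalk_tokenizer import SpeechT5OpenjtalkTokenizer

# Loading path is an assumption; it relies on the repo referenced in
# PRETRAINED_VOCAB_FILES_MAP and may instead require a local checkout.
tokenizer = SpeechT5OpenjtalkTokenizer.from_pretrained("esnya/japanese_speecht5_tts")

out_dir = Path("./tokenizer_out")
out_dir.mkdir(parents=True, exist_ok=True)

# With a valid directory, only vocab.json is written; the second element is None.
vocab_path, special_tokens_path = tokenizer.save_vocabulary(str(out_dir))
print(vocab_path)           # e.g. tokenizer_out/vocab.json
print(special_tokens_path)  # None

# With a non-existent directory, the method logs an error and returns None.
assert tokenizer.save_vocabulary("./does_not_exist") is None

Returning None as the second element keeps the tuple shape expected by callers of save_vocabulary while dropping the hand-written special_tokens_map.json, which the base tokenizer machinery already produces during save_pretrained.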