speecht5_tts_jvs_ver1_e20_openjtalk_longer_20230809-031157_tokenizer
Changed files: speecht5_openjtalk_tokenizer.py (+43 -21)

speecht5_openjtalk_tokenizer.py
CHANGED
@@ -1,4 +1,6 @@
 import json
+import logging
+import os
 from pathlib import Path
 import re
 from transformers import SpeechT5Tokenizer
@@ -6,9 +8,11 @@ from transformers.models.speecht5.tokenization_speecht5 import (
     PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
 )
 from itertools import chain
-from typing import List, Optional
+from typing import List, Optional, Tuple


+logger = logging.getLogger(__name__)
+
 NP_CHARCTERS = " !\"#$%&'()=~|`{+*}<>?_-^\\@[;:],./ !”#$%&’()=~|`{+*}<>?_ー^¥@「;:」、。・`"

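The module-level logger introduced in this hunk is what the reworked save_vocabulary below uses to report a bad save path. A minimal sketch of making those records visible from a calling script; the handler configuration is illustrative and not part of this commit:

import logging

# Illustrative only: route records from this module's logger (and any other
# stdlib logger) to stderr with timestamps, so the save_vocabulary error
# "Vocabulary path (...) should be a directory" is easy to spot.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(name)s %(levelname)s: %(message)s",
)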
@@ -28,9 +32,21 @@ def _g2p_with_np(text: str, np_lsit: str) -> List[str]:
     )


+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "tokenizer_file": "tokenizer.json",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "esnya/japanese_speecht5_tts": "https://huggingface.co/esnya/japanese_speecht5_tts/resolve/main/vocab.json",
+    },
+}
+
+
 class SpeechT5OpenjtalkTokenizer(SpeechT5Tokenizer):
-    vocab_files_names =
-    pretrained_vocab_files_map =
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     model_input_names = ["input_ids", "attention_mask"]

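With VOCAB_FILES_NAMES and PRETRAINED_VOCAB_FILES_MAP now defined in the module and wired into the class attributes, from_pretrained can resolve vocab.json for the mapped checkpoint on its own. A minimal usage sketch, assuming this file is importable, the repo's custom loading works as intended, and the OpenJTalk bindings used by the g2p helper are installed (the repo id comes from the map above; the sample sentence is arbitrary):

from speecht5_openjtalk_tokenizer import SpeechT5OpenjtalkTokenizer

# vocab.json is located through pretrained_vocab_files_map / vocab_files_names,
# so no local vocabulary file has to be passed in explicitly.
tokenizer = SpeechT5OpenjtalkTokenizer.from_pretrained("esnya/japanese_speecht5_tts")

# _tokenize feeds text through the OpenJTalk-based g2p with NP_CHARCTERS as the
# pass-through characters, so plain Japanese text can be tokenized directly.
ids = tokenizer("こんにちは、世界。").input_ids
print(ids)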
@@ -41,7 +57,6 @@ class SpeechT5OpenjtalkTokenizer(SpeechT5Tokenizer):
         eos_token: str = "</s>",
         unk_token: str = "<unk>",
         pad_token: str = "<pad>",
-        mask_token: str = "<mask>",
         non_phenome_characters: str = NP_CHARCTERS,
         **kwargs,
     ):
@@ -93,28 +108,35 @@ class SpeechT5OpenjtalkTokenizer(SpeechT5Tokenizer):
     ):
         if filename_prefix is None:
             filename_prefix = ".json"
+
+        save_path = Path(save_directory)
+        if not save_path.is_dir():
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+
         vocab_path = Path(save_directory) / Path(f"vocab{filename_prefix}")
         vocab_path.parent.mkdir(parents=True, exist_ok=True)
         with open(vocab_path, "w", encoding="utf-8") as f:
             json.dump(self.label2id, f, ensure_ascii=False, indent=2)

-        special_tokens_path = Path(save_directory) / Path(
-            f"special_tokens_map{filename_prefix}"
-        )
-        with open(special_tokens_path, "w", encoding="utf-8") as f:
-            json.dump(
-                {
-                    "bos_token": self.bos_token,
-                    "eos_token": self.eos_token,
-                    "unk_token": self.unk_token,
-                    "pad_token": self.pad_token,
-                    "mask_token": self.mask_token,
-                },
-                f,
-                ensure_ascii=False,
-                indent=2,
-            )
+        # special_tokens_path = Path(save_directory) / Path(
+        #     f"special_tokens_map{filename_prefix}"
+        # )
+        # with open(special_tokens_path, "w", encoding="utf-8") as f:
+        #     json.dump(
+        #         {
+        #             "bos_token": self.bos_token,
+        #             "eos_token": self.eos_token,
+        #             "unk_token": self.unk_token,
+        #             "pad_token": self.pad_token,
+        #             "mask_token": self.mask_token,
+        #         },
+        #         f,
+        #         ensure_ascii=False,
+        #         indent=2,
+        #     )
+
+        return str(vocab_path), None  # str(special_tokens_path)

     def _tokenize(self, text: str) -> List[str]:
         return _g2p_with_np(text, self.non_phenome_characters)
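In the transformers API, save_vocabulary is expected to return the paths of the files it wrote, which the new return str(vocab_path), None provides; the manual special_tokens_map write is presumably commented out because save_pretrained already emits special_tokens_map.json itself. A round-trip sketch under those assumptions (the local directory name is illustrative):

from speecht5_openjtalk_tokenizer import SpeechT5OpenjtalkTokenizer

tokenizer = SpeechT5OpenjtalkTokenizer.from_pretrained("esnya/japanese_speecht5_tts")

# save_pretrained calls save_vocabulary internally; after this commit that call
# checks the target directory, writes vocab.json, and returns its path, while
# special_tokens_map.json and tokenizer_config.json come from the base class.
save_dir = "./japanese_speecht5_tokenizer"  # illustrative path
tokenizer.save_pretrained(save_dir)

# Reload from the local copy to confirm the round trip.
reloaded = SpeechT5OpenjtalkTokenizer.from_pretrained(save_dir)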