Add converted SpeechT5-TTS model

Browse files

Files changed (12) hide show

added_tokens.json +4 -0
config.json +92 -0
decoder_model.onnx +3 -0
decoder_postnet_and_vocoder.onnx +3 -0
decoder_with_past_model.onnx +3 -0
encoder_model.onnx +3 -0
generation_config.json +9 -0
preprocessor_config.json +19 -0
special_tokens_map.json +13 -0
spm_char.model +3 -0
tokenizer.json +231 -0
tokenizer_config.json +64 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "<ctc_blank>": 80,
+  "<mask>": 79
+}

config.json ADDED Viewed

	@@ -0,0 +1,92 @@

+{
+  "_attn_implementation_autoset": true,
+  "_name_or_path": "microsoft/speecht5_tts",
+  "activation_dropout": 0.1,
+  "apply_spec_augment": true,
+  "architectures": [
+    "SpeechT5ForTextToSpeech"
+  ],
+  "attention_dropout": 0.1,
+  "bos_token_id": 0,
+  "conv_bias": false,
+  "conv_dim": [
+    512,
+    512,
+    512,
+    512,
+    512,
+    512,
+    512
+  ],
+  "conv_kernel": [
+    10,
+    3,
+    3,
+    3,
+    3,
+    2,
+    2
+  ],
+  "conv_stride": [
+    5,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2
+  ],
+  "decoder_attention_heads": 12,
+  "decoder_ffn_dim": 3072,
+  "decoder_layerdrop": 0.1,
+  "decoder_layers": 6,
+  "decoder_start_token_id": 2,
+  "encoder_attention_heads": 12,
+  "encoder_ffn_dim": 3072,
+  "encoder_layerdrop": 0.1,
+  "encoder_layers": 12,
+  "encoder_max_relative_position": 160,
+  "eos_token_id": 2,
+  "feat_extract_activation": "gelu",
+  "feat_extract_norm": "group",
+  "feat_proj_dropout": 0.0,
+  "guided_attention_loss_num_heads": 2,
+  "guided_attention_loss_scale": 10.0,
+  "guided_attention_loss_sigma": 0.4,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "is_encoder_decoder": true,
+  "layer_norm_eps": 1e-05,
+  "mask_feature_length": 10,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0.0,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_prob": 0.05,
+  "max_length": null,
+  "max_speech_positions": 1876,
+  "max_text_positions": 600,
+  "model_type": "speecht5",
+  "num_conv_pos_embedding_groups": 16,
+  "num_conv_pos_embeddings": 128,
+  "num_feat_extract_layers": 7,
+  "num_mel_bins": 80,
+  "pad_token_id": 1,
+  "positional_dropout": 0.1,
+  "reduction_factor": 2,
+  "scale_embedding": false,
+  "speaker_embedding_dim": 512,
+  "speech_decoder_postnet_dropout": 0.5,
+  "speech_decoder_postnet_kernel": 5,
+  "speech_decoder_postnet_layers": 5,
+  "speech_decoder_postnet_units": 256,
+  "speech_decoder_prenet_dropout": 0.5,
+  "speech_decoder_prenet_layers": 2,
+  "speech_decoder_prenet_units": 256,
+  "transformers_version": "4.47.1",
+  "use_cache": true,
+  "use_guided_attention_loss": true,
+  "vocab_size": 81
+}

decoder_model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b728cc3f6690877746a4812756df943bcfb905809be36f11c9eb99c5134b0549
+size 238458898

decoder_postnet_and_vocoder.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38d3063378521355d0d584a659cc9081c23c2a0990eab00124b4c438bd8a3d54
+size 55432027

decoder_with_past_model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:074b4da50603fd2db23db3321d39d9599f3e9e2bf3564bf073cec11fd77d6892
+size 210090457

encoder_model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:092ee327c5d7d6691d34feadde6b21fc2b280f8e5658543388d40ee69f967b94
+size 342803471

generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "decoder_start_token_id": 2,
+  "eos_token_id": 2,
+  "max_length": 1876,
+  "pad_token_id": 1,
+  "transformers_version": "4.47.1"
+}

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "do_normalize": false,
+  "feature_extractor_type": "SpeechT5FeatureExtractor",
+  "feature_size": 1,
+  "fmax": 7600,
+  "fmin": 80,
+  "frame_signal_scale": 1.0,
+  "hop_length": 16,
+  "mel_floor": 1e-10,
+  "num_mel_bins": 80,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "processor_class": "SpeechT5Processor",
+  "reduction_factor": 2,
+  "return_attention_mask": true,
+  "sampling_rate": 16000,
+  "win_function": "hann_window",
+  "win_length": 64
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

spm_char.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7fcc48f3e225f627b1641db410ceb0c8649bd2b0c982e150b03f8be3728ab560
+size 238473

tokenizer.json ADDED Viewed

	@@ -0,0 +1,231 @@

+{
+    "version": "1.0",
+    "truncation": null,
+    "padding": null,
+    "added_tokens": [
+        {
+            "id": 0,
+            "content": "<s>",
+            "single_word": false,
+            "lstrip": false,
+            "rstrip": false,
+            "normalized": false,
+            "special": true
+        },
+        {
+            "id": 1,
+            "content": "<pad>",
+            "single_word": false,
+            "lstrip": false,
+            "rstrip": false,
+            "normalized": false,
+            "special": true
+        },
+        {
+            "id": 2,
+            "content": "</s>",
+            "single_word": false,
+            "lstrip": false,
+            "rstrip": false,
+            "normalized": false,
+            "special": true
+        },
+        {
+            "id": 3,
+            "content": "<unk>",
+            "single_word": false,
+            "lstrip": false,
+            "rstrip": false,
+            "normalized": false,
+            "special": true
+        },
+        {
+            "id": 79,
+            "content": "<mask>",
+            "single_word": false,
+            "lstrip": false,
+            "rstrip": false,
+            "normalized": false,
+            "special": true
+        },
+        {
+            "id": 80,
+            "content": "<ctc_blank>",
+            "single_word": false,
+            "lstrip": false,
+            "rstrip": false,
+            "normalized": false,
+            "special": true
+        }
+    ],
+    "normalizer": {
+        "type": "Precompiled",
+        "precompiled_charsmap": null
+    },
+    "pre_tokenizer": {
+        "type": "Sequence",
+        "pretokenizers": [
+            {
+                "type": "WhitespaceSplit"
+            },
+            {
+                "type": "Metaspace",
+                "replacement": "\u2581",
+                "add_prefix_space": true
+            },
+            {
+                "type": "Split",
+                "pattern": {
+                    "Regex": ""
+                },
+                "behavior": "Isolated",
+                "invert": false
+            }
+        ]
+    },
+    "post_processor": {
+        "type": "TemplateProcessing",
+        "single": [
+            {
+                "Sequence": {
+                    "id": "A",
+                    "type_id": 0
+                }
+            },
+            {
+                "SpecialToken": {
+                    "id": "</s>",
+                    "type_id": 0
+                }
+            }
+        ],
+        "pair": [
+            {
+                "Sequence": {
+                    "id": "A",
+                    "type_id": 0
+                }
+            },
+            {
+                "SpecialToken": {
+                    "id": "</s>",
+                    "type_id": 0
+                }
+            },
+            {
+                "Sequence": {
+                    "id": "B",
+                    "type_id": 0
+                }
+            },
+            {
+                "SpecialToken": {
+                    "id": "</s>",
+                    "type_id": 0
+                }
+            }
+        ],
+        "special_tokens": {
+            "</s>": {
+                "id": "</s>",
+                "ids": [
+                    2
+                ],
+                "tokens": [
+                    "</s>"
+                ]
+            }
+        }
+    },
+    "decoder": {
+        "type": "Metaspace",
+        "replacement": "\u2581",
+        "add_prefix_space": true
+    },
+    "model": {
+        "unk_id": 3,
+        "vocab": {
+            "<s>": 0,
+            "<pad>": 1,
+            "</s>": 2,
+            "<unk>": 3,
+            "\u2581": 4,
+            "e": 5,
+            "t": 6,
+            "a": 7,
+            "o": 8,
+            "n": 9,
+            "i": 10,
+            "h": 11,
+            "s": 12,
+            "r": 13,
+            "d": 14,
+            "l": 15,
+            "u": 16,
+            "c": 17,
+            "m": 18,
+            "f": 19,
+            "w": 20,
+            "g": 21,
+            "y": 22,
+            ",": 23,
+            "p": 24,
+            "b": 25,
+            ".": 26,
+            "v": 27,
+            "k": 28,
+            "\"": 29,
+            "I": 30,
+            "'": 31,
+            "T": 32,
+            "A": 33,
+            "S": 34,
+            "H": 35,
+            ";": 36,
+            "x": 37,
+            "W": 38,
+            "-": 39,
+            "B": 40,
+            "?": 41,
+            "C": 42,
+            "M": 43,
+            "!": 44,
+            "q": 45,
+            "j": 46,
+            "E": 47,
+            "N": 48,
+            "P": 49,
+            "O": 50,
+            "D": 51,
+            "L": 52,
+            "G": 53,
+            "R": 54,
+            "F": 55,
+            "Y": 56,
+            "z": 57,
+            "J": 58,
+            ":": 59,
+            "K": 60,
+            "U": 61,
+            "V": 62,
+            ")": 63,
+            "(": 64,
+            "Q": 65,
+            "Z": 66,
+            "]": 67,
+            "[": 68,
+            "X": 69,
+            "\u2014": 70,
+            "/": 71,
+            "\u00e6": 72,
+            "\u00e9": 73,
+            "{": 74,
+            "}": 75,
+            "\u00ea": 76,
+            "\u0153": 77,
+            "\u0304": 78,
+            "<mask>": 79,
+            "<ctc_blank>": 80
+        }
+    }
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,64 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "79": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "80": {
+      "content": "<ctc_blank>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 600,
+  "normalize": false,
+  "pad_token": "<pad>",
+  "processor_class": "SpeechT5Processor",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "SpeechT5Tokenizer",
+  "unk_token": "<unk>"
+}