Siddhant committed on
Commit
b014b02
·
1 Parent(s): fe9f27b

import from zenodo

Browse files
README.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: ja
7
+ datasets:
8
+ - jsut
9
+ license: cc-by-4.0
10
+ ---
11
+ ## Example ESPnet2 TTS model
12
+ ### `kan-bayashi/jsut_tacotron2`
13
+ ♻️ Imported from https://zenodo.org/record/3963886/
14
+
15
+ This model was trained by kan-bayashi using the jsut/tts1 recipe in [espnet](https://github.com/espnet/espnet/).
16
+ ### Demo: How to use in ESPnet2
17
+ ```python
18
+ # coming soon
19
+ ```
20
+ ### Citing ESPnet
21
+ ```bibtex
22
+ @inproceedings{watanabe2018espnet,
23
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson {Enrique Yalta Soplin} and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
24
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
25
+ year={2018},
26
+ booktitle={Proceedings of Interspeech},
27
+ pages={2207--2211},
28
+ doi={10.21437/Interspeech.2018-1456},
29
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
30
+ }
31
+ @inproceedings{hayashi2020espnet,
32
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
33
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
34
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
35
+ pages={7654--7658},
36
+ year={2020},
37
+ organization={IEEE}
38
+ }
39
+ ```
40
+ or arXiv:
41
+ ```bibtex
42
+ @misc{watanabe2018espnet,
43
+ title={ESPnet: End-to-End Speech Processing Toolkit},
44
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Enrique Yalta Soplin and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
45
+ year={2018},
46
+ eprint={1804.00015},
47
+ archivePrefix={arXiv},
48
+ primaryClass={cs.CL}
49
+ }
50
+ ```
exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz ADDED
Binary file (1.4 kB). View file
 
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/199epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0340ab7d52b1898cacfb8815134a83ddc2318b8f26fe6eed78b106c48673c67
3
+ size 106939103
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/config.yaml ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_tacotron2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_tacotron2_raw_batch_bins3750000
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ cudnn_enabled: true
21
+ cudnn_benchmark: false
22
+ cudnn_deterministic: true
23
+ collect_stats: false
24
+ write_collected_feats: false
25
+ max_epoch: 200
26
+ patience: null
27
+ val_scheduler_criterion:
28
+ - valid
29
+ - loss
30
+ early_stopping_criterion:
31
+ - valid
32
+ - loss
33
+ - min
34
+ best_model_criterion:
35
+ - - valid
36
+ - loss
37
+ - min
38
+ - - train
39
+ - loss
40
+ - min
41
+ keep_nbest_models: 5
42
+ grad_clip: 1.0
43
+ grad_noise: false
44
+ accum_grad: 1
45
+ no_forward_run: false
46
+ resume: true
47
+ train_dtype: float32
48
+ log_interval: null
49
+ pretrain_path: []
50
+ pretrain_key: []
51
+ num_iters_per_epoch: null
52
+ batch_size: 20
53
+ valid_batch_size: null
54
+ batch_bins: 3750000
55
+ valid_batch_bins: null
56
+ train_shape_file:
57
+ - exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape.phn
58
+ - exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/speech_shape
59
+ valid_shape_file:
60
+ - exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn
61
+ - exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/speech_shape
62
+ batch_type: numel
63
+ valid_batch_type: null
64
+ fold_length:
65
+ - 150
66
+ - 240000
67
+ sort_in_batch: descending
68
+ sort_batch: descending
69
+ multiple_iterator: false
70
+ chunk_length: 500
71
+ chunk_shift_ratio: 0.5
72
+ num_cache_chunks: 1024
73
+ train_data_path_and_name_and_type:
74
+ - - dump/raw/tr_no_dev/text
75
+ - text
76
+ - text
77
+ - - dump/raw/tr_no_dev/wav.scp
78
+ - speech
79
+ - sound
80
+ valid_data_path_and_name_and_type:
81
+ - - dump/raw/dev/text
82
+ - text
83
+ - text
84
+ - - dump/raw/dev/wav.scp
85
+ - speech
86
+ - sound
87
+ allow_variable_data_keys: false
88
+ max_cache_size: 0.0
89
+ valid_max_cache_size: null
90
+ optim: adam
91
+ optim_conf:
92
+ lr: 0.001
93
+ eps: 1.0e-06
94
+ weight_decay: 0.0
95
+ scheduler: null
96
+ scheduler_conf: {}
97
+ token_list:
98
+ - <blank>
99
+ - <unk>
100
+ - ty
101
+ - dy
102
+ - v
103
+ - py
104
+ - my
105
+ - by
106
+ - ny
107
+ - hy
108
+ - gy
109
+ - ry
110
+ - ky
111
+ - f
112
+ - p
113
+ - z
114
+ - ch
115
+ - ts
116
+ - j
117
+ - b
118
+ - y
119
+ - h
120
+ - cl
121
+ - I
122
+ - U
123
+ - w
124
+ - g
125
+ - d
126
+ - sh
127
+ - pau
128
+ - m
129
+ - N
130
+ - s
131
+ - r
132
+ - t
133
+ - n
134
+ - k
135
+ - e
136
+ - u
137
+ - i
138
+ - o
139
+ - a
140
+ - <sos/eos>
141
+ odim: null
142
+ model_conf: {}
143
+ use_preprocessor: true
144
+ token_type: phn
145
+ bpemodel: null
146
+ non_linguistic_symbols: null
147
+ cleaner: jaconv
148
+ g2p: pyopenjtalk
149
+ feats_extract: fbank
150
+ feats_extract_conf:
151
+ fs: 24000
152
+ fmin: 80
153
+ fmax: 7600
154
+ n_mels: 80
155
+ hop_length: 300
156
+ n_fft: 2048
157
+ win_length: 1200
158
+ normalize: global_mvn
159
+ normalize_conf:
160
+ stats_file: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz
161
+ tts: tacotron2
162
+ tts_conf:
163
+ embed_dim: 512
164
+ elayers: 1
165
+ eunits: 512
166
+ econv_layers: 3
167
+ econv_chans: 512
168
+ econv_filts: 5
169
+ atype: location
170
+ adim: 512
171
+ aconv_chans: 32
172
+ aconv_filts: 15
173
+ cumulate_att_w: true
174
+ dlayers: 2
175
+ dunits: 1024
176
+ prenet_layers: 2
177
+ prenet_units: 256
178
+ postnet_layers: 5
179
+ postnet_chans: 512
180
+ postnet_filts: 5
181
+ output_activation: null
182
+ use_batch_norm: true
183
+ use_concate: true
184
+ use_residual: false
185
+ dropout_rate: 0.5
186
+ zoneout_rate: 0.1
187
+ reduction_factor: 1
188
+ spk_embed_dim: null
189
+ use_masking: true
190
+ bce_pos_weight: 5.0
191
+ use_guided_attn_loss: true
192
+ guided_attn_loss_sigma: 0.4
193
+ guided_attn_loss_lambda: 1.0
194
+ required:
195
+ - output_dir
196
+ - token_list
197
+ distributed: false
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: 0.8.0
2
+ files:
3
+ model_file: exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/199epoch.pth
4
+ python: "3.7.3 (default, Mar 27 2019, 22:11:17) \n[GCC 7.3.0]"
5
+ timestamp: 1595923764.402962
6
+ torch: 1.5.1
7
+ yaml_files:
8
+ train_config: exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/config.yaml