akhaliq3 committed
Commit 2b7bf83
1 Parent(s): d25d456

spaces demo

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. ParallelWaveGAN/.github/FUNDING.yml +1 -0
  2. ParallelWaveGAN/.github/workflows/ci.yaml +97 -0
  3. ParallelWaveGAN/.gitignore +36 -0
  4. ParallelWaveGAN/LICENSE +21 -0
  5. ParallelWaveGAN/egs/README.md +165 -0
  6. ParallelWaveGAN/egs/arctic/voc1/cmd.sh +91 -0
  7. ParallelWaveGAN/egs/arctic/voc1/conf/parallel_wavegan.v1.yaml +122 -0
  8. ParallelWaveGAN/egs/arctic/voc1/conf/slurm.conf +12 -0
  9. ParallelWaveGAN/egs/arctic/voc1/local/data_download.sh +40 -0
  10. ParallelWaveGAN/egs/arctic/voc1/local/data_prep.sh +113 -0
  11. ParallelWaveGAN/egs/arctic/voc1/path.sh +33 -0
  12. ParallelWaveGAN/egs/arctic/voc1/run.sh +167 -0
  13. ParallelWaveGAN/egs/arctic/voc1/utils +1 -0
  14. ParallelWaveGAN/egs/csmsc/voc1/cmd.sh +91 -0
  15. ParallelWaveGAN/egs/csmsc/voc1/conf/hifigan.v1.yaml +180 -0
  16. ParallelWaveGAN/egs/csmsc/voc1/conf/multi_band_melgan.v2.yaml +150 -0
  17. ParallelWaveGAN/egs/csmsc/voc1/conf/parallel_wavegan.v1.yaml +122 -0
  18. ParallelWaveGAN/egs/csmsc/voc1/conf/slurm.conf +12 -0
  19. ParallelWaveGAN/egs/csmsc/voc1/conf/style_melgan.v1.yaml +147 -0
  20. ParallelWaveGAN/egs/csmsc/voc1/local/data_download.sh +32 -0
  21. ParallelWaveGAN/egs/csmsc/voc1/local/data_prep.sh +94 -0
  22. ParallelWaveGAN/egs/csmsc/voc1/path.sh +33 -0
  23. ParallelWaveGAN/egs/csmsc/voc1/run.sh +164 -0
  24. ParallelWaveGAN/egs/csmsc/voc1/utils +1 -0
  25. ParallelWaveGAN/egs/jnas/voc1/cmd.sh +91 -0
  26. ParallelWaveGAN/egs/jnas/voc1/conf/parallel_wavegan.v1.long.yaml +123 -0
  27. ParallelWaveGAN/egs/jnas/voc1/conf/parallel_wavegan.v1.yaml +122 -0
  28. ParallelWaveGAN/egs/jnas/voc1/conf/slurm.conf +12 -0
  29. ParallelWaveGAN/egs/jnas/voc1/conf/train_speakers.txt +261 -0
  30. ParallelWaveGAN/egs/jnas/voc1/local/data_prep.sh +89 -0
  31. ParallelWaveGAN/egs/jnas/voc1/path.sh +33 -0
  32. ParallelWaveGAN/egs/jnas/voc1/run.sh +158 -0
  33. ParallelWaveGAN/egs/jnas/voc1/utils +1 -0
  34. ParallelWaveGAN/egs/jsss/voc1/cmd.sh +91 -0
  35. ParallelWaveGAN/egs/jsss/voc1/conf/parallel_wavegan.v1.yaml +122 -0
  36. ParallelWaveGAN/egs/jsss/voc1/conf/slurm.conf +12 -0
  37. ParallelWaveGAN/egs/jsss/voc1/local/data_download.sh +41 -0
  38. ParallelWaveGAN/egs/jsss/voc1/local/data_prep.sh +180 -0
  39. ParallelWaveGAN/egs/jsss/voc1/path.sh +33 -0
  40. ParallelWaveGAN/egs/jsss/voc1/run.sh +186 -0
  41. ParallelWaveGAN/egs/jsss/voc1/utils +1 -0
  42. ParallelWaveGAN/egs/jsut/voc1/cmd.sh +91 -0
  43. ParallelWaveGAN/egs/jsut/voc1/conf/hifigan.v1.yaml +180 -0
  44. ParallelWaveGAN/egs/jsut/voc1/conf/multi_band_melgan.v2.yaml +150 -0
  45. ParallelWaveGAN/egs/jsut/voc1/conf/parallel_wavegan.v1.yaml +122 -0
  46. ParallelWaveGAN/egs/jsut/voc1/conf/slurm.conf +12 -0
  47. ParallelWaveGAN/egs/jsut/voc1/conf/style_melgan.v1.yaml +147 -0
  48. ParallelWaveGAN/egs/jsut/voc1/local/data_download.sh +39 -0
  49. ParallelWaveGAN/egs/jsut/voc1/local/data_prep.sh +93 -0
  50. ParallelWaveGAN/egs/jsut/voc1/path.sh +33 -0
ParallelWaveGAN/.github/FUNDING.yml ADDED
@@ -0,0 +1 @@
+ github: kan-bayashi
ParallelWaveGAN/.github/workflows/ci.yaml ADDED
@@ -0,0 +1,97 @@
+ name: CI
+
+ on:
+ push:
+ branches:
+ - master
+ pull_request:
+ branches:
+ - master
+ schedule:
+ - cron: 0 0 * * 1
+
+ jobs:
+ linter_and_test:
+ runs-on: ubuntu-20.04
+ strategy:
+ max-parallel: 5
+ matrix:
+ python-version: [3.6]
+ # 1.6 is failed on cpu: https://github.com/kan-bayashi/ParallelWaveGAN/issues/198
+ pytorch-version: [1.4, 1.5.1, 1.7.1, 1.8.1, 1.9]
+ steps:
+ - uses: actions/checkout@master
+ - uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ architecture: 'x64'
+ - uses: actions/cache@v2
+ with:
+ path: ~/.cache/pip
+ key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-${{ hashFiles('**/setup.py') }}
+ restore-keys: |
+ ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-
+ - name: Install dependencies
+ run: |
+ sudo apt-get install libsndfile-dev
+ # make python env
+ cd tools; make CUDA_VERSION="" PYTHON=python${{ matrix.python-version }} PYTORCH_VERSION=${{ matrix.pytorch-version }}
+ # install shell check
+ wget https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.x86_64.tar.xz
+ tar -xvf shellcheck-stable.linux.x86_64.tar.xz
+ - name: ShellCheck
+ run: |
+ export PATH=shellcheck-stable:$PATH
+ find egs -name "*.sh" | grep -v path.sh | while read line; do shellcheck -x --shell=bash -P $(dirname $line) ${line}; done
+ - name: Black & Flake8
+ run: |
+ source tools/venv/bin/activate
+ black --diff parallel_wavegan
+ flake8 parallel_wavegan
+ flake8 --extend-ignore=D test
+ - name: Pytest
+ run: |
+ source tools/venv/bin/activate
+ pytest test
+
+ integration:
+ runs-on: ubuntu-20.04
+ strategy:
+ max-parallel: 10
+ matrix:
+ python-version: [3.7]
+ pytorch-version: [1.9]
+ config:
+ - "parallel_wavegan.v1.debug.yaml"
+ - "melgan.v1.debug.yaml"
+ - "melgan.v3.debug.yaml"
+ - "multi_band_melgan.v1.debug.yaml"
+ - "parallel_wavegan.v1.debug.npy.yaml"
+ - "parallel_wavegan.v1.debug.diff_fs.yaml"
+ - "hifigan.v1.debug.yaml"
+ - "style_melgan.v1.debug.yaml"
+ steps:
+ - uses: actions/checkout@master
+ - uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ architecture: 'x64'
+ - uses: actions/cache@v2
+ with:
+ path: ~/.cache/pip
+ key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-${{ hashFiles('**/setup.py') }}
+ restore-keys: |
+ ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-
+ - name: Install dependencies
+ run: |
+ sudo apt-get install libsndfile-dev jq
+ # make python env
+ cd tools; make CUDA_VERSION="" PYTHON=python${{ matrix.python-version }} PYTORCH_VERSION=${{ matrix.pytorch-version }}
+ - name: Integration
+ run: |
+ cd egs/yesno/voc1 && ./run.sh --conf conf/${{ matrix.config }}
+ - uses: actions/upload-artifact@v1
+ if: failure()
+ with:
+ name: artifacts-${{ matrix.config }}
+ path: egs/yesno/voc1
ParallelWaveGAN/.gitignore ADDED
@@ -0,0 +1,36 @@
+ # general
+ *~
+ *.pyc
+ \#*\#
+ .\#*
+ *DS_Store
+ out.txt
+ parallel_wavegan.egg-info/
+ doc/_build
+ slurm-*.out
+ tmp*
+ .eggs/
+ .hypothesis/
+ .idea
+ .backup/
+ .pytest_cache/
+ __pycache__/
+ .coverage*
+ coverage.xml*
+ .vscode*
+ .nfs*
+ .ipynb_checkpoints
+ .d000*
+ *.out
+ *.err
+
+ # recipe related
+ egs/*/*/data
+ egs/*/*/downloads
+ egs/*/*/dump
+ egs/*/*/exp
+ egs/*/*/conf/tuning
+
+ # tools related
+ tools/venv/
+ tools/apex/
ParallelWaveGAN/LICENSE ADDED
@@ -0,0 +1,21 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2020 Tomoki Hayashi <[email protected]>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
ParallelWaveGAN/egs/README.md ADDED
@@ -0,0 +1,165 @@
+ # Kaldi-style all-in-one recipes
+
+ This repository provides [Kaldi](https://github.com/kaldi-asr/kaldi)-style recipes, the same as [ESPnet](https://github.com/espnet/espnet).
+ Currently, the following recipes are supported.
+
+ - [LJSpeech](https://keithito.com/LJ-Speech-Dataset/): English female speaker
+ - [JSUT](https://sites.google.com/site/shinnosuketakamichi/publication/jsut): Japanese female speaker
+ - [JSSS](https://sites.google.com/site/shinnosuketakamichi/research-topics/jsss_corpus): Japanese female speaker
+ - [CSMSC](https://www.data-baker.com/open_source.html): Mandarin female speaker
+ - [CMU Arctic](http://www.festvox.org/cmu_arctic/): English speakers
+ - [JNAS](http://research.nii.ac.jp/src/en/JNAS.html): Japanese multi-speaker
+ - [VCTK](https://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html): English multi-speaker
+ - [LibriTTS](https://arxiv.org/abs/1904.02882): English multi-speaker
+ - [YesNo](https://arxiv.org/abs/1904.02882): English speaker (For debugging)
+
+
+ ## How to run the recipe
+
+ ```bash
+ # Let us move on to the recipe directory
+ $ cd egs/ljspeech/voc1
+
+ # Run the recipe from scratch
+ $ ./run.sh
+
+ # You can change config via command line
+ $ ./run.sh --conf <your_customized_yaml_config>
+
+ # You can select the stage to start and stop
+ $ ./run.sh --stage 2 --stop_stage 2
+
+ # If you want to specify the gpu
+ $ CUDA_VISIBLE_DEVICES=1 ./run.sh --stage 2
+
+ # If you want to resume training from 10000 steps checkpoint
+ $ ./run.sh --stage 2 --resume <path>/<to>/checkpoint-10000steps.pkl
+ ```
+
+ You can check the command line options in `run.sh`.
+
+ The integration with job schedulers such as [slurm](https://slurm.schedmd.com/documentation.html) can be done via `cmd.sh` and `conf/slurm.conf`.
+ If you want to use it, please check [this page](https://kaldi-asr.org/doc/queue.html).
+
+ All of the hyperparameters are written in a single yaml format configuration file.
+ Please check [this example](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/ljspeech/voc1/conf/parallel_wavegan.v1.yaml) in the ljspeech recipe.
+
+ You can monitor the training progress via tensorboard.
+
+ ```bash
+ $ tensorboard --logdir exp
+ ```
+
+ ![](https://user-images.githubusercontent.com/22779813/68100080-58bbc500-ff09-11e9-9945-c835186fd7c2.png)
+
+ If you want to accelerate the training, you can try distributed multi-gpu training based on apex.
+ You need to install apex for distributed training. Please make sure you already installed it.
+ Then you can run distributed multi-gpu training via the following command:
+
+ ```bash
+ # in the case of the number of gpus = 8
+ $ CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" ./run.sh --stage 2 --n_gpus 8
+ ```
+
+ In the case of distributed training, the batch size will be automatically multiplied by the number of gpus.
+ Please be careful.
+
+ ## How to make the recipe for your own dataset
+
+ Here, I will show how to make the recipe for your own dataset.
+
+ 1. Setup your dataset to be the following structure.
+
+ ```bash
+ # For single-speaker case
+ $ tree /path/to/database
+ /path/to/database
+ ├── utt_1.wav
+ ├── utt_2.wav
+ │ ...
+ └── utt_N.wav
+ # The directory can be nested, but each filename must be unique
+
+ # For multi-speaker case
+ $ tree /path/to/database
+ /path/to/database
+ ├── spk_1
+ │ ├── utt1.wav
+ ├── spk_2
+ │ ├── utt1.wav
+ │ ...
+ └── spk_N
+ ├── utt1.wav
+ ...
+ # The directory under each speaker can be nested, but each filename in each speaker directory must be unique
+ ```
+
+ 2. Copy the template directory.
+
+ ```bash
+ cd egs
+
+ # For single speaker case
+ cp -r template_single_spk <your_dataset_name>
+
+ # For multi speaker case
+ cp -r template_multi_spk <your_dataset_name>
+
+ # Move on to your recipe
+ cd egs/<your_dataset_name>/voc1
+ ```
+
+ 3. Modify the options in `run.sh`.
+ What you need to change at least in `run.sh` is as follows:
+ - `db_root`: Root path of the database.
+ - `num_dev`: The number of utterances for development set.
+ - `num_eval`: The number of utterances for evaluation set.
+
+ 4. Modify the hyperparameters in `conf/parallel_wavegan.v1.yaml`.
+ What you need to change at least in config is as follows:
+ - `sampling_rate`: If you specify a lower sampling rate, the audio will be downsampled by sox.
+
+ 5. (Optional) Change command backend in `cmd.sh`.
+ If you are not familiar with kaldi and run in your local env, you do not need to change it.
+ See more info on https://kaldi-asr.org/doc/queue.html.
+
+ 6. Run your recipe.
+
+ ```bash
+ # Run all stages from the first stage
+ ./run.sh
+
+ # If you want to specify CUDA device
+ CUDA_VISIBLE_DEVICES=0 ./run.sh
+ ```
+
+ If you want to try the other advanced models, please check the config files in `egs/ljspeech/voc1/conf`.
+
+ ## Run training using ESPnet2-TTS recipe within 5 minutes
+
+ Make sure you have already finished the espnet2-tts recipe experiments (at least starting the training).
+
+ ```bash
+ cd egs
+
+ # Please use single spk template for both single and multi spk case
+ cp -r template_single_spk <recipe_name>
+
+ # Move on to your recipe
+ cd egs/<recipe_name>/voc1
+
+ # Make symlink of data directory (Better to use absolute path)
+ mkdir dump data
+ ln -s /path/to/espnet/egs2/<recipe_name>/tts1/dump/raw dump/
+ ln -s /path/to/espnet/egs2/<recipe_name>/tts1/dump/raw/tr_no_dev data/train_nodev
+ ln -s /path/to/espnet/egs2/<recipe_name>/tts1/dump/raw/dev data/dev
+ ln -s /path/to/espnet/egs2/<recipe_name>/tts1/dump/raw/eval1 data/eval
+
+ # Edit config to match TTS model setting
+ vim conf/parallel_wavegan.v1.yaml
+
+ # Run from stage 1
+ ./run.sh --stage 1 --conf conf/parallel_wavegan.v1.yaml
+ ```
+
+ That's it!
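
The README's dataset-layout step requires every wav basename to be unique. A minimal, hedged sketch (not part of this commit; `/path/to/database` is a placeholder) for spotting duplicated basenames before running the recipe:

```bash
# Print duplicated wav basenames; an empty result means the layout is safe to use.
find /path/to/database -name "*.wav" -printf "%f\n" | sort | uniq -d
```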
ParallelWaveGAN/egs/arctic/voc1/cmd.sh ADDED
@@ -0,0 +1,91 @@
+ # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+ # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+ # e.g.
+ # run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+ #
+ # Options:
+ # --time <time>: Limit the maximum time to execute.
+ # --mem <mem>: Limit the maximum memory usage.
+ # --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
+ # --num-threads <ngpu>: Specify the number of CPU cores.
+ # --gpu <ngpu>: Specify the number of GPU devices.
+ # --config: Change the configuration file from default.
+ #
+ # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
+ # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
+ # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for the 8th job respectively.
+ # Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
+ #
+ # run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface, not depending on the backend.
+ # These options are mapped to specific options for each backend and
+ # it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
+ # If jobs failed, your configuration might be wrong for your environment.
+ #
+ #
+ # The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
+ # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
+ # =========================================================~
+
+
+ # Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
+ cmd_backend="local"
+
+ # Local machine, without any Job scheduling system
+ if [ "${cmd_backend}" = local ]; then
+
+ # The other usage
+ export train_cmd="utils/run.pl"
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+ export cuda_cmd="utils/run.pl"
+ # Used for "*_recog.py"
+ export decode_cmd="utils/run.pl"
+
+ # Local machine, without any Job scheduling system
+ elif [ "${cmd_backend}" = stdout ]; then
+
+ # The other usage
+ export train_cmd="utils/stdout.pl"
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+ export cuda_cmd="utils/stdout.pl"
+ # Used for "*_recog.py"
+ export decode_cmd="utils/stdout.pl"
+
+ # "qsub" (SGE, Torque, PBS, etc.)
+ elif [ "${cmd_backend}" = sge ]; then
+ # The default setting is written in conf/queue.conf.
+ # You must change "-q g.q" for the "queue" for your environment.
+ # To know the "queue" names, type "qhost -q"
+ # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
+
+ export train_cmd="utils/queue.pl"
+ export cuda_cmd="utils/queue.pl"
+ export decode_cmd="utils/queue.pl"
+
+ # "sbatch" (Slurm)
+ elif [ "${cmd_backend}" = slurm ]; then
+ # The default setting is written in conf/slurm.conf.
+ # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
+ # To know the "partition" names, type "sinfo".
+ # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
+ # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
+
+ export train_cmd="utils/slurm.pl"
+ export cuda_cmd="utils/slurm.pl"
+ export decode_cmd="utils/slurm.pl"
+
+ elif [ "${cmd_backend}" = ssh ]; then
+ # You have to create ".queue/machines" to specify the host to execute jobs.
+ # e.g. .queue/machines
+ # host1
+ # host2
+ # host3
+ # Assuming you can login to them without any password, i.e. you have to set ssh keys.
+
+ export train_cmd="utils/ssh.pl"
+ export cuda_cmd="utils/ssh.pl"
+ export decode_cmd="utils/ssh.pl"
+
+ else
+ echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
+ return 1
+ fi
ParallelWaveGAN/egs/arctic/voc1/conf/parallel_wavegan.v1.yaml ADDED
@@ -0,0 +1,122 @@
+ # This is the hyperparameter configuration file for Parallel WaveGAN.
+ # Please make sure this is adjusted for the Arctic dataset. If you want to
+ # apply to the other dataset, you might need to carefully change some parameters.
+ # This configuration requires 12 GB GPU memory and takes ~3 days on TITAN V.
+
+ ###########################################################
+ # FEATURE EXTRACTION SETTING #
+ ###########################################################
+ sampling_rate: 16000 # Sampling rate.
+ fft_size: 1024 # FFT size.
+ hop_size: 256 # Hop size.
+ win_length: null # Window length.
+ # If set to null, it will be the same as fft_size.
+ window: "hann" # Window function.
+ num_mels: 80 # Number of mel basis.
+ fmin: 80 # Minimum freq in mel basis calculation.
+ fmax: 7600 # Maximum frequency in mel basis calculation.
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
+ trim_silence: false # Whether to trim the start and end of silence.
+ trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
+ trim_frame_size: 2048 # Frame size in trimming.
+ trim_hop_size: 512 # Hop size in trimming.
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
+
+ ###########################################################
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
+ ###########################################################
+ generator_params:
+ in_channels: 1 # Number of input channels.
+ out_channels: 1 # Number of output channels.
+ kernel_size: 3 # Kernel size of dilated convolution.
+ layers: 30 # Number of residual block layers.
+ stacks: 3 # Number of stacks i.e., dilation cycles.
+ residual_channels: 64 # Number of channels in residual conv.
+ gate_channels: 128 # Number of channels in gated conv.
+ skip_channels: 64 # Number of channels in skip conv.
+ aux_channels: 80 # Number of channels for auxiliary feature conv.
+ # Must be the same as num_mels.
+ aux_context_window: 2 # Context window size for auxiliary feature.
+ # If set to 2, previous 2 and future 2 frames will be considered.
+ dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
+ use_weight_norm: true # Whether to use weight norm.
+ # If set to true, it will be applied to all of the conv layers.
+ upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
+ upsample_params: # Upsampling network parameters.
+ upsample_scales: [4, 4, 4, 4] # Upsampling scales. Product of these must be the same as hop size.
+
+ ###########################################################
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
+ ###########################################################
+ discriminator_params:
+ in_channels: 1 # Number of input channels.
+ out_channels: 1 # Number of output channels.
+ kernel_size: 3 # Kernel size of conv layers.
+ layers: 10 # Number of conv layers.
+ conv_channels: 64 # Number of channels in conv layers.
+ bias: true # Whether to use bias parameter in conv.
+ use_weight_norm: true # Whether to use weight norm.
+ # If set to true, it will be applied to all of the conv layers.
+ nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
+ nonlinear_activation_params: # Nonlinear function parameters
+ negative_slope: 0.2 # Alpha in LeakyReLU.
+
+ ###########################################################
+ # STFT LOSS SETTING #
+ ###########################################################
+ stft_loss_params:
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
+ window: "hann_window" # Window function for STFT-based loss
+
+ ###########################################################
+ # ADVERSARIAL LOSS SETTING #
+ ###########################################################
+ lambda_adv: 4.0 # Loss balancing coefficient.
+
+ ###########################################################
+ # DATA LOADER SETTING #
+ ###########################################################
+ batch_size: 10 # Batch size.
+ batch_max_steps: 15360 # Length of each audio in batch. Make sure dividable by hop_size.
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
+ remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
+
+ ###########################################################
+ # OPTIMIZER & SCHEDULER SETTING #
+ ###########################################################
+ generator_optimizer_params:
+ lr: 0.0001 # Generator's learning rate.
+ eps: 1.0e-6 # Generator's epsilon.
+ weight_decay: 0.0 # Generator's weight decay coefficient.
+ generator_scheduler_params:
+ step_size: 200000 # Generator's scheduler step size.
+ gamma: 0.5 # Generator's scheduler gamma.
+ # At each step size, lr will be multiplied by this parameter.
+ generator_grad_norm: 10 # Generator's gradient norm.
+ discriminator_optimizer_params:
+ lr: 0.00005 # Discriminator's learning rate.
+ eps: 1.0e-6 # Discriminator's epsilon.
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
+ discriminator_scheduler_params:
+ step_size: 200000 # Discriminator's scheduler step size.
+ gamma: 0.5 # Discriminator's scheduler gamma.
+ # At each step size, lr will be multiplied by this parameter.
+ discriminator_grad_norm: 1 # Discriminator's gradient norm.
+
+ ###########################################################
+ # INTERVAL SETTING #
+ ###########################################################
+ discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
+ train_max_steps: 400000 # Number of training steps.
+ save_interval_steps: 5000 # Interval steps to save checkpoint.
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
+ log_interval_steps: 100 # Interval steps to record the training log.
+
+ ###########################################################
+ # OTHER SETTING #
+ ###########################################################
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
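
The config comments impose two consistency constraints: the product of `upsample_scales` must equal `hop_size` (here 4*4*4*4 = 256), and `aux_channels` must equal `num_mels`. A hedged sketch (not part of this commit) of checking them with `yq`, which `path.sh` already requires:

```bash
# Verify the two constraints noted in the config comments for a given recipe config.
conf=conf/parallel_wavegan.v1.yaml
[ "$(yq '.generator_params.upsample_params.upsample_scales | .[0] * .[1] * .[2] * .[3]' "${conf}")" -eq "$(yq '.hop_size' "${conf}")" ] \
    || echo "upsample_scales product != hop_size" >&2
[ "$(yq '.generator_params.aux_channels' "${conf}")" -eq "$(yq '.num_mels' "${conf}")" ] \
    || echo "aux_channels != num_mels" >&2
```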
ParallelWaveGAN/egs/arctic/voc1/conf/slurm.conf ADDED
@@ -0,0 +1,12 @@
+ # Default configuration
+ command sbatch --export=PATH --ntasks-per-node=1
+ option time=* --time $0
+ option mem=* --mem-per-cpu $0
+ option mem=0 # Do not add anything to qsub_opts
+ option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
+ option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts
+ default gpu=0
+ option gpu=0 -p cpu
+ option gpu=* -p gpu --gres=gpu:$0
+ # note: the --max-jobs-run option is supported as a special case
+ # by slurm.pl and you don't have to handle it in the config file.
ParallelWaveGAN/egs/arctic/voc1/local/data_download.sh ADDED
@@ -0,0 +1,40 @@
+ #!/bin/bash
+
+ # Copyright 2019 Tomoki Hayashi
+ # MIT License (https://opensource.org/licenses/MIT)
+
+ download_dir=$1
+ spk=$2
+
+ available_spks=(
+ "slt" "clb" "bdl" "rms" "jmk" "awb" "ksp"
+ )
+
+ # check arguments
+ if [ $# != 2 ]; then
+ echo "Usage: $0 <download_dir> <spk>"
+ echo "Available speakers: ${available_spks[*]}"
+ exit 1
+ fi
+
+ set -euo pipefail
+
+ # check speakers
+ if ! echo "${available_spks[*]}" | grep -q "${spk}"; then
+ echo "Specified spk (${spk}) is not available or not supported." >&2
+ exit 1
+ fi
+
+ # download dataset
+ cwd=$(pwd)
+ if [ ! -e "${download_dir}/cmu_us_${spk}_arctic" ]; then
+ mkdir -p "${download_dir}"
+ cd "${download_dir}"
+ wget "http://festvox.org/cmu_arctic/cmu_arctic/packed/cmu_us_${spk}_arctic-0.95-release.tar.bz2"
+ tar xf "cmu_us_${spk}_arctic-0.95-release.tar.bz2"
+ rm "cmu_us_${spk}_arctic-0.95-release.tar.bz2"
+ cd "${cwd}"
+ echo "Successfully finished download."
+ else
+ echo "Already exists. Skip download."
+ fi
ParallelWaveGAN/egs/arctic/voc1/local/data_prep.sh ADDED
@@ -0,0 +1,113 @@
+ #!/bin/bash
+
+ # Copyright 2019 Tomoki Hayashi
+ # MIT License (https://opensource.org/licenses/MIT)
+
+ # shellcheck disable=SC1091
+ . ./path.sh || exit 1;
+
+ num_dev=100
+ num_eval=100
+ train_set="train_nodev"
+ dev_set="dev"
+ eval_set="eval"
+ shuffle=false
+
+ # shellcheck disable=SC1091
+ . utils/parse_options.sh || exit 1;
+
+ db_root=$1
+ spk=$2
+ data_dir=$3
+
+ # check arguments
+ if [ $# != 3 ]; then
+ echo "Usage: $0 <db_root> <spk> <data_dir>"
+ echo "e.g.: $0 downloads/cmu_us_slt_arctic slt data"
+ echo ""
+ echo "Options:"
+ echo " --num_dev: number of development utterances (default=250)."
+ echo " --num_eval: number of evaluation utterances (default=250)."
+ echo " --train_set: name of train set (default=train_nodev)."
+ echo " --dev_set: name of dev set (default=dev)."
+ echo " --eval_set: name of eval set (default=eval)."
+ echo " --shuffle: whether to perform shuffle in making dev / eval set (default=false)."
+ exit 1
+ fi
+
+ set -euo pipefail
+
+ # check speaker
+ available_spks=(
+ "slt" "clb" "bdl" "rms" "jmk" "awb" "ksp"
+ )
+ if ! echo "${available_spks[*]}" | grep -q "${spk}"; then
+ echo "Specified speaker ${spk} is not available."
+ echo "Available speakers: ${available_spks[*]}"
+ exit 1
+ fi
+
+ [ ! -e "${data_dir}/all" ] && mkdir -p "${data_dir}/all"
+
+ # set filenames
+ scp="${data_dir}/all/wav.scp"
+ segments="${data_dir}/all/segments"
+
+ # check file existence
+ [ -e "${scp}" ] && rm "${scp}"
+ [ -e "${segments}" ] && rm "${segments}"
+
+ # make scp
+ find "${db_root}" -name "*.wav" -follow | sort | while read -r filename; do
+ id="${spk}_$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")"
+ echo "${id} ${filename}" >> "${scp}"
+ done
+
+ # make segments
+ find "${db_root}/lab" -name "*.lab" -follow | sort | while read -r filename; do
+ # get start time
+ while read -r line; do
+ phn=$(echo "${line}" | cut -d " " -f 3)
+ if [ "${phn}" != "pau" ]; then
+ break
+ fi
+ start=$(echo "${line}" | cut -d " " -f 1)
+ done < <(tail -n +2 "$filename")
+ # get end time
+ while read -r line; do
+ end=$(echo "${line}" | cut -d " " -f 1)
+ phn=$(echo "${line}" | cut -d " " -f 3)
+ if [ "${phn}" != "pau" ]; then
+ break
+ fi
+ done < <(tail -n +2 "$filename" | tac)
+ echo "${spk}_$(basename "${filename}" .lab) ${spk}_$(basename "${filename}" .lab) ${start} ${end}" >> "${segments}"
+ done
+
+ # check
+ diff -q <(awk '{print $1}' "${scp}") <(awk '{print $1}' "${segments}") > /dev/null
+
+ # split
+ num_all=$(wc -l < "${scp}")
+ num_deveval=$((num_dev + num_eval))
+ num_train=$((num_all - num_deveval))
+ utils/split_data.sh \
+ --num_first "${num_train}" \
+ --num_second "${num_deveval}" \
+ --shuffle "${shuffle}" \
+ "${data_dir}/all" \
+ "${data_dir}/${train_set}" \
+ "${data_dir}/deveval"
+ utils/split_data.sh \
+ --num_first "${num_dev}" \
+ --num_second "${num_eval}" \
+ --shuffle "${shuffle}" \
+ "${data_dir}/deveval" \
+ "${data_dir}/${dev_set}" \
+ "${data_dir}/${eval_set}"
+
+ # remove tmp directories
+ rm -rf "${data_dir}/all"
+ rm -rf "${data_dir}/deveval"
+
+ echo "Successfully prepared data."
ParallelWaveGAN/egs/arctic/voc1/path.sh ADDED
@@ -0,0 +1,33 @@
+ # cuda related
+ export CUDA_HOME=/usr/local/cuda-10.0
+ export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
+
+ # path related
+ export PRJ_ROOT="${PWD}/../../.."
+ if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
+ # shellcheck disable=SC1090
+ . "${PRJ_ROOT}/tools/venv/bin/activate"
+ fi
+
+ # python related
+ export OMP_NUM_THREADS=1
+ export PYTHONIOENCODING=UTF-8
+ export MPL_BACKEND=Agg
+
+ # check installation
+ if ! command -v parallel-wavegan-train > /dev/null; then
+ echo "Error: It seems setup is not finished." >&2
+ echo "Error: Please setup your environment by following README.md" >&2
+ return 1
+ fi
+ if ! command -v jq > /dev/null; then
+ echo "Error: It seems jq is not installed." >&2
+ echo "Error: Please install via \`sudo apt-get install jq\`." >&2
+ echo "Error: If you do not have sudo, please download from https://stedolan.github.io/jq/download/." >&2
+ return 1
+ fi
+ if ! command -v yq > /dev/null; then
+ echo "Error: It seems yq is not installed." >&2
+ echo "Error: Please install via \`pip install yq\`." >&2
+ return 1
+ fi
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ . ./cmd.sh || exit 1;
7
+ . ./path.sh || exit 1;
8
+
9
+ # basic settings
10
+ stage=-1 # stage to start
11
+ stop_stage=100 # stage to stop
12
+ verbose=1 # verbosity level (lower is less info)
13
+ n_gpus=1 # number of gpus in training
14
+ n_jobs=16 # number of parallel jobs in feature extraction
15
+
16
+ # NOTE(kan-bayashi): renamed to conf to avoid conflict in parse_options.sh
17
+ conf=conf/parallel_wavegan.v1.yaml
18
+
19
+ # directory path setting
20
+ download_dir=downloads # direcotry to save downloaded files
21
+ dumpdir=dump # directory to dump features
22
+
23
+ # target speaker setting
24
+ spk=slt # you can select from slt, clb, bdl, rms, awb, jmk, ksp
25
+
26
+ # training related setting
27
+ tag="" # tag for directory to save model
28
+ resume="" # checkpoint path to resume training
29
+ # (e.g. <path>/<to>/checkpoint-10000steps.pkl)
30
+
31
+ # decoding related setting
32
+ checkpoint="" # checkpoint path to be used for decoding
33
+ # if not provided, the latest one will be used
34
+ # (e.g. <path>/<to>/checkpoint-400000steps.pkl)
35
+
36
+ # shellcheck disable=SC1091
37
+ . utils/parse_options.sh || exit 1;
38
+
39
+ train_set="train_nodev_${spk}" # name of training data directory
40
+ dev_set="dev_${spk}" # name of development data direcotry
41
+ eval_set="eval_${spk}" # name of evaluation data direcotry
42
+
43
+ set -euo pipefail
44
+
45
+ if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
46
+ echo "Stage -1: Data download"
47
+ local/data_download.sh "${download_dir}" "${spk}"
48
+ fi
49
+
50
+ if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
51
+ echo "Stage 0: Data preparation"
52
+ local/data_prep.sh \
53
+ --train_set "${train_set}" \
54
+ --dev_set "${dev_set}" \
55
+ --eval_set "${eval_set}" \
56
+ "${download_dir}/cmu_us_${spk}_arctic" "${spk}" data
57
+ fi
58
+
59
+ stats_ext=$(grep -q "hdf5" <(yq ".format" "${conf}") && echo "h5" || echo "npy")
60
+ if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
61
+ echo "Stage 1: Feature extraction"
62
+ # extract raw features
63
+ pids=()
64
+ for name in "${train_set}" "${dev_set}" "${eval_set}"; do
65
+ (
66
+ [ ! -e "${dumpdir}/${name}/raw" ] && mkdir -p "${dumpdir}/${name}/raw"
67
+ echo "Feature extraction start. See the progress via ${dumpdir}/${name}/raw/preprocessing.*.log."
68
+ utils/make_subset_data.sh "data/${name}" "${n_jobs}" "${dumpdir}/${name}/raw"
69
+ ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/raw/preprocessing.JOB.log" \
70
+ parallel-wavegan-preprocess \
71
+ --config "${conf}" \
72
+ --scp "${dumpdir}/${name}/raw/wav.JOB.scp" \
73
+ --segments "${dumpdir}/${name}/raw/segments.JOB" \
74
+ --dumpdir "${dumpdir}/${name}/raw/dump.JOB" \
75
+ --verbose "${verbose}"
76
+ echo "Successfully finished feature extraction of ${name} set."
77
+ ) &
78
+ pids+=($!)
79
+ done
80
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
81
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
82
+ echo "Successfully finished feature extraction."
83
+
84
+ # calculate statistics for normalization
85
+ echo "Statistics computation start. See the progress via ${dumpdir}/${train_set}/compute_statistics.log."
86
+ ${train_cmd} "${dumpdir}/${train_set}/compute_statistics.log" \
87
+ parallel-wavegan-compute-statistics \
88
+ --config "${conf}" \
89
+ --rootdir "${dumpdir}/${train_set}/raw" \
90
+ --dumpdir "${dumpdir}/${train_set}" \
91
+ --verbose "${verbose}"
92
+ echo "Successfully finished calculation of statistics."
93
+
94
+ # normalize and dump them
95
+ pids=()
96
+ for name in "${train_set}" "${dev_set}" "${eval_set}"; do
97
+ (
98
+ [ ! -e "${dumpdir}/${name}/norm" ] && mkdir -p "${dumpdir}/${name}/norm"
99
+ echo "Nomalization start. See the progress via ${dumpdir}/${name}/norm/normalize.*.log."
100
+ ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/norm/normalize.JOB.log" \
101
+ parallel-wavegan-normalize \
102
+ --config "${conf}" \
103
+ --stats "${dumpdir}/${train_set}/stats.${stats_ext}" \
104
+ --rootdir "${dumpdir}/${name}/raw/dump.JOB" \
105
+ --dumpdir "${dumpdir}/${name}/norm/dump.JOB" \
106
+ --verbose "${verbose}"
107
+ echo "Successfully finished normalization of ${name} set."
108
+ ) &
109
+ pids+=($!)
110
+ done
111
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
112
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
113
+ echo "Successfully finished normalization."
114
+ fi
115
+
116
+ if [ -z "${tag}" ]; then
117
+ expdir="exp/${train_set}_arctic_$(basename "${conf}" .yaml)"
118
+ else
119
+ expdir="exp/${train_set}_arctic_${tag}"
120
+ fi
121
+ if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
122
+ echo "Stage 2: Network training"
123
+ [ ! -e "${expdir}" ] && mkdir -p "${expdir}"
124
+ cp "${dumpdir}/${train_set}/stats.${stats_ext}" "${expdir}"
125
+ if [ "${n_gpus}" -gt 1 ]; then
126
+ train="python -m parallel_wavegan.distributed.launch --nproc_per_node ${n_gpus} -c parallel-wavegan-train"
127
+ else
128
+ train="parallel-wavegan-train"
129
+ fi
130
+ echo "Training start. See the progress via ${expdir}/train.log."
131
+ ${cuda_cmd} --gpu "${n_gpus}" "${expdir}/train.log" \
132
+ ${train} \
133
+ --config "${conf}" \
134
+ --train-dumpdir "${dumpdir}/${train_set}/norm" \
135
+ --dev-dumpdir "${dumpdir}/${dev_set}/norm" \
136
+ --outdir "${expdir}" \
137
+ --resume "${resume}" \
138
+ --verbose "${verbose}"
139
+ echo "Successfully finished training."
140
+ fi
141
+
142
+ if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
143
+ echo "Stage 3: Network decoding"
144
+ # shellcheck disable=SC2012
145
+ [ -z "${checkpoint}" ] && checkpoint="$(ls -dt "${expdir}"/*.pkl | head -1 || true)"
146
+ outdir="${expdir}/wav/$(basename "${checkpoint}" .pkl)"
147
+ pids=()
148
+ for name in "${dev_set}" "${eval_set}"; do
149
+ (
150
+ [ ! -e "${outdir}/${name}" ] && mkdir -p "${outdir}/${name}"
151
+ [ "${n_gpus}" -gt 1 ] && n_gpus=1
152
+ echo "Decoding start. See the progress via ${outdir}/${name}/decode.log."
153
+ ${cuda_cmd} --gpu "${n_gpus}" "${outdir}/${name}/decode.log" \
154
+ parallel-wavegan-decode \
155
+ --dumpdir "${dumpdir}/${name}/norm" \
156
+ --checkpoint "${checkpoint}" \
157
+ --outdir "${outdir}/${name}" \
158
+ --verbose "${verbose}"
159
+ echo "Successfully finished decoding of ${name} set."
160
+ ) &
161
+ pids+=($!)
162
+ done
163
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
164
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
165
+ echo "Successfully finished decoding."
166
+ fi
167
+ echo "Finished."
ParallelWaveGAN/egs/arctic/voc1/utils ADDED
@@ -0,0 +1 @@
+ ../../../utils
ParallelWaveGAN/egs/csmsc/voc1/cmd.sh ADDED
@@ -0,0 +1,91 @@
+ # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+ # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+ # e.g.
+ # run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+ #
+ # Options:
+ # --time <time>: Limit the maximum time to execute.
+ # --mem <mem>: Limit the maximum memory usage.
+ # --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
+ # --num-threads <ngpu>: Specify the number of CPU cores.
+ # --gpu <ngpu>: Specify the number of GPU devices.
+ # --config: Change the configuration file from default.
+ #
+ # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
+ # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
+ # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for the 8th job respectively.
+ # Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
+ #
+ # run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface, not depending on the backend.
+ # These options are mapped to specific options for each backend and
+ # it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
+ # If jobs failed, your configuration might be wrong for your environment.
+ #
+ #
+ # The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
+ # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
+ # =========================================================~
+
+
+ # Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
+ cmd_backend="local"
+
+ # Local machine, without any Job scheduling system
+ if [ "${cmd_backend}" = local ]; then
+
+ # The other usage
+ export train_cmd="utils/run.pl"
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+ export cuda_cmd="utils/run.pl"
+ # Used for "*_recog.py"
+ export decode_cmd="utils/run.pl"
+
+ # Local machine, without any Job scheduling system
+ elif [ "${cmd_backend}" = stdout ]; then
+
+ # The other usage
+ export train_cmd="utils/stdout.pl"
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+ export cuda_cmd="utils/stdout.pl"
+ # Used for "*_recog.py"
+ export decode_cmd="utils/stdout.pl"
+
+ # "qsub" (SGE, Torque, PBS, etc.)
+ elif [ "${cmd_backend}" = sge ]; then
+ # The default setting is written in conf/queue.conf.
+ # You must change "-q g.q" for the "queue" for your environment.
+ # To know the "queue" names, type "qhost -q"
+ # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
+
+ export train_cmd="utils/queue.pl"
+ export cuda_cmd="utils/queue.pl"
+ export decode_cmd="utils/queue.pl"
+
+ # "sbatch" (Slurm)
+ elif [ "${cmd_backend}" = slurm ]; then
+ # The default setting is written in conf/slurm.conf.
+ # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
+ # To know the "partition" names, type "sinfo".
+ # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
+ # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
+
+ export train_cmd="utils/slurm.pl"
+ export cuda_cmd="utils/slurm.pl"
+ export decode_cmd="utils/slurm.pl"
+
+ elif [ "${cmd_backend}" = ssh ]; then
+ # You have to create ".queue/machines" to specify the host to execute jobs.
+ # e.g. .queue/machines
+ # host1
+ # host2
+ # host3
+ # Assuming you can login to them without any password, i.e. you have to set ssh keys.
+
+ export train_cmd="utils/ssh.pl"
+ export cuda_cmd="utils/ssh.pl"
+ export decode_cmd="utils/ssh.pl"
+
+ else
+ echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
+ return 1
+ fi
ParallelWaveGAN/egs/csmsc/voc1/conf/hifigan.v1.yaml ADDED
@@ -0,0 +1,180 @@
+ # This is the configuration file for CSMSC dataset.
+ # This configuration is based on HiFiGAN V1, which is
+ # an official configuration. But I found that the optimizer
+ # setting does not work well with my implementation.
+ # So I changed optimizer settings as follows:
+ # - AdamW -> Adam
+ # - betas: [0.8, 0.99] -> betas: [0.5, 0.9]
+ # - Scheduler: ExponentialLR -> MultiStepLR
+ # To match the shift size difference, the upsample scales
+ # are also modified from the original 256 shift setting.
+
+ ###########################################################
+ # FEATURE EXTRACTION SETTING #
+ ###########################################################
+ sampling_rate: 24000 # Sampling rate.
+ fft_size: 2048 # FFT size.
+ hop_size: 300 # Hop size.
+ win_length: 1200 # Window length.
+ # If set to null, it will be the same as fft_size.
+ window: "hann" # Window function.
+ num_mels: 80 # Number of mel basis.
+ fmin: 80 # Minimum freq in mel basis calculation.
+ fmax: 7600 # Maximum frequency in mel basis calculation.
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
+ trim_silence: false # Whether to trim the start and end of silence.
+ trim_threshold_in_db: 20 # Need to tune carefully if the recording is not good.
+ trim_frame_size: 1024 # Frame size in trimming.
+ trim_hop_size: 256 # Hop size in trimming.
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
+
+ ###########################################################
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
+ ###########################################################
+ generator_type: HiFiGANGenerator
+ generator_params:
+ in_channels: 80 # Number of input channels.
+ out_channels: 1 # Number of output channels.
+ channels: 512 # Number of initial channels.
+ kernel_size: 7 # Kernel size of initial and final conv layers.
+ upsample_scales: [5, 5, 4, 3] # Upsampling scales.
+ upsample_kernel_sizes: [10, 10, 8, 6] # Kernel size for upsampling layers.
+ resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks.
+ resblock_dilations: # Dilations for residual blocks.
+ - [1, 3, 5]
+ - [1, 3, 5]
+ - [1, 3, 5]
+ use_additional_convs: true # Whether to use additional conv layer in residual blocks.
+ bias: true # Whether to use bias parameter in conv.
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation type.
+ nonlinear_activation_params: # Nonlinear activation parameters.
+ negative_slope: 0.1
+ use_weight_norm: true # Whether to apply weight normalization.
+
+ ###########################################################
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
+ ###########################################################
+ discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
+ discriminator_params:
+ scales: 3 # Number of multi-scale discriminator.
+ scale_downsample_pooling: "AvgPool1d" # Pooling operation for scale discriminator.
+ scale_downsample_pooling_params:
+ kernel_size: 4 # Pooling kernel size.
+ stride: 2 # Pooling stride.
+ padding: 2 # Padding size.
+ scale_discriminator_params:
+ in_channels: 1 # Number of input channels.
+ out_channels: 1 # Number of output channels.
+ kernel_sizes: [15, 41, 5, 3] # List of kernel sizes.
+ channels: 128 # Initial number of channels.
+ max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
+ max_groups: 16 # Maximum number of groups in downsampling conv layers.
+ bias: true
+ downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation.
+ nonlinear_activation_params:
+ negative_slope: 0.1
+ follow_official_norm: true # Whether to follow the official norm setting.
+ periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator.
+ period_discriminator_params:
+ in_channels: 1 # Number of input channels.
+ out_channels: 1 # Number of output channels.
+ kernel_sizes: [5, 3] # List of kernel sizes.
+ channels: 32 # Initial number of channels.
+ downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
+ max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
+ bias: true # Whether to use bias parameter in conv layer.
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation.
+ nonlinear_activation_params: # Nonlinear activation parameters.
+ negative_slope: 0.1
+ use_weight_norm: true # Whether to apply weight normalization.
+ use_spectral_norm: false # Whether to apply spectral normalization.
+
+ ###########################################################
+ # STFT LOSS SETTING #
+ ###########################################################
+ use_stft_loss: false # Whether to use multi-resolution STFT loss.
+ use_mel_loss: true # Whether to use Mel-spectrogram loss.
+ mel_loss_params:
+ fs: 24000
+ fft_size: 2048
+ hop_size: 300
+ win_length: 1200
+ window: "hann"
+ num_mels: 80
+ fmin: 0
+ fmax: 12000
+ log_base: null
+ generator_adv_loss_params:
+ average_by_discriminators: false # Whether to average loss by #discriminators.
+ discriminator_adv_loss_params:
+ average_by_discriminators: false # Whether to average loss by #discriminators.
+ use_feat_match_loss: true
+ feat_match_loss_params:
+ average_by_discriminators: false # Whether to average loss by #discriminators.
+ average_by_layers: false # Whether to average loss by #layers in each discriminator.
+ include_final_outputs: false # Whether to include final outputs in feat match loss calculation.
+
+ ###########################################################
+ # ADVERSARIAL LOSS SETTING #
+ ###########################################################
+ lambda_aux: 45.0 # Loss balancing coefficient for STFT loss.
+ lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss.
+ lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.
+
+ ###########################################################
+ # DATA LOADER SETTING #
+ ###########################################################
+ batch_size: 16 # Batch size.
+ batch_max_steps: 8400 # Length of each audio in batch. Make sure dividable by hop_size.
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
+ remove_short_samples: false # Whether to remove samples the length of which are less than batch_max_steps.
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
+
+ ###########################################################
+ # OPTIMIZER & SCHEDULER SETTING #
+ ###########################################################
+ generator_optimizer_type: Adam
+ generator_optimizer_params:
+ lr: 2.0e-4
+ betas: [0.5, 0.9]
+ weight_decay: 0.0
+ generator_scheduler_type: MultiStepLR
+ generator_scheduler_params:
+ gamma: 0.5
+ milestones:
+ - 200000
+ - 400000
+ - 600000
+ - 800000
+ generator_grad_norm: -1
+ discriminator_optimizer_type: Adam
+ discriminator_optimizer_params:
+ lr: 2.0e-4
+ betas: [0.5, 0.9]
+ weight_decay: 0.0
+ discriminator_scheduler_type: MultiStepLR
+ discriminator_scheduler_params:
+ gamma: 0.5
+ milestones:
+ - 200000
+ - 400000
+ - 600000
+ - 800000
+ discriminator_grad_norm: -1
+
+ ###########################################################
+ # INTERVAL SETTING #
+ ###########################################################
+ generator_train_start_steps: 1 # Number of steps to start to train generator.
+ discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
+ train_max_steps: 2500000 # Number of training steps.
+ save_interval_steps: 10000 # Interval steps to save checkpoint.
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
+ log_interval_steps: 100 # Interval steps to record the training log.
+
+ ###########################################################
+ # OTHER SETTING #
+ ###########################################################
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
ParallelWaveGAN/egs/csmsc/voc1/conf/multi_band_melgan.v2.yaml ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is the hyperparameter configuration file for MelGAN.
2
+ # Please make sure this is adjusted for the CSMSC dataset. If you want to
3
+ # apply to the other dataset, you might need to carefully change some parameters.
4
+ # This configuration requires ~ 8GB memory and will finish within 4 days on Titan V.
5
+
6
+ # This configuration is based on full-band MelGAN but the hop size and sampling
7
+ # rate is different from the paper (16kHz vs 24kHz). The number of iteraions
8
+ # is now shown in the paper so currently we train 1M iterations (not sure enough
9
+ # to converge). The optimizer setting is based on @dathudeptrai advice.
10
+ # https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906
11
+
12
+ ###########################################################
13
+ # FEATURE EXTRACTION SETTING #
14
+ ###########################################################
15
+ sampling_rate: 24000 # Sampling rate.
16
+ fft_size: 2048 # FFT size.
17
+ hop_size: 300 # Hop size.
18
+ win_length: 1200 # Window length.
19
+ # If set to null, it will be the same as fft_size.
20
+ window: "hann" # Window function.
21
+ num_mels: 80 # Number of mel basis.
22
+ fmin: 80 # Minimum freq in mel basis calculation.
23
+ fmax: 7600 # Maximum frequency in mel basis calculation.
24
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
25
+ trim_silence: false # Whether to trim the start and end of silence.
26
+ trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
27
+ trim_frame_size: 2048 # Frame size in trimming.
28
+ trim_hop_size: 512 # Hop size in trimming.
29
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
30
+
31
+ ###########################################################
32
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
33
+ ###########################################################
34
+ generator_type: "MelGANGenerator" # Generator type.
35
+ generator_params:
36
+ in_channels: 80 # Number of input channels.
37
+ out_channels: 4 # Number of output channels.
38
+ kernel_size: 7 # Kernel size of initial and final conv layers.
39
+ channels: 384 # Initial number of channels for conv layers.
40
+ upsample_scales: [5, 5, 3] # List of Upsampling scales.
41
+ stack_kernel_size: 3 # Kernel size of dilated conv layers in residual stack.
42
+ stacks: 4 # Number of stacks in a single residual stack module.
43
+ use_weight_norm: True # Whether to use weight normalization.
44
+ use_causal_conv: False # Whether to use causal convolution.
45
+
46
+ ###########################################################
47
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
48
+ ###########################################################
49
+ discriminator_type: "MelGANMultiScaleDiscriminator" # Discriminator type.
50
+ discriminator_params:
51
+ in_channels: 1 # Number of input channels.
52
+ out_channels: 1 # Number of output channels.
53
+ scales: 3 # Number of multi-scales.
54
+ downsample_pooling: "AvgPool1d" # Pooling type for the input downsampling.
55
+ downsample_pooling_params: # Parameters of the above pooling function.
56
+ kernel_size: 4
57
+ stride: 2
58
+ padding: 1
59
+ count_include_pad: False
60
+ kernel_sizes: [5, 3] # List of kernel size.
61
+ channels: 16 # Number of channels of the initial conv layer.
62
+ max_downsample_channels: 512 # Maximum number of channels of downsampling layers.
63
+ downsample_scales: [4, 4, 4] # List of downsampling scales.
64
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
65
+ nonlinear_activation_params: # Parameters of nonlinear activation function.
66
+ negative_slope: 0.2
67
+ use_weight_norm: True # Whether to use weight norm.
68
+
69
+ ###########################################################
70
+ # STFT LOSS SETTING #
71
+ ###########################################################
72
+ stft_loss_params:
73
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
74
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
75
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
76
+ window: "hann_window" # Window function for STFT-based loss
77
+ use_subband_stft_loss: true
78
+ subband_stft_loss_params:
79
+ fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
80
+ hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss
81
+ win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
82
+ window: "hann_window" # Window function for STFT-based loss
83
+
84
+ ###########################################################
85
+ # ADVERSARIAL LOSS SETTING #
86
+ ###########################################################
87
+ use_feat_match_loss: false # Whether to use feature matching loss.
88
+ lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss.
89
+
90
+ ###########################################################
91
+ # DATA LOADER SETTING #
92
+ ###########################################################
93
+ batch_size: 64 # Batch size.
94
+ batch_max_steps: 16200 # Length of each audio in batch. Make sure it is divisible by hop_size.
95
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
96
+ num_workers: 4 # Number of workers in Pytorch DataLoader.
97
+ remove_short_samples: true # Whether to remove samples whose length is less than batch_max_steps.
98
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
99
+
100
+ ###########################################################
101
+ # OPTIMIZER & SCHEDULER SETTING #
102
+ ###########################################################
103
+ generator_optimizer_type: "Adam" # Generator's optimizer type.
104
+ generator_optimizer_params:
105
+ lr: 1.0e-3 # Generator's learning rate.
106
+ eps: 1.0e-7 # Generator's epsilon.
107
+ weight_decay: 0.0 # Generator's weight decay coefficient.
108
+ amsgrad: true
109
+ generator_grad_norm: -1 # Generator's gradient norm.
110
+ generator_scheduler_type: "MultiStepLR" # Generator's scheduler type.
111
+ generator_scheduler_params:
112
+ gamma: 0.5 # Generator's scheduler gamma.
113
+ milestones: # At each milestone, lr will be multiplied by gamma.
114
+ - 100000
115
+ - 200000
116
+ - 300000
117
+ - 400000
118
+ - 500000
119
+ - 600000
120
+ discriminator_optimizer_type: "Adam" # Discriminator's optimizer type.
121
+ discriminator_optimizer_params:
122
+ lr: 1.0e-3 # Discriminator's learning rate.
123
+ eps: 1.0e-7 # Discriminator's epsilon.
124
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
125
+ amsgrad: true
126
+ discriminator_grad_norm: -1 # Discriminator's gradient norm.
127
+ discriminator_scheduler_type: "MultiStepLR" # Discriminator's scheduler type.
128
+ discriminator_scheduler_params:
129
+ gamma: 0.5 # Discriminator's scheduler gamma.
130
+ milestones: # At each milestone, lr will be multiplied by gamma.
131
+ - 100000
132
+ - 200000
133
+ - 300000
134
+ - 400000
135
+ - 500000
136
+ - 600000
137
+
138
+ ###########################################################
139
+ # INTERVAL SETTING #
140
+ ###########################################################
141
+ discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator.
142
+ train_max_steps: 1000000 # Number of training steps.
143
+ save_interval_steps: 50000 # Interval steps to save checkpoint.
144
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
145
+ log_interval_steps: 1000 # Interval steps to record the training log.
146
+
147
+ ###########################################################
148
+ # OTHER SETTING #
149
+ ###########################################################
150
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
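A quick way to sanity-check the multi_band_melgan.v2.yaml settings above: for a multi-band model, the generator upsamples each of the out_channels sub-bands by the product of upsample_scales, and the PQMF synthesis step multiplies that by the number of sub-bands, so the total must equal hop_size. A minimal bash sketch with the values copied from this config (the check itself is illustrative, not part of the recipe scripts):

# Values copied from conf/multi_band_melgan.v2.yaml above.
hop_size=300
out_channels=4               # number of PQMF sub-bands
product=1
for scale in 5 5 3; do       # generator upsample_scales
    product=$((product * scale))
done
if [ $((product * out_channels)) -eq "${hop_size}" ]; then
    echo "OK: ${product} x ${out_channels} sub-bands = ${hop_size} (hop_size)"
else
    echo "Mismatch: adjust upsample_scales, out_channels, or hop_size" >&2
fi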
ParallelWaveGAN/egs/csmsc/voc1/conf/parallel_wavegan.v1.yaml ADDED
@@ -0,0 +1,122 @@
1
+ # This is the hyperparameter configuration file for Parallel WaveGAN.
2
+ # Please make sure this is adjusted for the CSMSC dataset. If you want to
3
+ # apply it to another dataset, you might need to carefully change some parameters.
4
+ # This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN.
5
+
6
+ ###########################################################
7
+ # FEATURE EXTRACTION SETTING #
8
+ ###########################################################
9
+ sampling_rate: 24000 # Sampling rate.
10
+ fft_size: 2048 # FFT size.
11
+ hop_size: 300 # Hop size.
12
+ win_length: 1200 # Window length.
13
+ # If set to null, it will be the same as fft_size.
14
+ window: "hann" # Window function.
15
+ num_mels: 80 # Number of mel basis.
16
+ fmin: 80 # Minimum freq in mel basis calculation.
17
+ fmax: 7600 # Maximum frequency in mel basis calculation.
18
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
19
+ trim_silence: false # Whether to trim the start and end of silence.
20
+ trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
21
+ trim_frame_size: 2048 # Frame size in trimming.
22
+ trim_hop_size: 512 # Hop size in trimming.
23
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
24
+
25
+ ###########################################################
26
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
27
+ ###########################################################
28
+ generator_params:
29
+ in_channels: 1 # Number of input channels.
30
+ out_channels: 1 # Number of output channels.
31
+ kernel_size: 3 # Kernel size of dilated convolution.
32
+ layers: 30 # Number of residual block layers.
33
+ stacks: 3 # Number of stacks i.e., dilation cycles.
34
+ residual_channels: 64 # Number of channels in residual conv.
35
+ gate_channels: 128 # Number of channels in gated conv.
36
+ skip_channels: 64 # Number of channels in skip conv.
37
+ aux_channels: 80 # Number of channels for auxiliary feature conv.
38
+ # Must be the same as num_mels.
39
+ aux_context_window: 2 # Context window size for auxiliary feature.
40
+ # If set to 2, previous 2 and future 2 frames will be considered.
41
+ dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
42
+ use_weight_norm: true # Whether to use weight norm.
43
+ # If set to true, it will be applied to all of the conv layers.
44
+ upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
45
+ upsample_params: # Upsampling network parameters.
46
+ upsample_scales: [4, 5, 3, 5] # Upsampling scales. Product of these must be the same as hop size.
47
+
48
+ ###########################################################
49
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
50
+ ###########################################################
51
+ discriminator_params:
52
+ in_channels: 1 # Number of input channels.
53
+ out_channels: 1 # Number of output channels.
54
+ kernel_size: 3 # Kernel size of conv layers.
55
+ layers: 10 # Number of conv layers.
56
+ conv_channels: 64 # Number of channels in conv layers.
57
+ bias: true # Whether to use bias parameter in conv.
58
+ use_weight_norm: true # Whether to use weight norm.
59
+ # If set to true, it will be applied to all of the conv layers.
60
+ nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
61
+ nonlinear_activation_params: # Nonlinear function parameters
62
+ negative_slope: 0.2 # Alpha in LeakyReLU.
63
+
64
+ ###########################################################
65
+ # STFT LOSS SETTING #
66
+ ###########################################################
67
+ stft_loss_params:
68
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
69
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
70
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
71
+ window: "hann_window" # Window function for STFT-based loss
72
+
73
+ ###########################################################
74
+ # ADVERSARIAL LOSS SETTING #
75
+ ###########################################################
76
+ lambda_adv: 4.0 # Loss balancing coefficient.
77
+
78
+ ###########################################################
79
+ # DATA LOADER SETTING #
80
+ ###########################################################
81
+ batch_size: 6 # Batch size.
82
+ batch_max_steps: 25500 # Length of each audio in batch. Make sure it is divisible by hop_size.
83
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
84
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
85
+ remove_short_samples: true # Whether to remove samples whose length is less than batch_max_steps.
86
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
87
+
88
+ ###########################################################
89
+ # OPTIMIZER & SCHEDULER SETTING #
90
+ ###########################################################
91
+ generator_optimizer_params:
92
+ lr: 0.0001 # Generator's learning rate.
93
+ eps: 1.0e-6 # Generator's epsilon.
94
+ weight_decay: 0.0 # Generator's weight decay coefficient.
95
+ generator_scheduler_params:
96
+ step_size: 200000 # Generator's scheduler step size.
97
+ gamma: 0.5 # Generator's scheduler gamma.
98
+ # At each step size, lr will be multiplied by this parameter.
99
+ generator_grad_norm: 10 # Generator's gradient norm.
100
+ discriminator_optimizer_params:
101
+ lr: 0.00005 # Discriminator's learning rate.
102
+ eps: 1.0e-6 # Discriminator's epsilon.
103
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
104
+ discriminator_scheduler_params:
105
+ step_size: 200000 # Discriminator's scheduler step size.
106
+ gamma: 0.5 # Discriminator's scheduler gamma.
107
+ # At each step size, lr will be multiplied by this parameter.
108
+ discriminator_grad_norm: 1 # Discriminator's gradient norm.
109
+
110
+ ###########################################################
111
+ # INTERVAL SETTING #
112
+ ###########################################################
113
+ discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
114
+ train_max_steps: 400000 # Number of training steps.
115
+ save_interval_steps: 5000 # Interval steps to save checkpoint.
116
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
117
+ log_interval_steps: 100 # Interval steps to record the training log.
118
+
119
+ ###########################################################
120
+ # OTHER SETTING #
121
+ ###########################################################
122
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
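The comments in parallel_wavegan.v1.yaml above impose two constraints worth checking whenever the feature settings are changed: the product of the generator upsample_scales must equal hop_size, and batch_max_steps must be divisible by hop_size. A small illustrative bash check with the values copied from this config:

# Values copied from conf/parallel_wavegan.v1.yaml above.
hop_size=300
batch_max_steps=25500
product=1
for scale in 4 5 3 5; do     # generator upsample_scales
    product=$((product * scale))
done
[ "${product}" -eq "${hop_size}" ] \
    && echo "upsample_scales OK (product = ${product})" \
    || echo "upsample_scales product ${product} != hop_size ${hop_size}" >&2
[ $((batch_max_steps % hop_size)) -eq 0 ] \
    && echo "batch_max_steps OK ($((batch_max_steps / hop_size)) frames per example)" \
    || echo "batch_max_steps is not divisible by hop_size" >&2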
ParallelWaveGAN/egs/csmsc/voc1/conf/slurm.conf ADDED
@@ -0,0 +1,12 @@
1
+ # Default configuration
2
+ command sbatch --export=PATH --ntasks-per-node=1
3
+ option time=* --time $0
4
+ option mem=* --mem-per-cpu $0
5
+ option mem=0 # Do not add anything to qsub_opts
6
+ option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
7
+ option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts
8
+ default gpu=0
9
+ option gpu=0 -p cpu
10
+ option gpu=* -p gpu --gres=gpu:$0
11
+ # note: the --max-jobs-run option is supported as a special case
12
+ # by slurm.pl and you don't have to handle it in the config file.
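Each "option" line above maps a flag of the unified <cmd>.pl interface onto sbatch arguments, with $0 replaced by the flag's value. For example, with this conf a call like the following (a sketch; the log path and training command are placeholders) would be submitted by utils/slurm.pl with "-p gpu --gres=gpu:1" appended, while "--gpu 0" would select "-p cpu" instead:

utils/slurm.pl --gpu 1 exp/demo/train.log parallel-wavegan-train --config conf/parallel_wavegan.v1.yaml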
ParallelWaveGAN/egs/csmsc/voc1/conf/style_melgan.v1.yaml ADDED
@@ -0,0 +1,147 @@
1
+ # This is the configuration file for the CSMSC dataset.
2
+ # This configuration is based on the StyleMelGAN paper but
3
+ # uses MSE loss instead of hinge loss. I found that
4
+ # batch_size = 8 also works well, so if you
5
+ # want to accelerate the training, you can reduce the
6
+ # batch size (e.g., 8 or 16). The upsampling scales are modified
7
+ # to fit the frame shift of 300 points.
8
+
9
+ ###########################################################
10
+ # FEATURE EXTRACTION SETTING #
11
+ ###########################################################
12
+ sampling_rate: 24000 # Sampling rate.
13
+ fft_size: 2048 # FFT size.
14
+ hop_size: 300 # Hop size.
15
+ win_length: 1200 # Window length.
16
+ # If set to null, it will be the same as fft_size.
17
+ window: "hann" # Window function.
18
+ num_mels: 80 # Number of mel basis.
19
+ fmin: 80 # Minimum freq in mel basis calculation.
20
+ fmax: 7600 # Maximum frequency in mel basis calculation.
21
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
22
+ trim_silence: false # Whether to trim the start and end of silence.
23
+ trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
24
+ trim_frame_size: 1024 # Frame size in trimming.
25
+ trim_hop_size: 256 # Hop size in trimming.
26
+ format: "hdf5" # Feature file format. " npy " or " hdf5 " is supported.
27
+
28
+ ###########################################################
29
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
30
+ ###########################################################
31
+ generator_type: "StyleMelGANGenerator" # Generator type.
32
+ generator_params:
33
+ in_channels: 128
34
+ aux_channels: 80
35
+ channels: 64
36
+ out_channels: 1
37
+ kernel_size: 9
38
+ dilation: 2
39
+ bias: True
40
+ noise_upsample_scales: [10, 2, 2, 2]
41
+ noise_upsample_activation: "LeakyReLU"
42
+ noise_upsample_activation_params:
43
+ negative_slope: 0.2
44
+ upsample_scales: [5, 1, 5, 1, 3, 1, 2, 2, 1]
45
+ upsample_mode: "nearest"
46
+ gated_function: "softmax"
47
+ use_weight_norm: True
48
+
49
+ ###########################################################
50
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
51
+ ###########################################################
52
+ discriminator_type: "StyleMelGANDiscriminator" # Discriminator type.
53
+ discriminator_params:
54
+ repeats: 4
55
+ window_sizes: [512, 1024, 2048, 4096]
56
+ pqmf_params:
57
+ - [1, None, None, None]
58
+ - [2, 62, 0.26700, 9.0]
59
+ - [4, 62, 0.14200, 9.0]
60
+ - [8, 62, 0.07949, 9.0]
61
+ discriminator_params:
62
+ out_channels: 1
63
+ kernel_sizes: [5, 3]
64
+ channels: 16
65
+ max_downsample_channels: 512
66
+ bias: True
67
+ downsample_scales: [4, 4, 4, 1]
68
+ nonlinear_activation: "LeakyReLU"
69
+ nonlinear_activation_params:
70
+ negative_slope: 0.2
71
+ use_weight_norm: True
72
+
73
+ ###########################################################
74
+ # STFT LOSS SETTING #
75
+ ###########################################################
76
+ stft_loss_params:
77
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
78
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
79
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
80
+ window: "hann_window" # Window function for STFT-based loss
81
+ lambda_aux: 1.0 # Loss balancing coefficient for aux loss.
82
+
83
+ ###########################################################
84
+ # ADVERSARIAL LOSS SETTING #
85
+ ###########################################################
86
+ lambda_adv: 1.0 # Loss balancing coefficient for adv loss.
87
+ generator_adv_loss_params:
88
+ average_by_discriminators: false # Whether to average loss by #discriminators.
89
+ discriminator_adv_loss_params:
90
+ average_by_discriminators: false # Whether to average loss by #discriminators.
91
+
92
+ ###########################################################
93
+ # DATA LOADER SETTING #
94
+ ###########################################################
95
+ batch_size: 32 # Batch size.
96
+ batch_max_steps: 24000 # Length of each audio in batch. Make sure it is divisible by hop_size.
97
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
98
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
99
+ remove_short_samples: false # Whether to remove samples whose length is less than batch_max_steps.
100
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
101
+
102
+ ###########################################################
103
+ # OPTIMIZER & SCHEDULER SETTING #
104
+ ###########################################################
105
+ generator_optimizer_type: Adam
106
+ generator_optimizer_params:
107
+ lr: 1.0e-4
108
+ betas: [0.5, 0.9]
109
+ weight_decay: 0.0
110
+ generator_scheduler_type: MultiStepLR
111
+ generator_scheduler_params:
112
+ gamma: 0.5
113
+ milestones:
114
+ - 100000
115
+ - 300000
116
+ - 500000
117
+ - 700000
118
+ - 900000
119
+ generator_grad_norm: -1
120
+ discriminator_optimizer_type: Adam
121
+ discriminator_optimizer_params:
122
+ lr: 2.0e-4
123
+ betas: [0.5, 0.9]
124
+ weight_decay: 0.0
125
+ discriminator_scheduler_type: MultiStepLR
126
+ discriminator_scheduler_params:
127
+ gamma: 0.5
128
+ milestones:
129
+ - 200000
130
+ - 400000
131
+ - 600000
132
+ - 800000
133
+ discriminator_grad_norm: -1
134
+
135
+ ###########################################################
136
+ # INTERVAL SETTING #
137
+ ###########################################################
138
+ discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
139
+ train_max_steps: 1500000 # Number of training steps.
140
+ save_interval_steps: 50000 # Interval steps to save checkpoint.
141
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
142
+ log_interval_steps: 100 # Interval steps to record the training log.
143
+
144
+ ###########################################################
145
+ # OTHER SETTING #
146
+ ###########################################################
147
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
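The header of style_melgan.v1.yaml above notes that the upsampling scales were modified to fit the 300-point frame shift; concretely, the product of the generator upsample_scales has to equal hop_size. A small illustrative check with the values copied from this config:

# Values copied from conf/style_melgan.v1.yaml above.
hop_size=300
product=1
for scale in 5 1 5 1 3 1 2 2 1; do   # generator upsample_scales
    product=$((product * scale))
done
[ "${product}" -eq "${hop_size}" ] \
    && echo "upsample_scales OK (product = ${product} = hop_size)" \
    || echo "upsample_scales product ${product} != hop_size ${hop_size}" >&2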
ParallelWaveGAN/egs/csmsc/voc1/local/data_download.sh ADDED
@@ -0,0 +1,32 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ download_dir=$1
7
+
8
+ # check arguments
9
+ if [ $# != 1 ]; then
10
+ echo "Usage: $0 <download_dir>"
11
+ exit 1
12
+ fi
13
+
14
+ set -euo pipefail
15
+
16
+ # download dataset
17
+ cwd=$(pwd)
18
+ if [ ! -e "${download_dir}/CSMSC" ]; then
19
+ mkdir -p "${download_dir}"
20
+ cd "${download_dir}"
21
+ wget https://weixinxcxdb.oss-cn-beijing.aliyuncs.com/gwYinPinKu/BZNSYP.rar
22
+ mkdir CSMSC && cd CSMSC && unrar x ../BZNSYP.rar
23
+ # convert new line code
24
+ find ./PhoneLabeling -name "*.interval" | while read -r line; do
25
+ nkf -Lu --overwrite "${line}"
26
+ done
27
+ rm ../BZNSYP.rar
28
+ cd "${cwd}"
29
+ echo "Successfully finished download."
30
+ else
31
+ echo "Already exists. Skip download."
32
+ fi
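A typical invocation from egs/csmsc/voc1, matching the default download_dir used by run.sh later in this recipe (note the script needs wget, unrar, and nkf on the PATH):

local/data_download.sh downloads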
ParallelWaveGAN/egs/csmsc/voc1/local/data_prep.sh ADDED
@@ -0,0 +1,94 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ # shellcheck disable=SC1091
7
+ . ./path.sh || exit 1;
8
+
9
+ fs=24000
10
+ num_dev=100
11
+ num_eval=100
12
+ train_set="train_nodev"
13
+ dev_set="dev"
14
+ eval_set="eval"
15
+ shuffle=false
16
+
17
+ # shellcheck disable=SC1091
18
+ . utils/parse_options.sh || exit 1;
19
+
20
+ db_root=$1
21
+ data_dir=$2
22
+
23
+ # check arguments
24
+ if [ $# != 2 ]; then
25
+ echo "Usage: $0 [Options] <db_root> <data_dir>"
26
+ echo "e.g.: $0 downloads/CSMSC data"
27
+ echo ""
28
+ echo "Options:"
29
+ echo " --fs: target sampling rate (default=24000)."
30
+ echo " --num_dev: number of development uttreances (default=100)."
31
+ echo " --num_eval: number of evaluation uttreances (default=100)."
32
+ echo " --train_set: name of train set (default=train_nodev)."
33
+ echo " --dev_set: name of dev set (default=dev)."
34
+ echo " --eval_set: name of eval set (default=eval)."
35
+ echo " --shuffle: whether to perform shuffle in making dev / eval set (default=false)."
36
+ exit 1
37
+ fi
38
+
39
+ set -euo pipefail
40
+
41
+ [ ! -e "${data_dir}/all" ] && mkdir -p "${data_dir}/all"
42
+
43
+ # set filenames
44
+ scp="${data_dir}/all/wav.scp"
45
+ segments="${data_dir}/all/segments"
46
+
47
+ # check file existence
48
+ [ -e "${scp}" ] && rm "${scp}"
49
+ [ -e "${segments}" ] && rm "${segments}"
50
+
51
+ # make wav.scp
52
+ find "${db_root}/Wave" -name "*.wav" -follow | sort | while read -r filename; do
53
+ id="$(basename "${filename}" .wav)"
54
+ echo "csmsc_${id} cat ${filename} | sox -t wav - -c 1 -b 16 -t wav - rate ${fs} |" >> "${scp}"
55
+ done
56
+
57
+ # make segments
58
+ find "${db_root}/PhoneLabeling" -name "*.interval" -follow | sort | while read -r filename; do
59
+ nkf -Lu --overwrite "${filename}"
60
+ id="$(basename "${filename}" .interval)"
61
+ start_sec=$(tail -n +14 "${filename}" | head -n 1)
62
+ end_sec=$(head -n -2 "${filename}" | tail -n 1)
63
+ [ -z "${start_sec}" ] && echo "Start second is missing (utt_id=${id}). " >&2 && exit 1;
64
+ [ -z "${end_sec}" ] && echo "End second is missing (utt_id=${id})." >&2 && exit 1;
65
+ echo "csmsc_${id} csmsc_${id} ${start_sec} ${end_sec}" >> "${segments}"
66
+ done
67
+
68
+ # check
69
+ diff -q <(awk '{print $1}' "${scp}") <(awk '{print $1}' "${segments}") > /dev/null
70
+
71
+ # split
72
+ num_all=$(wc -l < "${scp}")
73
+ num_deveval=$((num_dev + num_eval))
74
+ num_train=$((num_all - num_deveval))
75
+ utils/split_data.sh \
76
+ --num_first "${num_train}" \
77
+ --num_second "${num_deveval}" \
78
+ --shuffle "${shuffle}" \
79
+ "${data_dir}/all" \
80
+ "${data_dir}/${train_set}" \
81
+ "${data_dir}/deveval"
82
+ utils/split_data.sh \
83
+ --num_first "${num_dev}" \
84
+ --num_second "${num_eval}" \
85
+ --shuffle "${shuffle}" \
86
+ "${data_dir}/deveval" \
87
+ "${data_dir}/${dev_set}" \
88
+ "${data_dir}/${eval_set}"
89
+
90
+ # remove tmp directories
91
+ rm -rf "${data_dir}/all"
92
+ rm -rf "${data_dir}/deveval"
93
+
94
+ echo "Successfully prepared data."
ParallelWaveGAN/egs/csmsc/voc1/path.sh ADDED
@@ -0,0 +1,33 @@
1
+ # cuda related
2
+ export CUDA_HOME=/usr/local/cuda-10.0
3
+ export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
4
+
5
+ # path related
6
+ export PRJ_ROOT="${PWD}/../../.."
7
+ if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
8
+ # shellcheck disable=SC1090
9
+ . "${PRJ_ROOT}/tools/venv/bin/activate"
10
+ fi
11
+
12
+ # python related
13
+ export OMP_NUM_THREADS=1
14
+ export PYTHONIOENCODING=UTF-8
15
+ export MPL_BACKEND=Agg
16
+
17
+ # check installation
18
+ if ! command -v parallel-wavegan-train > /dev/null; then
19
+ echo "Error: It seems setup is not finished." >&2
20
+ echo "Error: Please setup your environment by following README.md" >&2
21
+ return 1
22
+ fi
23
+ if ! command -v jq > /dev/null; then
24
+ echo "Error: It seems jq is not installed." >&2
25
+ echo "Error: Please install via \`sudo apt-get install jq\`." >&2
26
+ echo "Error: If you do not have sudo, please download from https://stedolan.github.io/jq/download/." >&2
27
+ return 1
28
+ fi
29
+ if ! command -v yq > /dev/null; then
30
+ echo "Error: It seems yq is not installed." >&2
31
+ echo "Error: Please install via \`pip install yq\`." >&2
32
+ return 1
33
+ fi
ParallelWaveGAN/egs/csmsc/voc1/run.sh ADDED
@@ -0,0 +1,164 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ . ./cmd.sh || exit 1;
7
+ . ./path.sh || exit 1;
8
+
9
+ # basic settings
10
+ stage=-1 # stage to start
11
+ stop_stage=100 # stage to stop
12
+ verbose=1 # verbosity level (lower is less info)
13
+ n_gpus=1 # number of gpus in training
14
+ n_jobs=16 # number of parallel jobs in feature extraction
15
+
16
+ # NOTE(kan-bayashi): renamed to conf to avoid conflict in parse_options.sh
17
+ conf=conf/parallel_wavegan.v1.yaml
18
+
19
+ # directory path setting
20
+ download_dir=downloads # directory to save downloaded files
21
+ dumpdir=dump # directory to dump features
22
+
23
+ # training related setting
24
+ tag="" # tag for directory to save model
25
+ resume="" # checkpoint path to resume training
26
+ # (e.g. <path>/<to>/checkpoint-10000steps.pkl)
27
+
28
+ # decoding related setting
29
+ checkpoint="" # checkpoint path to be used for decoding
30
+ # if not provided, the latest one will be used
31
+ # (e.g. <path>/<to>/checkpoint-400000steps.pkl)
32
+
33
+ # shellcheck disable=SC1091
34
+ . utils/parse_options.sh || exit 1;
35
+
36
+ train_set="train_nodev" # name of training data directory
37
+ dev_set="dev" # name of development data direcotry
38
+ eval_set="eval" # name of evaluation data direcotry
39
+
40
+ set -euo pipefail
41
+
42
+ if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
43
+ echo "Stage -1: Data download"
44
+ local/data_download.sh "${download_dir}"
45
+ fi
46
+
47
+ if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
48
+ echo "Stage 0: Data preparation"
49
+ local/data_prep.sh \
50
+ --train_set "${train_set}" \
51
+ --dev_set "${dev_set}" \
52
+ --eval_set "${eval_set}" \
53
+ "${download_dir}/CSMSC" data
54
+ fi
55
+
56
+ stats_ext=$(grep -q "hdf5" <(yq ".format" "${conf}") && echo "h5" || echo "npy")
57
+ if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
58
+ echo "Stage 1: Feature extraction"
59
+ # extract raw features
60
+ pids=()
61
+ for name in "${train_set}" "${dev_set}" "${eval_set}"; do
62
+ (
63
+ [ ! -e "${dumpdir}/${name}/raw" ] && mkdir -p "${dumpdir}/${name}/raw"
64
+ echo "Feature extraction start. See the progress via ${dumpdir}/${name}/raw/preprocessing.*.log."
65
+ utils/make_subset_data.sh "data/${name}" "${n_jobs}" "${dumpdir}/${name}/raw"
66
+ ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/raw/preprocessing.JOB.log" \
67
+ parallel-wavegan-preprocess \
68
+ --config "${conf}" \
69
+ --scp "${dumpdir}/${name}/raw/wav.JOB.scp" \
70
+ --segments "${dumpdir}/${name}/raw/segments.JOB" \
71
+ --dumpdir "${dumpdir}/${name}/raw/dump.JOB" \
72
+ --verbose "${verbose}"
73
+ echo "Successfully finished feature extraction of ${name} set."
74
+ ) &
75
+ pids+=($!)
76
+ done
77
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
78
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
79
+ echo "Successfully finished feature extraction."
80
+
81
+ # calculate statistics for normalization
82
+ echo "Statistics computation start. See the progress via ${dumpdir}/${train_set}/compute_statistics.log."
83
+ ${train_cmd} "${dumpdir}/${train_set}/compute_statistics.log" \
84
+ parallel-wavegan-compute-statistics \
85
+ --config "${conf}" \
86
+ --rootdir "${dumpdir}/${train_set}/raw" \
87
+ --dumpdir "${dumpdir}/${train_set}" \
88
+ --verbose "${verbose}"
89
+ echo "Successfully finished calculation of statistics."
90
+
91
+ # normalize and dump them
92
+ pids=()
93
+ for name in "${train_set}" "${dev_set}" "${eval_set}"; do
94
+ (
95
+ [ ! -e "${dumpdir}/${name}/norm" ] && mkdir -p "${dumpdir}/${name}/norm"
96
+ echo "Nomalization start. See the progress via ${dumpdir}/${name}/norm/normalize.*.log."
97
+ ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/norm/normalize.JOB.log" \
98
+ parallel-wavegan-normalize \
99
+ --config "${conf}" \
100
+ --stats "${dumpdir}/${train_set}/stats.${stats_ext}" \
101
+ --rootdir "${dumpdir}/${name}/raw/dump.JOB" \
102
+ --dumpdir "${dumpdir}/${name}/norm/dump.JOB" \
103
+ --verbose "${verbose}"
104
+ echo "Successfully finished normalization of ${name} set."
105
+ ) &
106
+ pids+=($!)
107
+ done
108
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
109
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
110
+ echo "Successfully finished normalization."
111
+ fi
112
+
113
+ if [ -z "${tag}" ]; then
114
+ expdir="exp/${train_set}_csmsc_$(basename "${conf}" .yaml)"
115
+ else
116
+ expdir="exp/${train_set}_csmsc_${tag}"
117
+ fi
118
+ if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
119
+ echo "Stage 2: Network training"
120
+ [ ! -e "${expdir}" ] && mkdir -p "${expdir}"
121
+ cp "${dumpdir}/${train_set}/stats.${stats_ext}" "${expdir}"
122
+ if [ "${n_gpus}" -gt 1 ]; then
123
+ train="python -m parallel_wavegan.distributed.launch --nproc_per_node ${n_gpus} -c parallel-wavegan-train"
124
+ else
125
+ train="parallel-wavegan-train"
126
+ fi
127
+ echo "Training start. See the progress via ${expdir}/train.log."
128
+ ${cuda_cmd} --gpu "${n_gpus}" "${expdir}/train.log" \
129
+ ${train} \
130
+ --config "${conf}" \
131
+ --train-dumpdir "${dumpdir}/${train_set}/norm" \
132
+ --dev-dumpdir "${dumpdir}/${dev_set}/norm" \
133
+ --outdir "${expdir}" \
134
+ --resume "${resume}" \
135
+ --verbose "${verbose}"
136
+ echo "Successfully finished training."
137
+ fi
138
+
139
+ if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
140
+ echo "Stage 3: Network decoding"
141
+ # shellcheck disable=SC2012
142
+ [ -z "${checkpoint}" ] && checkpoint="$(ls -dt "${expdir}"/*.pkl | head -1 || true)"
143
+ outdir="${expdir}/wav/$(basename "${checkpoint}" .pkl)"
144
+ pids=()
145
+ for name in "${dev_set}" "${eval_set}"; do
146
+ (
147
+ [ ! -e "${outdir}/${name}" ] && mkdir -p "${outdir}/${name}"
148
+ [ "${n_gpus}" -gt 1 ] && n_gpus=1
149
+ echo "Decoding start. See the progress via ${outdir}/${name}/decode.log."
150
+ ${cuda_cmd} --gpu "${n_gpus}" "${outdir}/${name}/decode.log" \
151
+ parallel-wavegan-decode \
152
+ --dumpdir "${dumpdir}/${name}/norm" \
153
+ --checkpoint "${checkpoint}" \
154
+ --outdir "${outdir}/${name}" \
155
+ --verbose "${verbose}"
156
+ echo "Successfully finished decoding of ${name} set."
157
+ ) &
158
+ pids+=($!)
159
+ done
160
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
161
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
162
+ echo "Successfully finished decoding."
163
+ fi
164
+ echo "Finished."
ParallelWaveGAN/egs/csmsc/voc1/utils ADDED
@@ -0,0 +1 @@
1
+ ../../../utils
ParallelWaveGAN/egs/jnas/voc1/cmd.sh ADDED
@@ -0,0 +1,91 @@
1
+ # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
2
+ # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
3
+ # e.g.
4
+ # run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
5
+ #
6
+ # Options:
7
+ # --time <time>: Limit the maximum time to execute.
8
+ # --mem <mem>: Limit the maximum memory usage.
9
+ # --max-jobs-run <njob>: Limit the maximum number of parallel jobs. This is ignored for non-array jobs.
10
+ # --num-threads <nthreads>: Specify the number of CPU cores.
11
+ # --gpu <ngpu>: Specify the number of GPU devices.
12
+ # --config: Change the configuration file from default.
13
+ #
14
+ # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
15
+ # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
16
+ # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
17
+ # Note that the index must start from a positive number, so you can't use "JOB=0:10", for example.
18
+ #
19
+ # run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface that does not depend on the backend.
20
+ # These options are mapped to backend-specific options, and the mapping
21
+ # is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
22
+ # If jobs failed, your configuration might be wrong for your environment.
23
+ #
24
+ #
25
+ # The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
26
+ # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
27
+ # =========================================================
28
+
29
+
30
+ # Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
31
+ cmd_backend="local"
32
+
33
+ # Local machine, without any Job scheduling system
34
+ if [ "${cmd_backend}" = local ]; then
35
+
36
+ # The other usage
37
+ export train_cmd="utils/run.pl"
38
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
39
+ export cuda_cmd="utils/run.pl"
40
+ # Used for "*_recog.py"
41
+ export decode_cmd="utils/run.pl"
42
+
43
+ # Local machine, without any Job scheduling system
44
+ elif [ "${cmd_backend}" = stdout ]; then
45
+
46
+ # The other usage
47
+ export train_cmd="utils/stdout.pl"
48
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
49
+ export cuda_cmd="utils/stdout.pl"
50
+ # Used for "*_recog.py"
51
+ export decode_cmd="utils/stdout.pl"
52
+
53
+ # "qsub" (SGE, Torque, PBS, etc.)
54
+ elif [ "${cmd_backend}" = sge ]; then
55
+ # The default setting is written in conf/queue.conf.
56
+ # You must change "-q g.q" for the "queue" for your environment.
57
+ # To know the "queue" names, type "qhost -q"
58
+ # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
59
+
60
+ export train_cmd="utils/queue.pl"
61
+ export cuda_cmd="utils/queue.pl"
62
+ export decode_cmd="utils/queue.pl"
63
+
64
+ # "sbatch" (Slurm)
65
+ elif [ "${cmd_backend}" = slurm ]; then
66
+ # The default setting is written in conf/slurm.conf.
67
+ # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
68
+ # To know the "partion" names, type "sinfo".
69
+ # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*"
70
+ # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
71
+
72
+ export train_cmd="utils/slurm.pl"
73
+ export cuda_cmd="utils/slurm.pl"
74
+ export decode_cmd="utils/slurm.pl"
75
+
76
+ elif [ "${cmd_backend}" = ssh ]; then
77
+ # You have to create ".queue/machines" to specify the host to execute jobs.
78
+ # e.g. .queue/machines
79
+ # host1
80
+ # host2
81
+ # host3
82
+ # This assumes you can log in to them without a password, i.e., you have to set up ssh keys.
83
+
84
+ export train_cmd="utils/ssh.pl"
85
+ export cuda_cmd="utils/ssh.pl"
86
+ export decode_cmd="utils/ssh.pl"
87
+
88
+ else
89
+ echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
90
+ return 1
91
+ fi
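To make the JOB=1:<nj> array-job interface described above concrete, here is roughly how the stage-1 feature extraction in run.sh uses it with the local backend (a condensed sketch; the dump paths follow the defaults of this recipe):

# Runs 4 preprocessing shards in parallel; JOB is expanded to 1..4 in both
# the log file name and the command arguments.
utils/run.pl JOB=1:4 dump/train_nodev/raw/preprocessing.JOB.log \
    parallel-wavegan-preprocess \
        --config conf/parallel_wavegan.v1.yaml \
        --scp dump/train_nodev/raw/wav.JOB.scp \
        --dumpdir dump/train_nodev/raw/dump.JOB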
ParallelWaveGAN/egs/jnas/voc1/conf/parallel_wavegan.v1.long.yaml ADDED
@@ -0,0 +1,123 @@
1
+ # This is the hyperparameter configuration file for Parallel WaveGAN.
2
+ # Please make sure this is adjusted for the JNAS dataset. If you want to
3
+ # apply it to another dataset, you might need to carefully change some parameters.
4
+ # This configuration trains for more steps (up to 1000k) than the v1 config.
5
+ # It requires 12 GB GPU memory and takes ~7 days on TITAN V.
6
+
7
+ ###########################################################
8
+ # FEATURE EXTRACTION SETTING #
9
+ ###########################################################
10
+ sampling_rate: 16000 # Sampling rate.
11
+ fft_size: 1024 # FFT size.
12
+ hop_size: 256 # Hop size.
13
+ win_length: null # Window length.
14
+ # If set to null, it will be the same as fft_size.
15
+ window: "hann" # Window function.
16
+ num_mels: 80 # Number of mel basis.
17
+ fmin: 80 # Minimum freq in mel basis calculation.
18
+ fmax: 7600 # Maximum frequency in mel basis calculation.
19
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
20
+ trim_silence: true # Whether to trim the start and end of silence.
21
+ trim_threshold_in_db: 20 # Need to tune carefully if the recording is not good.
22
+ trim_frame_size: 1024 # Frame size in trimming.
23
+ trim_hop_size: 256 # Hop size in trimming.
24
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
25
+
26
+ ###########################################################
27
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
28
+ ###########################################################
29
+ generator_params:
30
+ in_channels: 1 # Number of input channels.
31
+ out_channels: 1 # Number of output channels.
32
+ kernel_size: 3 # Kernel size of dilated convolution.
33
+ layers: 30 # Number of residual block layers.
34
+ stacks: 3 # Number of stacks i.e., dilation cycles.
35
+ residual_channels: 64 # Number of channels in residual conv.
36
+ gate_channels: 128 # Number of channels in gated conv.
37
+ skip_channels: 64 # Number of channels in skip conv.
38
+ aux_channels: 80 # Number of channels for auxiliary feature conv.
39
+ # Must be the same as num_mels.
40
+ aux_context_window: 2 # Context window size for auxiliary feature.
41
+ # If set to 2, previous 2 and future 2 frames will be considered.
42
+ dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
43
+ use_weight_norm: true # Whether to use weight norm.
44
+ # If set to true, it will be applied to all of the conv layers.
45
+ upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
46
+ upsample_params: # Upsampling network parameters.
47
+ upsample_scales: [4, 4, 4, 4] # Upsampling scales. Product of these must be the same as hop size.
48
+
49
+ ###########################################################
50
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
51
+ ###########################################################
52
+ discriminator_params:
53
+ in_channels: 1 # Number of input channels.
54
+ out_channels: 1 # Number of output channels.
55
+ kernel_size: 3 # Kernel size of conv layers.
56
+ layers: 10 # Number of conv layers.
57
+ conv_channels: 64 # Number of channels in conv layers.
58
+ bias: true # Whether to use bias parameter in conv.
59
+ use_weight_norm: true # Whether to use weight norm.
60
+ # If set to true, it will be applied to all of the conv layers.
61
+ nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
62
+ nonlinear_activation_params: # Nonlinear function parameters
63
+ negative_slope: 0.2 # Alpha in LeakyReLU.
64
+
65
+ ###########################################################
66
+ # STFT LOSS SETTING #
67
+ ###########################################################
68
+ stft_loss_params:
69
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
70
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
71
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
72
+ window: "hann_window" # Window function for STFT-based loss
73
+
74
+ ###########################################################
75
+ # ADVERSARIAL LOSS SETTING #
76
+ ###########################################################
77
+ lambda_adv: 4.0 # Loss balancing coefficient.
78
+
79
+ ###########################################################
80
+ # DATA LOADER SETTING #
81
+ ###########################################################
82
+ batch_size: 10 # Batch size.
83
+ batch_max_steps: 15360 # Length of each audio in batch. Make sure it is divisible by hop_size.
84
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
85
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
86
+ remove_short_samples: true # Whether to remove samples whose length is less than batch_max_steps.
87
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
88
+
89
+ ###########################################################
90
+ # OPTIMIZER & SCHEDULER SETTING #
91
+ ###########################################################
92
+ generator_optimizer_params:
93
+ lr: 0.0001 # Generator's learning rate.
94
+ eps: 1.0e-6 # Generator's epsilon.
95
+ weight_decay: 0.0 # Generator's weight decay coefficient.
96
+ generator_scheduler_params:
97
+ step_size: 200000 # Generator's scheduler step size.
98
+ gamma: 0.5 # Generator's scheduler gamma.
99
+ # At each step size, lr will be multiplied by this parameter.
100
+ generator_grad_norm: 10 # Generator's gradient norm.
101
+ discriminator_optimizer_params:
102
+ lr: 0.00005 # Discriminator's learning rate.
103
+ eps: 1.0e-6 # Discriminator's epsilon.
104
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
105
+ discriminator_scheduler_params:
106
+ step_size: 200000 # Discriminator's scheduler step size.
107
+ gamma: 0.5 # Discriminator's scheduler gamma.
108
+ # At each step size, lr will be multiplied by this parameter.
109
+ discriminator_grad_norm: 1 # Discriminator's gradient norm.
110
+
111
+ ###########################################################
112
+ # INTERVAL SETTING #
113
+ ###########################################################
114
+ discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
115
+ train_max_steps: 1000000 # Number of training steps.
116
+ save_interval_steps: 5000 # Interval steps to save checkpoint.
117
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
118
+ log_interval_steps: 100 # Interval steps to record the training log.
119
+
120
+ ###########################################################
121
+ # OTHER SETTING #
122
+ ###########################################################
123
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
ParallelWaveGAN/egs/jnas/voc1/conf/parallel_wavegan.v1.yaml ADDED
@@ -0,0 +1,122 @@
1
+ # This is the hyperparameter configuration file for Parallel WaveGAN.
2
+ # Please make sure this is adjusted for the JNAS dataset. If you want to
3
+ # apply it to another dataset, you might need to carefully change some parameters.
4
+ # This configuration requires 12 GB GPU memory and takes ~3 days on TITAN V.
5
+
6
+ ###########################################################
7
+ # FEATURE EXTRACTION SETTING #
8
+ ###########################################################
9
+ sampling_rate: 16000 # Sampling rate.
10
+ fft_size: 1024 # FFT size.
11
+ hop_size: 256 # Hop size.
12
+ win_length: null # Window length.
13
+ # If set to null, it will be the same as fft_size.
14
+ window: "hann" # Window function.
15
+ num_mels: 80 # Number of mel basis.
16
+ fmin: 80 # Minimum freq in mel basis calculation.
17
+ fmax: 7600 # Maximum frequency in mel basis calculation.
18
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
19
+ trim_silence: true # Whether to trim the start and end of silence.
20
+ trim_threshold_in_db: 20 # Need to tune carefully if the recording is not good.
21
+ trim_frame_size: 1024 # Frame size in trimming.
22
+ trim_hop_size: 256 # Hop size in trimming.
23
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
24
+
25
+ ###########################################################
26
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
27
+ ###########################################################
28
+ generator_params:
29
+ in_channels: 1 # Number of input channels.
30
+ out_channels: 1 # Number of output channels.
31
+ kernel_size: 3 # Kernel size of dilated convolution.
32
+ layers: 30 # Number of residual block layers.
33
+ stacks: 3 # Number of stacks i.e., dilation cycles.
34
+ residual_channels: 64 # Number of channels in residual conv.
35
+ gate_channels: 128 # Number of channels in gated conv.
36
+ skip_channels: 64 # Number of channels in skip conv.
37
+ aux_channels: 80 # Number of channels for auxiliary feature conv.
38
+ # Must be the same as num_mels.
39
+ aux_context_window: 2 # Context window size for auxiliary feature.
40
+ # If set to 2, previous 2 and future 2 frames will be considered.
41
+ dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
42
+ use_weight_norm: true # Whether to use weight norm.
43
+ # If set to true, it will be applied to all of the conv layers.
44
+ upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
45
+ upsample_params: # Upsampling network parameters.
46
+ upsample_scales: [4, 4, 4, 4] # Upsampling scales. Product of these must be the same as hop size.
47
+
48
+ ###########################################################
49
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
50
+ ###########################################################
51
+ discriminator_params:
52
+ in_channels: 1 # Number of input channels.
53
+ out_channels: 1 # Number of output channels.
54
+ kernel_size: 3 # Kernel size of conv layers.
55
+ layers: 10 # Number of conv layers.
56
+ conv_channels: 64 # Number of channels in conv layers.
57
+ bias: true # Whether to use bias parameter in conv.
58
+ use_weight_norm: true # Whether to use weight norm.
59
+ # If set to true, it will be applied to all of the conv layers.
60
+ nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
61
+ nonlinear_activation_params: # Nonlinear function parameters
62
+ negative_slope: 0.2 # Alpha in LeakyReLU.
63
+
64
+ ###########################################################
65
+ # STFT LOSS SETTING #
66
+ ###########################################################
67
+ stft_loss_params:
68
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
69
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
70
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
71
+ window: "hann_window" # Window function for STFT-based loss
72
+
73
+ ###########################################################
74
+ # ADVERSARIAL LOSS SETTING #
75
+ ###########################################################
76
+ lambda_adv: 4.0 # Loss balancing coefficient.
77
+
78
+ ###########################################################
79
+ # DATA LOADER SETTING #
80
+ ###########################################################
81
+ batch_size: 10 # Batch size.
82
+ batch_max_steps: 15360 # Length of each audio in batch. Make sure it is divisible by hop_size.
83
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
84
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
85
+ remove_short_samples: true # Whether to remove samples whose length is less than batch_max_steps.
86
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
87
+
88
+ ###########################################################
89
+ # OPTIMIZER & SCHEDULER SETTING #
90
+ ###########################################################
91
+ generator_optimizer_params:
92
+ lr: 0.0001 # Generator's learning rate.
93
+ eps: 1.0e-6 # Generator's epsilon.
94
+ weight_decay: 0.0 # Generator's weight decay coefficient.
95
+ generator_scheduler_params:
96
+ step_size: 200000 # Generator's scheduler step size.
97
+ gamma: 0.5 # Generator's scheduler gamma.
98
+ # At each step size, lr will be multiplied by this parameter.
99
+ generator_grad_norm: 10 # Generator's gradient norm.
100
+ discriminator_optimizer_params:
101
+ lr: 0.00005 # Discriminator's learning rate.
102
+ eps: 1.0e-6 # Discriminator's epsilon.
103
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
104
+ discriminator_scheduler_params:
105
+ step_size: 200000 # Discriminator's scheduler step size.
106
+ gamma: 0.5 # Discriminator's scheduler gamma.
107
+ # At each step size, lr will be multiplied by this parameter.
108
+ discriminator_grad_norm: 1 # Discriminator's gradient norm.
109
+
110
+ ###########################################################
111
+ # INTERVAL SETTING #
112
+ ###########################################################
113
+ discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
114
+ train_max_steps: 400000 # Number of training steps.
115
+ save_interval_steps: 5000 # Interval steps to save checkpoint.
116
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
117
+ log_interval_steps: 100 # Interval steps to record the training log.
118
+
119
+ ###########################################################
120
+ # OTHER SETTING #
121
+ ###########################################################
122
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
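Both JNAS Parallel WaveGAN configs above share the 16 kHz feature settings and differ essentially in train_max_steps; the same consistency constraints apply as for the 24 kHz recipes. An illustrative check with the shared values:

# Values shared by conf/parallel_wavegan.v1.yaml and conf/parallel_wavegan.v1.long.yaml above.
hop_size=256
batch_max_steps=15360
product=1
for scale in 4 4 4 4; do product=$((product * scale)); done
echo "upsample_scales product: ${product} (must equal hop_size ${hop_size})"
echo "frames per example: $((batch_max_steps / hop_size)) (batch_max_steps must be divisible by hop_size)"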
ParallelWaveGAN/egs/jnas/voc1/conf/slurm.conf ADDED
@@ -0,0 +1,12 @@
1
+ # Default configuration
2
+ command sbatch --export=PATH --ntasks-per-node=1
3
+ option time=* --time $0
4
+ option mem=* --mem-per-cpu $0
5
+ option mem=0 # Do not add anything to qsub_opts
6
+ option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
7
+ option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts
8
+ default gpu=0
9
+ option gpu=0 -p cpu
10
+ option gpu=* -p gpu --gres=gpu:$0
11
+ # note: the --max-jobs-run option is supported as a special case
12
+ # by slurm.pl and you don't have to handle it in the config file.
ParallelWaveGAN/egs/jnas/voc1/conf/train_speakers.txt ADDED
@@ -0,0 +1,261 @@
1
+ F001
2
+ F002
3
+ F003
4
+ F004
5
+ F007
6
+ F008
7
+ F009
8
+ F010
9
+ F012
10
+ F013
11
+ F015
12
+ F016
13
+ F018
14
+ F019
15
+ F020
16
+ F022
17
+ F023
18
+ F024
19
+ F025
20
+ F026
21
+ F027
22
+ F028
23
+ F029
24
+ F030
25
+ F031
26
+ F032
27
+ F033
28
+ F034
29
+ F035
30
+ F036
31
+ F037
32
+ F038
33
+ F039
34
+ F040
35
+ F041
36
+ F042
37
+ F043
38
+ F044
39
+ F045
40
+ F046
41
+ F047
42
+ F049
43
+ F050
44
+ F051
45
+ F052
46
+ F053
47
+ F054
48
+ F055
49
+ F056
50
+ F057
51
+ F058
52
+ F059
53
+ F061
54
+ F062
55
+ F063
56
+ F065
57
+ F066
58
+ F067
59
+ F069
60
+ F070
61
+ F071
62
+ F073
63
+ F074
64
+ F076
65
+ F077
66
+ F079
67
+ F081
68
+ F083
69
+ F084
70
+ F085
71
+ F087
72
+ F090
73
+ F091
74
+ F092
75
+ F093
76
+ F094
77
+ F095
78
+ F096
79
+ F097
80
+ F098
81
+ F099
82
+ F100
83
+ F101
84
+ F103
85
+ F104
86
+ F105
87
+ F106
88
+ F107
89
+ F108
90
+ F110
91
+ F111
92
+ F112
93
+ F113
94
+ F114
95
+ F115
96
+ F116
97
+ F117
98
+ F118
99
+ F119
100
+ F120
101
+ F121
102
+ F123
103
+ F124
104
+ F125
105
+ F126
106
+ F127
107
+ F128
108
+ F130
109
+ F131
110
+ F132
111
+ F133
112
+ F134
113
+ F135
114
+ F136
115
+ F137
116
+ F138
117
+ F143A
118
+ F143B
119
+ F144A
120
+ F145A
121
+ F145B
122
+ F146A
123
+ F146B
124
+ F149
125
+ F150
126
+ FP01
127
+ FP02
128
+ FP03
129
+ FP04
130
+ FP05
131
+ M001
132
+ M002
133
+ M003
134
+ M004
135
+ M007
136
+ M008
137
+ M009
138
+ M010
139
+ M011
140
+ M012
141
+ M013
142
+ M015
143
+ M016
144
+ M018
145
+ M019
146
+ M020
147
+ M021
148
+ M022
149
+ M023
150
+ M024
151
+ M025
152
+ M027
153
+ M028
154
+ M029
155
+ M030
156
+ M031
157
+ M032
158
+ M033
159
+ M034
160
+ M035
161
+ M036
162
+ M037
163
+ M038
164
+ M039
165
+ M040
166
+ M041
167
+ M042
168
+ M043
169
+ M044
170
+ M045
171
+ M046
172
+ M047
173
+ M049
174
+ M050
175
+ M051
176
+ M052
177
+ M053
178
+ M054
179
+ M055
180
+ M056
181
+ M057
182
+ M058
183
+ M059
184
+ M061
185
+ M062
186
+ M063
187
+ M065
188
+ M066
189
+ M067
190
+ M069
191
+ M070
192
+ M071
193
+ M073
194
+ M074
195
+ M076
196
+ M077
197
+ M079
198
+ M081
199
+ M083
200
+ M084
201
+ M085
202
+ M087
203
+ M090
204
+ M091
205
+ M092
206
+ M093
207
+ M094
208
+ M095
209
+ M096
210
+ M097
211
+ M098
212
+ M099
213
+ M100
214
+ M101
215
+ M103
216
+ M104
217
+ M105
218
+ M106
219
+ M107
220
+ M108
221
+ M110
222
+ M111
223
+ M112
224
+ M113
225
+ M114
226
+ M116
227
+ M117
228
+ M118
229
+ M119
230
+ M120
231
+ M121
232
+ M123
233
+ M124
234
+ M125
235
+ M126
236
+ M127
237
+ M128
238
+ M130
239
+ M131
240
+ M132
241
+ M133
242
+ M134
243
+ M135
244
+ M136
245
+ M137
246
+ M138
247
+ M139A
248
+ M139B
249
+ M140A
250
+ M140B
251
+ M141A
252
+ M141B
253
+ M142A
254
+ M142B
255
+ M147
256
+ M148
257
+ MP01
258
+ MP02
259
+ MP03
260
+ MP04
261
+ MP05
ParallelWaveGAN/egs/jnas/voc1/local/data_prep.sh ADDED
@@ -0,0 +1,89 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ # shellcheck disable=SC1091
7
+ . ./path.sh || exit 1;
8
+
9
+ num_dev=500
10
+ train_set="train_nodev"
11
+ dev_set="dev"
12
+ eval_set="eval"
13
+ shuffle=false
14
+
15
+ # shellcheck disable=SC1091
16
+ . utils/parse_options.sh || exit 1;
17
+
18
+ # check arguments
19
+ if [ $# != 3 ]; then
20
+ echo "Usage: $0 <db_root> <data_dir> <spk_list>"
21
+ echo "e.g.: $0 /database/JNAS data conf/train_speakers.txt"
22
+ echo ""
23
+ echo "Options:"
24
+ echo " --num_dev: number of development uttreances (default=500)."
25
+ echo " --train_set: name of train set (default=train_nodev)."
26
+ echo " --dev_set: name of dev set (default=dev)."
27
+ echo " --eval_set: name of eval set (default=eval)."
28
+ echo " --shuffle: whether to perform shuffle in making dev / eval set (default=false)."
29
+ exit 1
30
+ fi
31
+
32
+ set -euo pipefail
33
+
34
+ db_root=$1 # database root directory
35
+ data_dir=$2
36
+ spk_list=$3
37
+
38
+ eval_db_root=${db_root}/DOCS/Test_set
39
+ wav_type=HS # DT or HS
40
+
41
+ # make directories
42
+ for name in train "${eval_set}"; do
43
+ [ ! -e "${data_dir}/${name}" ] && mkdir -p "${data_dir}/${name}"
44
+ done
45
+
46
+ # make training & development data
47
+ scp="${data_dir}/train/wav.scp"
48
+
49
+ # check file existence
50
+ [ -e "${scp}" ] && rm "${scp}"
51
+
52
+ # shellcheck disable=SC2013
53
+ for spk in $(cat "${spk_list}"); do
54
+ wavdir=${db_root}/WAVES_${wav_type}/${spk}
55
+ [ ! -e "${wavdir}" ] && echo "There are no such a directory (${wavdir})" && exit 1
56
+ find "${wavdir}" -follow -name "*.wav" | sort | while read -r filename; do
57
+ id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
58
+ echo "${spk}_${id} ${filename}" >> "${scp}"
59
+ done
60
+ done
61
+
62
+ # shuffle
63
+ cp "${scp}" "${scp}.tmp"
64
+ sort -R "${scp}.tmp" > "${scp}"
65
+ rm -r "${scp}.tmp"
66
+
67
+ # split
68
+ utils/split_data.sh \
69
+ --num_second ${num_dev} \
70
+ --shuffle "${shuffle}" \
71
+ "${data_dir}/train" \
72
+ "${data_dir}/${train_set}" \
73
+ "${data_dir}/${dev_set}"
74
+
75
+ # make evaluation data
76
+ scp="${data_dir}/${eval_set}/wav.scp"
77
+
78
+ # check file existence
79
+ [ -e "${scp}" ] && rm "${scp}"
80
+
81
+ for name in JNAS_testset_100 JNAS_testset_500; do
82
+ find "${eval_db_root}/${name}/WAVES" -follow -name "*.wav" | sort | while read -r filename; do
83
+ id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
84
+ dirname=$(basename "$(dirname "${filename}")")
85
+ echo "${name}_${dirname}_${id} ${filename}" >> "${scp}"
86
+ done
87
+ done
88
+
89
+ echo "Successfully prepared data."
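A minimal invocation sketch for the script above, following its own usage message (the JNAS path is the recipe default and illustrative):

    local/data_prep.sh --num_dev 500 /database/JNAS data conf/train_speakers.txt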
ParallelWaveGAN/egs/jnas/voc1/path.sh ADDED
@@ -0,0 +1,33 @@
1
+ # cuda related
2
+ export CUDA_HOME=/usr/local/cuda-10.0
3
+ export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
4
+
5
+ # path related
6
+ export PRJ_ROOT="${PWD}/../../.."
7
+ if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
8
+ # shellcheck disable=SC1090
9
+ . "${PRJ_ROOT}/tools/venv/bin/activate"
10
+ fi
11
+
12
+ # python related
13
+ export OMP_NUM_THREADS=1
14
+ export PYTHONIOENCODING=UTF-8
15
+ export MPL_BACKEND=Agg
16
+
17
+ # check installation
18
+ if ! command -v parallel-wavegan-train > /dev/null; then
19
+ echo "Error: It seems setup is not finished." >&2
20
+ echo "Error: Please setup your environment by following README.md" >&2
21
+ return 1
22
+ fi
23
+ if ! command -v jq > /dev/null; then
24
+ echo "Error: It seems jq is not installed." >&2
25
+ echo "Error: Please install via \`sudo apt-get install jq\`." >&2
26
+ echo "Error: If you do not have sudo, please download from https://stedolan.github.io/jq/download/." >&2
27
+ return 1
28
+ fi
29
+ if ! command -v yq > /dev/null; then
30
+ echo "Error: It seems yq is not installed." >&2
31
+ echo "Error: Please install via \`pip install yq\`." >&2
32
+ return 1
33
+ fi
ParallelWaveGAN/egs/jnas/voc1/run.sh ADDED
@@ -0,0 +1,158 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ . ./cmd.sh || exit 1;
7
+ . ./path.sh || exit 1;
8
+
9
+ # basic settings
10
+ stage=0 # stage to start
11
+ stop_stage=100 # stage to stop
12
+ verbose=1 # verbosity level (lower is less info)
13
+ n_gpus=1 # number of gpus in training
14
+ n_jobs=16 # number of parallel jobs in feature extraction
15
+
16
+ # NOTE(kan-bayashi): renamed to conf to avoid conflict in parse_options.sh
17
+ conf=conf/parallel_wavegan.v1.yaml
18
+
19
+ # directory path setting
20
+ db_root=/database/JNAS # database directory
21
+ dumpdir=dump # directory to dump features
22
+
23
+ # training related setting
24
+ tag="" # tag for directory to save model
25
+ resume="" # checkpoint path to resume training
26
+ # (e.g. <path>/<to>/checkpoint-10000steps.pkl)
27
+
28
+ # decoding related setting
29
+ checkpoint="" # checkpoint path to be used for decoding
30
+ # if not provided, the latest one will be used
31
+ # (e.g. <path>/<to>/checkpoint-400000steps.pkl)
32
+
33
+ # shellcheck disable=SC1091
34
+ . utils/parse_options.sh || exit 1;
35
+
36
+ train_set="train_nodev" # name of training data directory
37
+ dev_set="dev" # name of development data directory
38
+ eval_set="eval" # name of evaluation data directory
39
+
40
+ set -euo pipefail
41
+
42
+ if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
43
+ echo "Stage 0: Data preparation"
44
+ local/data_prep.sh \
45
+ --train_set "${train_set}" \
46
+ --dev_set "${dev_set}" \
47
+ --eval_set "${eval_set}" \
48
+ "${db_root}" data conf/train_speakers.txt
49
+ fi
50
+
51
+ stats_ext=$(grep -q "hdf5" <(yq ".format" "${conf}") && echo "h5" || echo "npy")
52
+ if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
53
+ echo "Stage 1: Feature extraction"
54
+ # extract raw features
55
+ pids=()
56
+ for name in "${train_set}" "${dev_set}" "${eval_set}"; do
57
+ (
58
+ [ ! -e "${dumpdir}/${name}/raw" ] && mkdir -p "${dumpdir}/${name}/raw"
59
+ echo "Feature extraction start. See the progress via ${dumpdir}/${name}/raw/preprocessing.*.log."
60
+ utils/make_subset_data.sh "data/${name}" "${n_jobs}" "${dumpdir}/${name}/raw"
61
+ ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/raw/preprocessing.JOB.log" \
62
+ parallel-wavegan-preprocess \
63
+ --config "${conf}" \
64
+ --scp "${dumpdir}/${name}/raw/wav.JOB.scp" \
65
+ --dumpdir "${dumpdir}/${name}/raw/dump.JOB" \
66
+ --verbose "${verbose}"
67
+ echo "Successfully finished feature extraction of ${name} set."
68
+ ) &
69
+ pids+=($!)
70
+ done
71
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
72
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs failed." && exit 1;
73
+ echo "Successfully finished feature extraction."
74
+
75
+ # calculate statistics for normalization
76
+ echo "Statistics computation start. See the progress via ${dumpdir}/${train_set}/compute_statistics.log."
77
+ ${train_cmd} "${dumpdir}/${train_set}/compute_statistics.log" \
78
+ parallel-wavegan-compute-statistics \
79
+ --config "${conf}" \
80
+ --rootdir "${dumpdir}/${train_set}/raw" \
81
+ --dumpdir "${dumpdir}/${train_set}" \
82
+ --verbose "${verbose}"
83
+ echo "Successfully finished calculation of statistics."
84
+
85
+ # normalize and dump them
86
+ pids=()
87
+ for name in "${train_set}" "${dev_set}" "${eval_set}"; do
88
+ (
89
+ [ ! -e "${dumpdir}/${name}/norm" ] && mkdir -p "${dumpdir}/${name}/norm"
90
+ echo "Normalization start. See the progress via ${dumpdir}/${name}/norm/normalize.*.log."
91
+ ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/norm/normalize.JOB.log" \
92
+ parallel-wavegan-normalize \
93
+ --config "${conf}" \
94
+ --stats "${dumpdir}/${train_set}/stats.${stats_ext}" \
95
+ --rootdir "${dumpdir}/${name}/raw/dump.JOB" \
96
+ --dumpdir "${dumpdir}/${name}/norm/dump.JOB" \
97
+ --verbose "${verbose}"
98
+ echo "Successfully finished normalization of ${name} set."
99
+ ) &
100
+ pids+=($!)
101
+ done
102
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
103
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs failed." && exit 1;
104
+ echo "Successfully finished normalization."
105
+ fi
106
+
107
+ if [ -z "${tag}" ]; then
108
+ expdir="exp/${train_set}_jnas_$(basename "${conf}" .yaml)"
109
+ else
110
+ expdir="exp/${train_set}_jnas_${tag}"
111
+ fi
112
+ if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
113
+ echo "Stage 2: Network training"
114
+ [ ! -e "${expdir}" ] && mkdir -p "${expdir}"
115
+ cp "${dumpdir}/${train_set}/stats.${stats_ext}" "${expdir}"
116
+ if [ "${n_gpus}" -gt 1 ]; then
117
+ train="python -m parallel_wavegan.distributed.launch --nproc_per_node ${n_gpus} -c parallel-wavegan-train"
118
+ else
119
+ train="parallel-wavegan-train"
120
+ fi
121
+ echo "Training start. See the progress via ${expdir}/train.log."
122
+ ${cuda_cmd} --gpu "${n_gpus}" "${expdir}/train.log" \
123
+ ${train} \
124
+ --config "${conf}" \
125
+ --train-dumpdir "${dumpdir}/${train_set}/norm" \
126
+ --dev-dumpdir "${dumpdir}/${dev_set}/norm" \
127
+ --outdir "${expdir}" \
128
+ --resume "${resume}" \
129
+ --verbose "${verbose}"
130
+ echo "Successfully finished training."
131
+ fi
132
+
133
+ if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
134
+ echo "Stage 3: Network decoding"
135
+ # shellcheck disable=SC2012
136
+ [ -z "${checkpoint}" ] && checkpoint="$(ls -dt "${expdir}"/*.pkl | head -1 || true)"
137
+ outdir="${expdir}/wav/$(basename "${checkpoint}" .pkl)"
138
+ pids=()
139
+ for name in "${dev_set}" "${eval_set}"; do
140
+ (
141
+ [ ! -e "${outdir}/${name}" ] && mkdir -p "${outdir}/${name}"
142
+ [ "${n_gpus}" -gt 1 ] && n_gpus=1
143
+ echo "Decoding start. See the progress via ${outdir}/${name}/decode.log."
144
+ ${cuda_cmd} --gpu "${n_gpus}" "${outdir}/${name}/decode.log" \
145
+ parallel-wavegan-decode \
146
+ --dumpdir "${dumpdir}/${name}/norm" \
147
+ --checkpoint "${checkpoint}" \
148
+ --outdir "${outdir}/${name}" \
149
+ --verbose "${verbose}"
150
+ echo "Successfully finished decoding of ${name} set."
151
+ ) &
152
+ pids+=($!)
153
+ done
154
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
155
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs failed." && exit 1;
156
+ echo "Successfully finished decoding."
157
+ fi
158
+ echo "Finished."
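A minimal invocation sketch for this recipe script, using only options it parses above (the database path is illustrative):

    # data preparation and feature extraction (stages 0-1)
    ./run.sh --stage 0 --stop_stage 1 --db_root /database/JNAS --n_jobs 16
    # training and decoding (stages 2-3)
    ./run.sh --stage 2 --stop_stage 3 --n_gpus 1 --conf conf/parallel_wavegan.v1.yaml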
ParallelWaveGAN/egs/jnas/voc1/utils ADDED
@@ -0,0 +1 @@
1
+ ../../../utils
ParallelWaveGAN/egs/jsss/voc1/cmd.sh ADDED
@@ -0,0 +1,91 @@
1
+ # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
2
+ # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
3
+ # e.g.
4
+ # run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
5
+ #
6
+ # Options:
7
+ # --time <time>: Limit the maximum time to execute.
8
+ # --mem <mem>: Limit the maximum memory usage.
9
+ # --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
10
+ # --num-threads <ngpu>: Specify the number of CPU core.
11
+ # --gpu <ngpu>: Specify the number of GPU devices.
12
+ # --config: Change the configuration file from default.
13
+ #
14
+ # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
15
+ # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
16
+ # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
17
+ # Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
18
+ #
19
+ # run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
20
+ # These options are mapping to specific options for each backend and
21
+ # it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
22
+ # If jobs failed, your configuration might be wrong for your environment.
23
+ #
24
+ #
25
+ # The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
26
+ # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
27
+ # =========================================================~
28
+
29
+
30
+ # Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
31
+ cmd_backend="local"
32
+
33
+ # Local machine, without any Job scheduling system
34
+ if [ "${cmd_backend}" = local ]; then
35
+
36
+ # The other usage
37
+ export train_cmd="utils/run.pl"
38
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
39
+ export cuda_cmd="utils/run.pl"
40
+ # Used for "*_recog.py"
41
+ export decode_cmd="utils/run.pl"
42
+
43
+ # Local machine, without any Job scheduling system
44
+ elif [ "${cmd_backend}" = stdout ]; then
45
+
46
+ # The other usage
47
+ export train_cmd="utils/stdout.pl"
48
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
49
+ export cuda_cmd="utils/stdout.pl"
50
+ # Used for "*_recog.py"
51
+ export decode_cmd="utils/stdout.pl"
52
+
53
+ # "qsub" (SGE, Torque, PBS, etc.)
54
+ elif [ "${cmd_backend}" = sge ]; then
55
+ # The default setting is written in conf/queue.conf.
56
+ # You must change "-q g.q" for the "queue" for your environment.
57
+ # To know the "queue" names, type "qhost -q"
58
+ # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
59
+
60
+ export train_cmd="utils/queue.pl"
61
+ export cuda_cmd="utils/queue.pl"
62
+ export decode_cmd="utils/queue.pl"
63
+
64
+ # "sbatch" (Slurm)
65
+ elif [ "${cmd_backend}" = slurm ]; then
66
+ # The default setting is written in conf/slurm.conf.
67
+ # You must change "-p cpu" and "-p gpu" to the partition names for your environment.
68
+ # To know the "partition" names, type "sinfo".
69
+ # You can use "--gpu *" by default for slurm and it is interpreted as "--gres gpu:*"
70
+ # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
71
+
72
+ export train_cmd="utils/slurm.pl"
73
+ export cuda_cmd="utils/slurm.pl"
74
+ export decode_cmd="utils/slurm.pl"
75
+
76
+ elif [ "${cmd_backend}" = ssh ]; then
77
+ # You have to create ".queue/machines" to specify the host to execute jobs.
78
+ # e.g. .queue/machines
79
+ # host1
80
+ # host2
81
+ # host3
82
+ # Assuming you can login them without any password, i.e. You have to set ssh keys.
83
+
84
+ export train_cmd="utils/ssh.pl"
85
+ export cuda_cmd="utils/ssh.pl"
86
+ export decode_cmd="utils/ssh.pl"
87
+
88
+ else
89
+ echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
90
+ return 1
91
+ fi
ParallelWaveGAN/egs/jsss/voc1/conf/parallel_wavegan.v1.yaml ADDED
@@ -0,0 +1,122 @@
1
+ # This is the hyperparameter configuration file for Parallel WaveGAN.
2
+ # Please make sure this is adjusted for the JSSS dataset. If you want to
3
+ # apply to the other dataset, you might need to carefully change some parameters.
4
+ # This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN.
5
+
6
+ ###########################################################
7
+ # FEATURE EXTRACTION SETTING #
8
+ ###########################################################
9
+ sampling_rate: 24000 # Sampling rate.
10
+ fft_size: 2048 # FFT size.
11
+ hop_size: 300 # Hop size.
12
+ win_length: 1200 # Window length.
13
+ # If set to null, it will be the same as fft_size.
14
+ window: "hann" # Window function.
15
+ num_mels: 80 # Number of mel basis.
16
+ fmin: 80 # Minimum freq in mel basis calculation.
17
+ fmax: 7600 # Maximum frequency in mel basis calculation.
18
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
19
+ trim_silence: false # Whether to trim the start and end of silence.
20
+ trim_threshold_in_db: 40 # Need to tune carefully if the recording is not good.
21
+ trim_frame_size: 2048 # Frame size in trimming.
22
+ trim_hop_size: 512 # Hop size in trimming.
23
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
24
+
25
+ ###########################################################
26
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
27
+ ###########################################################
28
+ generator_params:
29
+ in_channels: 1 # Number of input channels.
30
+ out_channels: 1 # Number of output channels.
31
+ kernel_size: 3 # Kernel size of dilated convolution.
32
+ layers: 30 # Number of residual block layers.
33
+ stacks: 3 # Number of stacks i.e., dilation cycles.
34
+ residual_channels: 64 # Number of channels in residual conv.
35
+ gate_channels: 128 # Number of channels in gated conv.
36
+ skip_channels: 64 # Number of channels in skip conv.
37
+ aux_channels: 80 # Number of channels for auxiliary feature conv.
38
+ # Must be the same as num_mels.
39
+ aux_context_window: 2 # Context window size for auxiliary feature.
40
+ # If set to 2, previous 2 and future 2 frames will be considered.
41
+ dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
42
+ use_weight_norm: true # Whether to use weight norm.
43
+ # If set to true, it will be applied to all of the conv layers.
44
+ upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
45
+ upsample_params: # Upsampling network parameters.
46
+ upsample_scales: [4, 5, 3, 5] # Upsampling scales. Product of these must be the same as hop size.
47
+
48
+ ###########################################################
49
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
50
+ ###########################################################
51
+ discriminator_params:
52
+ in_channels: 1 # Number of input channels.
53
+ out_channels: 1 # Number of output channels.
54
+ kernel_size: 3 # Kernel size of conv layers.
55
+ layers: 10 # Number of conv layers.
56
+ conv_channels: 64 # Number of channels in conv layers.
57
+ bias: true # Whether to use bias parameter in conv.
58
+ use_weight_norm: true # Whether to use weight norm.
59
+ # If set to true, it will be applied to all of the conv layers.
60
+ nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
61
+ nonlinear_activation_params: # Nonlinear function parameters
62
+ negative_slope: 0.2 # Alpha in LeakyReLU.
63
+
64
+ ###########################################################
65
+ # STFT LOSS SETTING #
66
+ ###########################################################
67
+ stft_loss_params:
68
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
69
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
70
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
71
+ window: "hann_window" # Window function for STFT-based loss
72
+
73
+ ###########################################################
74
+ # ADVERSARIAL LOSS SETTING #
75
+ ###########################################################
76
+ lambda_adv: 4.0 # Loss balancing coefficient.
77
+
78
+ ###########################################################
79
+ # DATA LOADER SETTING #
80
+ ###########################################################
81
+ batch_size: 6 # Batch size.
82
+ batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by hop_size.
83
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
84
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
85
+ remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
86
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
87
+
88
+ ###########################################################
89
+ # OPTIMIZER & SCHEDULER SETTING #
90
+ ###########################################################
91
+ generator_optimizer_params:
92
+ lr: 0.0001 # Generator's learning rate.
93
+ eps: 1.0e-6 # Generator's epsilon.
94
+ weight_decay: 0.0 # Generator's weight decay coefficient.
95
+ generator_scheduler_params:
96
+ step_size: 200000 # Generator's scheduler step size.
97
+ gamma: 0.5 # Generator's scheduler gamma.
98
+ # At each step size, lr will be multiplied by this parameter.
99
+ generator_grad_norm: 10 # Generator's gradient norm.
100
+ discriminator_optimizer_params:
101
+ lr: 0.00005 # Discriminator's learning rate.
102
+ eps: 1.0e-6 # Discriminator's epsilon.
103
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
104
+ discriminator_scheduler_params:
105
+ step_size: 200000 # Discriminator's scheduler step size.
106
+ gamma: 0.5 # Discriminator's scheduler gamma.
107
+ # At each step size, lr will be multiplied by this parameter.
108
+ discriminator_grad_norm: 1 # Discriminator's gradient norm.
109
+
110
+ ###########################################################
111
+ # INTERVAL SETTING #
112
+ ###########################################################
113
+ discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
114
+ train_max_steps: 400000 # Number of training steps.
115
+ save_interval_steps: 5000 # Interval steps to save checkpoint.
116
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
117
+ log_interval_steps: 100 # Interval steps to record the training log.
118
+
119
+ ###########################################################
120
+ # OTHER SETTING #
121
+ ###########################################################
122
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
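A quick sanity check of the constraints stated in the comments above (a sketch using shell one-liners): the product of upsample_scales must equal hop_size, and batch_max_steps must be divisible by hop_size.

    python -c "print(4 * 5 * 3 * 5)"   # 300 == hop_size
    python -c "print(25500 % 300)"     # 0, so batch_max_steps is divisible by hop_size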
ParallelWaveGAN/egs/jsss/voc1/conf/slurm.conf ADDED
@@ -0,0 +1,12 @@
1
+ # Default configuration
2
+ command sbatch --export=PATH --ntasks-per-node=1
3
+ option time=* --time $0
4
+ option mem=* --mem-per-cpu $0
5
+ option mem=0 # Do not add anything to qsub_opts
6
+ option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
7
+ option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts
8
+ default gpu=0
9
+ option gpu=0 -p cpu
10
+ option gpu=* -p gpu --gres=gpu:$0
11
+ # note: the --max-jobs-run option is supported as a special case
12
+ # by slurm.pl and you don't have to handle it in the config file.
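As an illustration of how this mapping is applied (a sketch; the log path and command mirror how run.sh submits jobs in these recipes): a call with "--gpu 1" should be turned into an sbatch submission with "-p gpu --gres=gpu:1" according to the "option gpu=*" line above.

    utils/slurm.pl --gpu 1 exp/train.log parallel-wavegan-train --config conf/parallel_wavegan.v1.yaml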
ParallelWaveGAN/egs/jsss/voc1/local/data_download.sh ADDED
@@ -0,0 +1,41 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2020 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ # Download JSSS Corpus
7
+
8
+ # shellcheck disable=SC1091
9
+ . ./path.sh || exit 1
10
+
11
+ download_dir=$1
12
+
13
+ # check arguments
14
+ if [ $# != 1 ]; then
15
+ echo "Usage: $0 <download_dir>"
16
+ exit 1
17
+ fi
18
+
19
+ set -euo pipefail
20
+
21
+ url="https://drive.google.com/a/g.sp.m.is.nagoya-u.ac.jp/uc?id=1NyiZCXkYTdYBNtD1B-IMAYCVa-0SQsKX"
22
+ if [ ! -e "${download_dir}/jsss_ver1" ]; then
23
+ utils/download_from_google_drive.sh "${url}" "${download_dir}" zip
24
+ echo "Successfully downloaded JSSS corpus."
25
+ else
26
+ echo "Already exists. Skipped."
27
+ fi
28
+
29
+ cwd=$(pwd)
30
+ if [ ! -e "${download_dir}/JSSSLabel" ]; then
31
+ echo "Downloading full-context labels for JSSS corpus..."
32
+ cd "${download_dir}"
33
+ git clone https://github.com/kan-bayashi/JSSSLabel
34
+ for name in long-form short-form simplification summarization; do
35
+ cp -vr JSSSLabel/${name} jsss_ver1
36
+ done
37
+ cd "${cwd}"
38
+ echo "Successfully downloaded JSSS label."
39
+ else
40
+ echo "Already exists. Skipped."
41
+ fi
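A minimal usage sketch, matching how run.sh in this recipe invokes the script (downloads is the recipe's default download directory):

    local/data_download.sh downloads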
ParallelWaveGAN/egs/jsss/voc1/local/data_prep.sh ADDED
@@ -0,0 +1,180 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2020 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ # Prepare kaldi-style data directory for JSSS corpus
7
+
8
+ fs=24000
9
+ num_dev=50
10
+ num_eval=50
11
+ train_set="train_nodev"
12
+ dev_set="dev"
13
+ eval_set="eval"
14
+ shuffle=false
15
+
16
+ # shellcheck disable=SC1091
17
+ . utils/parse_options.sh || exit 1;
18
+
19
+ db=$1
20
+ data_dir_root=$2
21
+
22
+ # check arguments
23
+ if [ $# != 2 ]; then
24
+ echo "Usage: $0 [Options] <db> <data_dir>"
25
+ echo "e.g.: $0 downloads/jsss_ver1 data"
26
+ echo ""
27
+ echo "Options:"
28
+ echo " --fs: target sampling rate (default=24000)."
29
+ echo " --num_dev: number of development utterances (default=50)."
30
+ echo " --num_eval: number of evaluation utterances (default=50)."
31
+ echo " --train_set: name of train set (default=train_nodev)."
32
+ echo " --dev_set: name of dev set (default=dev)."
33
+ echo " --eval_set: name of eval set (default=eval)."
34
+ echo " --shuffle: whether to perform shuffle in making dev / eval set (default=false)."
35
+ exit 1
36
+ fi
37
+
38
+ set -euo pipefail
39
+
40
+ ######################################
41
+ # process data without segments #
42
+ ######################################
43
+ dsets_without_segments="
44
+ short-form/basic5000
45
+ short-form/onomatopee300
46
+ short-form/voiceactress100
47
+ simplification
48
+ "
49
+ for dset in ${dsets_without_segments}; do
50
+ # check directory existence
51
+ _data_dir=${data_dir_root}/$(basename "${dset}")
52
+ [ ! -e "${_data_dir}" ] && mkdir -p "${_data_dir}"
53
+
54
+ # set filenames
55
+ scp=${_data_dir}/wav.scp
56
+ segments=${_data_dir}/segments
57
+
58
+ # check file existence
59
+ [ -e "${scp}" ] && rm "${scp}"
60
+ [ -e "${segments}" ] && rm "${segments}"
61
+
62
+ # make wav.scp and segments
63
+ find "${db}/${dset}/wav24kHz16bit" -name "*.wav" | sort | while read -r filename; do
64
+ utt_id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
65
+ lab_filename="${db}/${dset}/lab/$(basename "${filename}" .wav).lab"
66
+ if [ ! -e "${lab_filename}" ]; then
67
+ echo "${lab_filename} does not exist. Skipped."
68
+ continue
69
+ fi
70
+ start_sec=$(head -n 1 "${lab_filename}" | cut -d " " -f 2)
71
+ end_sec=$(tail -n 1 "${lab_filename}" | cut -d " " -f 1)
72
+ echo "${utt_id} ${utt_id} ${start_sec} ${end_sec}" >> "${segments}"
73
+ if [ "${fs}" -eq 24000 ]; then
74
+ # default sampling rate
75
+ echo "${utt_id} ${filename}" >> "${scp}"
76
+ else
77
+ echo "${utt_id} sox ${filename} -t wav -r $fs - |" >> "${scp}"
78
+ fi
79
+ done
80
+ echo "Successfully prepared ${dset}."
81
+ done
82
+
83
+ ######################################
84
+ # process data with segments #
85
+ ######################################
86
+ dsets_with_segments="
87
+ long-form/katsura-masakazu
88
+ long-form/udon
89
+ long-form/washington-dc
90
+ summarization
91
+ "
92
+ for dset in ${dsets_with_segments}; do
93
+ # check directory existence
94
+ _data_dir=${data_dir_root}/$(basename "${dset}")
95
+ [ ! -e "${_data_dir}" ] && mkdir -p "${_data_dir}"
96
+
97
+ # set filenames
98
+ scp=${_data_dir}/wav.scp
99
+ segments=${_data_dir}/segments
100
+
101
+ # check file existence
102
+ [ -e "${scp}" ] && rm "${scp}"
103
+ [ -e "${segments}" ] && rm "${segments}"
104
+
105
+ # make wav.scp
106
+ find "${db}/${dset}/wav24kHz16bit" -name "*.wav" | sort | while read -r filename; do
107
+ wav_id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
108
+ if [ "${fs}" -eq 24000 ]; then
109
+ # default sampling rate
110
+ echo "${wav_id} ${filename}" >> "${scp}"
111
+ else
112
+ echo "${wav_id} sox ${filename} -t wav -r $fs - |" >> "${scp}"
113
+ fi
114
+ done
115
+
116
+ # make segments
117
+ find "${db}/${dset}/transcript_utf8" -name "*.txt" | sort | while read -r filename; do
118
+ wav_id=$(basename "${filename}" .txt)
119
+ while read -r line; do
120
+ start_sec=$(echo "${line}" | cut -f 1)
121
+ end_sec=$(echo "${line}" | cut -f 2)
122
+ utt_id=${wav_id}
123
+ utt_id+="_$(printf %010d "$(echo "${start_sec}" | tr -d "." | sed -e "s/^[0]*//g")")"
124
+ utt_id+="_$(printf %010d "$(echo "${end_sec}" | tr -d "." | sed -e "s/^[0]*//g")")"
125
+
126
+ # modify segment information with force alignment results
127
+ lab_filename=${db}/${dset}/lab/${utt_id}.lab
128
+ if [ ! -e "${lab_filename}" ]; then
129
+ echo "${lab_filename} does not exist. Skipped."
130
+ continue
131
+ fi
132
+ start_sec_offset=$(head -n 1 "${lab_filename}" | cut -d " " -f 2)
133
+ end_sec_offset=$(tail -n 1 "${lab_filename}" | cut -d " " -f 1)
134
+ start_sec=$(python -c "print(${start_sec} + ${start_sec_offset})")
135
+ end_sec=$(python -c "print(${start_sec} + ${end_sec_offset} - ${start_sec_offset})")
136
+ echo "${utt_id} ${wav_id} ${start_sec} ${end_sec}" >> "${segments}"
137
+ done < "${filename}"
138
+ done
139
+
140
+ # fix
141
+ echo "Successfully prepared ${dset}."
142
+ done
143
+
144
+ ######################################
145
+ # combine and split data #
146
+ ######################################
147
+ # combine all data
148
+ combined_data_dirs=""
149
+ for dset in ${dsets_without_segments} ${dsets_with_segments}; do
150
+ combined_data_dirs+="${data_dir_root}/$(basename "${dset}") "
151
+ done
152
+ # shellcheck disable=SC2086
153
+ utils/combine_data.sh "${data_dir_root}/all" ${combined_data_dirs}
154
+ # shellcheck disable=SC2086
155
+ rm -rf ${combined_data_dirs}
156
+
157
+ # split
158
+ num_all=$(wc -l < "${data_dir_root}/all/segments")
159
+ num_deveval=$((num_dev + num_eval))
160
+ num_train=$((num_all - num_deveval))
161
+ utils/split_data.sh \
162
+ --num_first "${num_deveval}" \
163
+ --num_second "${num_train}" \
164
+ --shuffle "${shuffle}" \
165
+ "${data_dir_root}/all" \
166
+ "${data_dir_root}/deveval" \
167
+ "${data_dir_root}/${train_set}"
168
+ utils/split_data.sh \
169
+ --num_first "${num_eval}" \
170
+ --num_second "${num_dev}" \
171
+ --shuffle "${shuffle}" \
172
+ "${data_dir_root}/deveval" \
173
+ "${data_dir_root}/${eval_set}" \
174
+ "${data_dir_root}/${dev_set}"
175
+
176
+ # remove tmp directories
177
+ rm -rf "${data_dir_root}/all"
178
+ rm -rf "${data_dir_root}/deveval"
179
+
180
+ echo "Successfully prepared data."
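A minimal invocation sketch following the usage message above, with the sampling rate taken from conf/parallel_wavegan.v1.yaml and the default split sizes:

    local/data_prep.sh --fs 24000 --num_dev 50 --num_eval 50 downloads/jsss_ver1 data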
ParallelWaveGAN/egs/jsss/voc1/path.sh ADDED
@@ -0,0 +1,33 @@
1
+ # cuda related
2
+ export CUDA_HOME=/usr/local/cuda-10.0
3
+ export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
4
+
5
+ # path related
6
+ export PRJ_ROOT="${PWD}/../../.."
7
+ if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
8
+ # shellcheck disable=SC1090
9
+ . "${PRJ_ROOT}/tools/venv/bin/activate"
10
+ fi
11
+
12
+ # python related
13
+ export OMP_NUM_THREADS=1
14
+ export PYTHONIOENCODING=UTF-8
15
+ export MPL_BACKEND=Agg
16
+
17
+ # check installation
18
+ if ! command -v parallel-wavegan-train > /dev/null; then
19
+ echo "Error: It seems setup is not finished." >&2
20
+ echo "Error: Please setup your environment by following README.md" >&2
21
+ return 1
22
+ fi
23
+ if ! command -v jq > /dev/null; then
24
+ echo "Error: It seems jq is not installed." >&2
25
+ echo "Error: Please install via \`sudo apt-get install jq\`." >&2
26
+ echo "Error: If you do not have sudo, please download from https://stedolan.github.io/jq/download/." >&2
27
+ return 1
28
+ fi
29
+ if ! command -v yq > /dev/null; then
30
+ echo "Error: It seems yq is not installed." >&2
31
+ echo "Error: Please install via \`pip install yq\`." >&2
32
+ return 1
33
+ fi
ParallelWaveGAN/egs/jsss/voc1/run.sh ADDED
@@ -0,0 +1,186 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2020 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ . ./cmd.sh || exit 1;
7
+ . ./path.sh || exit 1;
8
+
9
+ # basic settings
10
+ stage=-1 # stage to start
11
+ stop_stage=100 # stage to stop
12
+ verbose=1 # verbosity level (lower is less info)
13
+ n_gpus=1 # number of gpus in training
14
+ n_jobs=4 # number of parallel jobs in feature extraction
15
+
16
+ # NOTE(kan-bayashi): renamed to conf to avoid conflict in parse_options.sh
17
+ conf=conf/parallel_wavegan.v1.yaml
18
+
19
+ # directory path setting
20
+ download_dir=downloads # directory to save downloaded files
21
+ dumpdir=dump # directory to dump features
22
+
23
+ # subset setting
24
+ shuffle=false # whether to shuffle the data to create subset
25
+ num_dev=50 # the number of development data
26
+ num_eval=50 # the number of evaluation data
27
+ # (if set to 0, the same dev set is used as eval set)
28
+
29
+ # training related setting
30
+ tag="" # tag for directory to save model
31
+ resume="" # checkpoint path to resume training
32
+ # (e.g. <path>/<to>/checkpoint-10000steps.pkl)
33
+ pretrain="" # checkpoint path to load pretrained parameters
34
+ # (e.g. ../../jsut/<path>/<to>/checkpoint-400000steps.pkl)
35
+
36
+ # decoding related setting
37
+ checkpoint="" # checkpoint path to be used for decoding
38
+ # if not provided, the latest one will be used
39
+ # (e.g. <path>/<to>/checkpoint-400000steps.pkl)
40
+
41
+ # shellcheck disable=SC1091
42
+ . utils/parse_options.sh || exit 1;
43
+
44
+ train_set="train_nodev" # name of training data directory
45
+ dev_set="dev" # name of development data directory
46
+ eval_set="eval" # name of evaluation data directory
47
+
48
+ set -euo pipefail
49
+
50
+ if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
51
+ echo "Stage -1: Data download"
52
+ local/data_download.sh "${download_dir}"
53
+ fi
54
+
55
+ if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
56
+ echo "Stage 0: Data preparation"
57
+ local/data_prep.sh \
58
+ --fs "$(yq ".sampling_rate" "${conf}")" \
59
+ --num_dev "${num_dev}" \
60
+ --num_eval "${num_eval}" \
61
+ --train_set "${train_set}" \
62
+ --dev_set "${dev_set}" \
63
+ --eval_set "${eval_set}" \
64
+ --shuffle "${shuffle}" \
65
+ "${download_dir}/jsss_ver1" data
66
+ fi
67
+
68
+ stats_ext=$(grep -q "hdf5" <(yq ".format" "${conf}") && echo "h5" || echo "npy")
69
+ if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
70
+ echo "Stage 1: Feature extraction"
71
+ # extract raw features
72
+ pids=()
73
+ for name in "${train_set}" "${dev_set}" "${eval_set}"; do
74
+ (
75
+ [ ! -e "${dumpdir}/${name}/raw" ] && mkdir -p "${dumpdir}/${name}/raw"
76
+ echo "Feature extraction start. See the progress via ${dumpdir}/${name}/raw/preprocessing.*.log."
77
+ utils/make_subset_data.sh "data/${name}" "${n_jobs}" "${dumpdir}/${name}/raw"
78
+ ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/raw/preprocessing.JOB.log" \
79
+ parallel-wavegan-preprocess \
80
+ --config "${conf}" \
81
+ --scp "${dumpdir}/${name}/raw/wav.JOB.scp" \
82
+ --dumpdir "${dumpdir}/${name}/raw/dump.JOB" \
83
+ --verbose "${verbose}"
84
+ echo "Successfully finished feature extraction of ${name} set."
85
+ ) &
86
+ pids+=($!)
87
+ done
88
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
89
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs failed." && exit 1;
90
+ echo "Successfully finished feature extraction."
91
+
92
+ # calculate statistics for normalization
93
+ if [ -z "${pretrain}" ]; then
94
+ # calculate statistics for normalization
95
+ echo "Statistics computation start. See the progress via ${dumpdir}/${train_set}/compute_statistics.log."
96
+ ${train_cmd} "${dumpdir}/${train_set}/compute_statistics.log" \
97
+ parallel-wavegan-compute-statistics \
98
+ --config "${conf}" \
99
+ --rootdir "${dumpdir}/${train_set}/raw" \
100
+ --dumpdir "${dumpdir}/${train_set}" \
101
+ --verbose "${verbose}"
102
+ echo "Successfully finished calculation of statistics."
103
+ else
104
+ echo "Use statistics of pretrained model. Skip statistics computation."
105
+ cp "$(dirname "${pretrain}")/stats.${stats_ext}" "${dumpdir}/${train_set}"
106
+ fi
107
+
108
+ # normalize and dump them
109
+ pids=()
110
+ for name in "${train_set}" "${dev_set}" "${eval_set}"; do
111
+ (
112
+ [ ! -e "${dumpdir}/${name}/norm" ] && mkdir -p "${dumpdir}/${name}/norm"
113
+ echo "Normalization start. See the progress via ${dumpdir}/${name}/norm/normalize.*.log."
114
+ ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/norm/normalize.JOB.log" \
115
+ parallel-wavegan-normalize \
116
+ --config "${conf}" \
117
+ --stats "${dumpdir}/${train_set}/stats.${stats_ext}" \
118
+ --rootdir "${dumpdir}/${name}/raw/dump.JOB" \
119
+ --dumpdir "${dumpdir}/${name}/norm/dump.JOB" \
120
+ --verbose "${verbose}"
121
+ echo "Successfully finished normalization of ${name} set."
122
+ ) &
123
+ pids+=($!)
124
+ done
125
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
126
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs failed." && exit 1;
127
+ echo "Successfully finished normalization."
128
+ fi
129
+
130
+ if [ -z "${tag}" ]; then
131
+ expdir="exp/${train_set}_jsss_$(basename "${conf}" .yaml)"
132
+ if [ -n "${pretrain}" ]; then
133
+ pretrain_tag=$(basename "$(dirname "${pretrain}")")
134
+ expdir+="_${pretrain_tag}"
135
+ fi
136
+ else
137
+ expdir="exp/${train_set}_jsss_${tag}"
138
+ fi
139
+ if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
140
+ echo "Stage 2: Network training"
141
+ [ ! -e "${expdir}" ] && mkdir -p "${expdir}"
142
+ cp "${dumpdir}/${train_set}/stats.${stats_ext}" "${expdir}"
143
+ if [ "${n_gpus}" -gt 1 ]; then
144
+ train="python -m parallel_wavegan.distributed.launch --nproc_per_node ${n_gpus} -c parallel-wavegan-train"
145
+ else
146
+ train="parallel-wavegan-train"
147
+ fi
148
+ echo "Training start. See the progress via ${expdir}/train.log."
149
+ ${cuda_cmd} --gpu "${n_gpus}" "${expdir}/train.log" \
150
+ ${train} \
151
+ --config "${conf}" \
152
+ --train-dumpdir "${dumpdir}/${train_set}/norm" \
153
+ --dev-dumpdir "${dumpdir}/${dev_set}/norm" \
154
+ --outdir "${expdir}" \
155
+ --resume "${resume}" \
156
+ --pretrain "${pretrain}" \
157
+ --verbose "${verbose}"
158
+ echo "Successfully finished training."
159
+ fi
160
+
161
+ if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
162
+ echo "Stage 3: Network decoding"
163
+ # shellcheck disable=SC2012
164
+ [ -z "${checkpoint}" ] && checkpoint="$(ls -dt "${expdir}"/*.pkl | head -1 || true)"
165
+ outdir="${expdir}/wav/$(basename "${checkpoint}" .pkl)"
166
+ pids=()
167
+ for name in "${dev_set}" "${eval_set}"; do
168
+ (
169
+ [ ! -e "${outdir}/${name}" ] && mkdir -p "${outdir}/${name}"
170
+ [ "${n_gpus}" -gt 1 ] && n_gpus=1
171
+ echo "Decoding start. See the progress via ${outdir}/${name}/decode.log."
172
+ ${cuda_cmd} --gpu "${n_gpus}" "${outdir}/${name}/decode.log" \
173
+ parallel-wavegan-decode \
174
+ --dumpdir "${dumpdir}/${name}/norm" \
175
+ --checkpoint "${checkpoint}" \
176
+ --outdir "${outdir}/${name}" \
177
+ --verbose "${verbose}"
178
+ echo "Successfully finished decoding of ${name} set."
179
+ ) &
180
+ pids+=($!)
181
+ done
182
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
183
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs failed." && exit 1;
184
+ echo "Successfully finished decoding."
185
+ fi
186
+ echo "Finished."
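A sketch of fine-tuning from a pretrained JSUT vocoder via the --pretrain option parsed above; the checkpoint path is a placeholder, as in the script's own comment:

    ./run.sh --stage 1 --stop_stage 3 --pretrain "../../jsut/<path>/<to>/checkpoint-400000steps.pkl"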
ParallelWaveGAN/egs/jsss/voc1/utils ADDED
@@ -0,0 +1 @@
1
+ ../../../utils
ParallelWaveGAN/egs/jsut/voc1/cmd.sh ADDED
@@ -0,0 +1,91 @@
1
+ # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
2
+ # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
3
+ # e.g.
4
+ # run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
5
+ #
6
+ # Options:
7
+ # --time <time>: Limit the maximum time to execute.
8
+ # --mem <mem>: Limit the maximum memory usage.
9
+ # --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
10
+ # --num-threads <ngpu>: Specify the number of CPU core.
11
+ # --gpu <ngpu>: Specify the number of GPU devices.
12
+ # --config: Change the configuration file from default.
13
+ #
14
+ # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
15
+ # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
16
+ # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
17
+ # Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
18
+ #
19
+ # run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
20
+ # These options are mapping to specific options for each backend and
21
+ # it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
22
+ # If jobs failed, your configuration might be wrong for your environment.
23
+ #
24
+ #
25
+ # The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
26
+ # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
27
+ # =========================================================~
28
+
29
+
30
+ # Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
31
+ cmd_backend="local"
32
+
33
+ # Local machine, without any Job scheduling system
34
+ if [ "${cmd_backend}" = local ]; then
35
+
36
+ # The other usage
37
+ export train_cmd="utils/run.pl"
38
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
39
+ export cuda_cmd="utils/run.pl"
40
+ # Used for "*_recog.py"
41
+ export decode_cmd="utils/run.pl"
42
+
43
+ # Local machine, without any Job scheduling system
44
+ elif [ "${cmd_backend}" = stdout ]; then
45
+
46
+ # The other usage
47
+ export train_cmd="utils/stdout.pl"
48
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
49
+ export cuda_cmd="utils/stdout.pl"
50
+ # Used for "*_recog.py"
51
+ export decode_cmd="utils/stdout.pl"
52
+
53
+ # "qsub" (SGE, Torque, PBS, etc.)
54
+ elif [ "${cmd_backend}" = sge ]; then
55
+ # The default setting is written in conf/queue.conf.
56
+ # You must change "-q g.q" for the "queue" for your environment.
57
+ # To know the "queue" names, type "qhost -q"
58
+ # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
59
+
60
+ export train_cmd="utils/queue.pl"
61
+ export cuda_cmd="utils/queue.pl"
62
+ export decode_cmd="utils/queue.pl"
63
+
64
+ # "sbatch" (Slurm)
65
+ elif [ "${cmd_backend}" = slurm ]; then
66
+ # The default setting is written in conf/slurm.conf.
67
+ # You must change "-p cpu" and "-p gpu" to the partition names for your environment.
68
+ # To know the "partition" names, type "sinfo".
69
+ # You can use "--gpu *" by default for slurm and it is interpreted as "--gres gpu:*"
70
+ # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
71
+
72
+ export train_cmd="utils/slurm.pl"
73
+ export cuda_cmd="utils/slurm.pl"
74
+ export decode_cmd="utils/slurm.pl"
75
+
76
+ elif [ "${cmd_backend}" = ssh ]; then
77
+ # You have to create ".queue/machines" to specify the host to execute jobs.
78
+ # e.g. .queue/machines
79
+ # host1
80
+ # host2
81
+ # host3
82
+ # Assuming you can login them without any password, i.e. You have to set ssh keys.
83
+
84
+ export train_cmd="utils/ssh.pl"
85
+ export cuda_cmd="utils/ssh.pl"
86
+ export decode_cmd="utils/ssh.pl"
87
+
88
+ else
89
+ echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
90
+ return 1
91
+ fi
ParallelWaveGAN/egs/jsut/voc1/conf/hifigan.v1.yaml ADDED
@@ -0,0 +1,180 @@
1
+ # This is the configuration file for JSUT dataset.
2
+ # This configuration is based on HiFiGAN V1, which is
3
+ # an official configuration. But I found that the optimizer
4
+ # setting does not work well with my implementation.
5
+ # So I changed optimizer settings as follows:
6
+ # - AdamW -> Adam
7
+ # - betas: [0.8, 0.99] -> betas: [0.5, 0.9]
8
+ # - Scheduler: ExponentialLR -> MultiStepLR
9
+ # To match the shift size difference, the upsample scales
10
+ # are also modified from the original 256 shift setting.
11
+
12
+ ###########################################################
13
+ # FEATURE EXTRACTION SETTING #
14
+ ###########################################################
15
+ sampling_rate: 24000 # Sampling rate.
16
+ fft_size: 2048 # FFT size.
17
+ hop_size: 300 # Hop size.
18
+ win_length: 1200 # Window length.
19
+ # If set to null, it will be the same as fft_size.
20
+ window: "hann" # Window function.
21
+ num_mels: 80 # Number of mel basis.
22
+ fmin: 80 # Minimum freq in mel basis calculation.
23
+ fmax: 7600 # Maximum frequency in mel basis calculation.
24
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
25
+ trim_silence: false # Whether to trim the start and end of silence.
26
+ trim_threshold_in_db: 20 # Need to tune carefully if the recording is not good.
27
+ trim_frame_size: 1024 # Frame size in trimming.
28
+ trim_hop_size: 256 # Hop size in trimming.
29
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
30
+
31
+ ###########################################################
32
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
33
+ ###########################################################
34
+ generator_type: HiFiGANGenerator
35
+ generator_params:
36
+ in_channels: 80 # Number of input channels.
37
+ out_channels: 1 # Number of output channels.
38
+ channels: 512 # Number of initial channels.
39
+ kernel_size: 7 # Kernel size of initial and final conv layers.
40
+ upsample_scales: [5, 5, 4, 3] # Upsampling scales.
41
+ upsample_kernel_sizes: [10, 10, 8, 6] # Kernel size for upsampling layers.
42
+ resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks.
43
+ resblock_dilations: # Dilations for residual blocks.
44
+ - [1, 3, 5]
45
+ - [1, 3, 5]
46
+ - [1, 3, 5]
47
+ use_additional_convs: true # Whether to use additional conv layer in residual blocks.
48
+ bias: true # Whether to use bias parameter in conv.
49
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation type.
50
+ nonlinear_activation_params: # Nonlinear activation parameters.
51
+ negative_slope: 0.1
52
+ use_weight_norm: true # Whether to apply weight normalization.
53
+
54
+ ###########################################################
55
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
56
+ ###########################################################
57
+ discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
58
+ discriminator_params:
59
+ scales: 3 # Number of multi-scale discriminator.
60
+ scale_downsample_pooling: "AvgPool1d" # Pooling operation for scale discriminator.
61
+ scale_downsample_pooling_params:
62
+ kernel_size: 4 # Pooling kernel size.
63
+ stride: 2 # Pooling stride.
64
+ padding: 2 # Padding size.
65
+ scale_discriminator_params:
66
+ in_channels: 1 # Number of input channels.
67
+ out_channels: 1 # Number of output channels.
68
+ kernel_sizes: [15, 41, 5, 3] # List of kernel sizes.
69
+ channels: 128 # Initial number of channels.
70
+ max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
71
+ max_groups: 16 # Maximum number of groups in downsampling conv layers.
72
+ bias: true
73
+ downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
74
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation.
75
+ nonlinear_activation_params:
76
+ negative_slope: 0.1
77
+ follow_official_norm: true # Whether to follow the official norm setting.
78
+ periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator.
79
+ period_discriminator_params:
80
+ in_channels: 1 # Number of input channels.
81
+ out_channels: 1 # Number of output channels.
82
+ kernel_sizes: [5, 3] # List of kernel sizes.
83
+ channels: 32 # Initial number of channels.
84
+ downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
85
+ max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
86
+ bias: true # Whether to use bias parameter in conv layer.
87
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation.
88
+ nonlinear_activation_params: # Nonlinear activation parameters.
89
+ negative_slope: 0.1
90
+ use_weight_norm: true # Whether to apply weight normalization.
91
+ use_spectral_norm: false # Whether to apply spectral normalization.
92
+
93
+ ###########################################################
94
+ # STFT LOSS SETTING #
95
+ ###########################################################
96
+ use_stft_loss: false # Whether to use multi-resolution STFT loss.
97
+ use_mel_loss: true # Whether to use Mel-spectrogram loss.
98
+ mel_loss_params:
99
+ fs: 24000
100
+ fft_size: 2048
101
+ hop_size: 300
102
+ win_length: 1200
103
+ window: "hann"
104
+ num_mels: 80
105
+ fmin: 0
106
+ fmax: 12000
107
+ log_base: null
108
+ generator_adv_loss_params:
109
+ average_by_discriminators: false # Whether to average loss by #discriminators.
110
+ discriminator_adv_loss_params:
111
+ average_by_discriminators: false # Whether to average loss by #discriminators.
112
+ use_feat_match_loss: true
113
+ feat_match_loss_params:
114
+ average_by_discriminators: false # Whether to average loss by #discriminators.
115
+ average_by_layers: false # Whether to average loss by #layers in each discriminator.
116
+ include_final_outputs: false # Whether to include final outputs in feat match loss calculation.
117
+
118
+ ###########################################################
119
+ # ADVERSARIAL LOSS SETTING #
120
+ ###########################################################
121
+ lambda_aux: 45.0 # Loss balancing coefficient for STFT loss.
122
+ lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss.
123
+ lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss..
124
+
125
+ ###########################################################
126
+ # DATA LOADER SETTING #
127
+ ###########################################################
128
+ batch_size: 16 # Batch size.
129
+ batch_max_steps: 8400 # Length of each audio in batch. Make sure dividable by hop_size.
130
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
131
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
132
+ remove_short_samples: false # Whether to remove samples the length of which are less than batch_max_steps.
133
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
134
+
135
+ ###########################################################
136
+ # OPTIMIZER & SCHEDULER SETTING #
137
+ ###########################################################
138
+ generator_optimizer_type: Adam
139
+ generator_optimizer_params:
140
+ lr: 2.0e-4
141
+ betas: [0.5, 0.9]
142
+ weight_decay: 0.0
143
+ generator_scheduler_type: MultiStepLR
144
+ generator_scheduler_params:
145
+ gamma: 0.5
146
+ milestones:
147
+ - 200000
148
+ - 400000
149
+ - 600000
150
+ - 800000
151
+ generator_grad_norm: -1
152
+ discriminator_optimizer_type: Adam
153
+ discriminator_optimizer_params:
154
+ lr: 2.0e-4
155
+ betas: [0.5, 0.9]
156
+ weight_decay: 0.0
157
+ discriminator_scheduler_type: MultiStepLR
158
+ discriminator_scheduler_params:
159
+ gamma: 0.5
160
+ milestones:
161
+ - 200000
162
+ - 400000
163
+ - 600000
164
+ - 800000
165
+ discriminator_grad_norm: -1
166
+
167
+ ###########################################################
168
+ # INTERVAL SETTING #
169
+ ###########################################################
170
+ generator_train_start_steps: 1 # Number of steps to start to train generator.
171
+ discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
172
+ train_max_steps: 2500000 # Number of training steps.
173
+ save_interval_steps: 10000 # Interval steps to save checkpoint.
174
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
175
+ log_interval_steps: 100 # Interval steps to record the training log.
176
+
177
+ ###########################################################
178
+ # OTHER SETTING #
179
+ ###########################################################
180
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
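A quick sanity check of the upsampling setting above (a sketch): the product of upsample_scales must match the 300-sample hop size used for feature extraction, and each upsample kernel size here is twice its scale.

    python -c "print(5 * 5 * 4 * 3)"   # 300 == hop_size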
ParallelWaveGAN/egs/jsut/voc1/conf/multi_band_melgan.v2.yaml ADDED
@@ -0,0 +1,150 @@
1
+ # This is the hyperparameter configuration file for MelGAN.
2
+ # Please make sure this is adjusted for the JSUT dataset. If you want to
3
+ # apply to the other dataset, you might need to carefully change some parameters.
4
+ # This configuration requires ~ 8GB memory and will finish within 4 days on Titan V.
5
+
6
+ # This configuration is based on full-band MelGAN but the hop size and sampling
7
+ # rate is different from the paper (16kHz vs 24kHz). The number of iterations
8
+ # is not shown in the paper so currently we train 1M iterations (not sure enough
9
+ # to converge). The optimizer setting is based on @dathudeptrai advice.
10
+ # https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906
11
+
12
+ ###########################################################
13
+ # FEATURE EXTRACTION SETTING #
14
+ ###########################################################
15
+ sampling_rate: 24000 # Sampling rate.
16
+ fft_size: 2048 # FFT size.
17
+ hop_size: 300 # Hop size.
18
+ win_length: 1200 # Window length.
19
+ # If set to null, it will be the same as fft_size.
20
+ window: "hann" # Window function.
21
+ num_mels: 80 # Number of mel basis.
22
+ fmin: 80 # Minimum freq in mel basis calculation.
23
+ fmax: 7600 # Maximum frequency in mel basis calculation.
24
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
25
+ trim_silence: false # Whether to trim the start and end of silence.
26
+ trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
27
+ trim_frame_size: 2048 # Frame size in trimming.
28
+ trim_hop_size: 512 # Hop size in trimming.
29
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
30
+
31
+ ###########################################################
32
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
33
+ ###########################################################
34
+ generator_type: "MelGANGenerator" # Generator type.
35
+ generator_params:
36
+ in_channels: 80 # Number of input channels.
37
+ out_channels: 4 # Number of output channels.
38
+ kernel_size: 7 # Kernel size of initial and final conv layers.
39
+ channels: 384 # Initial number of channels for conv layers.
40
+ upsample_scales: [5, 5, 3] # List of Upsampling scales.
41
+ stack_kernel_size: 3 # Kernel size of dilated conv layers in residual stack.
42
+ stacks: 4 # Number of stacks in a single residual stack module.
43
+ use_weight_norm: True # Whether to use weight normalization.
44
+ use_causal_conv: False # Whether to use causal convolution.
45
+
46
+ ###########################################################
47
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
48
+ ###########################################################
49
+ discriminator_type: "MelGANMultiScaleDiscriminator" # Discriminator type.
50
+ discriminator_params:
51
+ in_channels: 1 # Number of input channels.
52
+ out_channels: 1 # Number of output channels.
53
+ scales: 3 # Number of multi-scales.
54
+ downsample_pooling: "AvgPool1d" # Pooling type for the input downsampling.
55
+ downsample_pooling_params: # Parameters of the above pooling function.
56
+ kernel_size: 4
57
+ stride: 2
58
+ padding: 1
59
+ count_include_pad: False
60
+ kernel_sizes: [5, 3] # List of kernel size.
61
+ channels: 16 # Number of channels of the initial conv layer.
62
+ max_downsample_channels: 512 # Maximum number of channels of downsampling layers.
63
+ downsample_scales: [4, 4, 4] # List of downsampling scales.
64
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
65
+ nonlinear_activation_params: # Parameters of nonlinear activation function.
66
+ negative_slope: 0.2
67
+ use_weight_norm: True # Whether to use weight norm.
68
+
69
+ ###########################################################
70
+ # STFT LOSS SETTING #
71
+ ###########################################################
72
+ stft_loss_params:
73
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
74
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
75
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
76
+ window: "hann_window" # Window function for STFT-based loss
77
+ use_subband_stft_loss: true
78
+ subband_stft_loss_params:
79
+ fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
80
+ hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss
81
+ win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
82
+ window: "hann_window" # Window function for STFT-based loss
83
+
84
+ ###########################################################
85
+ # ADVERSARIAL LOSS SETTING #
86
+ ###########################################################
87
+ use_feat_match_loss: false # Whether to use feature matching loss.
88
+ lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss.
89
+
90
+ ###########################################################
91
+ # DATA LOADER SETTING #
92
+ ###########################################################
93
+ batch_size: 64 # Batch size.
94
+ batch_max_steps: 16200 # Length of each audio in batch. Make sure dividable by hop_size.
95
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
96
+ num_workers: 4 # Number of workers in Pytorch DataLoader.
97
+ remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
98
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
99
+
100
+ ###########################################################
101
+ # OPTIMIZER & SCHEDULER SETTING #
102
+ ###########################################################
103
+ generator_optimizer_type: "Adam" # Generator's optimizer type.
104
+ generator_optimizer_params:
105
+ lr: 1.0e-3 # Generator's learning rate.
106
+ eps: 1.0e-7 # Generator's epsilon.
107
+ weight_decay: 0.0 # Generator's weight decay coefficient.
108
+ amsgrad: true
109
+ generator_grad_norm: -1 # Generator's gradient norm.
110
+ generator_scheduler_type: "MultiStepLR" # Generator's scheduler type.
111
+ generator_scheduler_params:
112
+ gamma: 0.5 # Generator's scheduler gamma.
113
+ milestones: # At each milestone, lr will be multiplied by gamma.
114
+ - 100000
115
+ - 200000
116
+ - 300000
117
+ - 400000
118
+ - 500000
119
+ - 600000
120
+ discriminator_optimizer_type: "Adam" # Discriminator's optimizer type.
121
+ discriminator_optimizer_params:
122
+ lr: 1.0e-3 # Discriminator's learning rate.
123
+ eps: 1.0e-7 # Discriminator's epsilon.
124
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
125
+ amsgrad: true
126
+ discriminator_grad_norm: -1 # Discriminator's gradient norm.
127
+ discriminator_scheduler_type: "MultiStepLR" # Discriminator's scheduler type.
128
+ discriminator_scheduler_params:
129
+ gamma: 0.5 # Discriminator's scheduler gamma.
130
+ milestones: # At each milestone, lr will be multiplied by gamma.
131
+ - 100000
132
+ - 200000
133
+ - 300000
134
+ - 400000
135
+ - 500000
136
+ - 600000
137
+
138
+ ###########################################################
139
+ # INTERVAL SETTING #
140
+ ###########################################################
141
+ discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator.
142
+ train_max_steps: 1000000 # Number of training steps.
143
+ save_interval_steps: 50000 # Interval steps to save checkpoint.
144
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
145
+ log_interval_steps: 1000 # Interval steps to record the training log.
146
+
147
+ ###########################################################
148
+ # OTHER SETTING #
149
+ ###########################################################
150
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
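
A quick sanity check of the numbers above, as a shell-arithmetic sketch: the hop size of 300 is taken from the JSUT parallel_wavegan.v1.yaml later in this diff, and the 4-band PQMF setting is assumed from the earlier (unshown) part of this file.

    echo $((5 * 5 * 3 * 4))   # upsample_scales product x assumed 4 PQMF sub-bands = 300 (hop size)
    echo $((16200 % 300))     # prints 0: batch_max_steps is a multiple of the hop size
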
ParallelWaveGAN/egs/jsut/voc1/conf/parallel_wavegan.v1.yaml ADDED
@@ -0,0 +1,122 @@
1
+ # This is the hyperparameter configuration file for Parallel WaveGAN.
2
+ # Please make sure this is adjusted for the JSUT dataset. If you want to
3
+ # apply to the other dataset, you might need to carefully change some parameters.
4
+ # This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN.
5
+
6
+ ###########################################################
7
+ # FEATURE EXTRACTION SETTING #
8
+ ###########################################################
9
+ sampling_rate: 24000 # Sampling rate.
10
+ fft_size: 2048 # FFT size.
11
+ hop_size: 300 # Hop size.
12
+ win_length: 1200 # Window length.
13
+ # If set to null, it will be the same as fft_size.
14
+ window: "hann" # Window function.
15
+ num_mels: 80 # Number of mel basis.
16
+ fmin: 80 # Minimum freq in mel basis calculation.
17
+ fmax: 7600 # Maximum frequency in mel basis calculation.
18
+ global_gain_scale: 1.0 # Will be multiplied to all of the waveforms.
19
+ trim_silence: false # Whether to trim the start and end of silence.
20
+ trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
21
+ trim_frame_size: 2048 # Frame size in trimming.
22
+ trim_hop_size: 512 # Hop size in trimming.
23
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
24
+
25
+ ###########################################################
26
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
27
+ ###########################################################
28
+ generator_params:
29
+ in_channels: 1 # Number of input channels.
30
+ out_channels: 1 # Number of output channels.
31
+ kernel_size: 3 # Kernel size of dilated convolution.
32
+ layers: 30 # Number of residual block layers.
33
+ stacks: 3 # Number of stacks i.e., dilation cycles.
34
+ residual_channels: 64 # Number of channels in residual conv.
35
+ gate_channels: 128 # Number of channels in gated conv.
36
+ skip_channels: 64 # Number of channels in skip conv.
37
+ aux_channels: 80 # Number of channels for auxiliary feature conv.
38
+ # Must be the same as num_mels.
39
+ aux_context_window: 2 # Context window size for auxiliary feature.
40
+ # If set to 2, previous 2 and future 2 frames will be considered.
41
+ dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
42
+ use_weight_norm: true # Whether to use weight norm.
43
+ # If set to true, it will be applied to all of the conv layers.
44
+ upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
45
+ upsample_params: # Upsampling network parameters.
46
+ upsample_scales: [4, 5, 3, 5] # Upsampling scales. Product of these must be the same as hop size.
47
+
48
+ ###########################################################
49
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
50
+ ###########################################################
51
+ discriminator_params:
52
+ in_channels: 1 # Number of input channels.
53
+ out_channels: 1 # Number of output channels.
54
+ kernel_size: 3 # Kernel size of conv layers.
55
+ layers: 10 # Number of conv layers.
56
+ conv_channels: 64 # Number of channels in conv layers.
57
+ bias: true # Whether to use bias parameter in conv.
58
+ use_weight_norm: true # Whether to use weight norm.
59
+ # If set to true, it will be applied to all of the conv layers.
60
+ nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
61
+ nonlinear_activation_params: # Nonlinear function parameters
62
+ negative_slope: 0.2 # Alpha in LeakyReLU.
63
+
64
+ ###########################################################
65
+ # STFT LOSS SETTING #
66
+ ###########################################################
67
+ stft_loss_params:
68
+ fft_sizes: [1024, 2048, 512] # List of FFT sizes for STFT-based loss.
69
+ hop_sizes: [120, 240, 50] # List of hop sizes for STFT-based loss.
70
+ win_lengths: [600, 1200, 240] # List of window lengths for STFT-based loss.
71
+ window: "hann_window" # Window function for STFT-based loss.
72
+
73
+ ###########################################################
74
+ # ADVERSARIAL LOSS SETTING #
75
+ ###########################################################
76
+ lambda_adv: 4.0 # Loss balancing coefficient.
77
+
78
+ ###########################################################
79
+ # DATA LOADER SETTING #
80
+ ###########################################################
81
+ batch_size: 6 # Batch size.
82
+ batch_max_steps: 25500 # Length of each audio in batch. Make sure it is divisible by hop_size.
83
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
84
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
85
+ remove_short_samples: true # Whether to remove samples whose length is less than batch_max_steps.
86
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
87
+
88
+ ###########################################################
89
+ # OPTIMIZER & SCHEDULER SETTING #
90
+ ###########################################################
91
+ generator_optimizer_params:
92
+ lr: 0.0001 # Generator's learning rate.
93
+ eps: 1.0e-6 # Generator's epsilon.
94
+ weight_decay: 0.0 # Generator's weight decay coefficient.
95
+ generator_scheduler_params:
96
+ step_size: 200000 # Generator's scheduler step size.
97
+ gamma: 0.5 # Generator's scheduler gamma.
98
+ # At each step size, lr will be multiplied by this parameter.
99
+ generator_grad_norm: 10 # Generator's gradient norm.
100
+ discriminator_optimizer_params:
101
+ lr: 0.00005 # Discriminator's learning rate.
102
+ eps: 1.0e-6 # Discriminator's epsilon.
103
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
104
+ discriminator_scheduler_params:
105
+ step_size: 200000 # Discriminator's scheduler step size.
106
+ gamma: 0.5 # Discriminator's scheduler gamma.
107
+ # At each step size, lr will be multiplied by this parameter.
108
+ discriminator_grad_norm: 1 # Discriminator's gradient norm.
109
+
110
+ ###########################################################
111
+ # INTERVAL SETTING #
112
+ ###########################################################
113
+ discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
114
+ train_max_steps: 400000 # Number of training steps.
115
+ save_interval_steps: 5000 # Interval steps to save checkpoint.
116
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
117
+ log_interval_steps: 100 # Interval steps to record the training log.
118
+
119
+ ###########################################################
120
+ # OTHER SETTING #
121
+ ###########################################################
122
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
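
The same kind of check applies to this config. A minimal sketch using the yq tool that path.sh below already requires (run from egs/jsut/voc1; the jq filter simply follows the key layout shown above):

    yq -r '.generator_params.upsample_params.upsample_scales[]' conf/parallel_wavegan.v1.yaml \
        | awk 'BEGIN {p = 1} {p *= $1} END {print p}'   # 4*5*3*5 = 300, matches hop_size
    echo $((25500 % 300))                               # prints 0: batch_max_steps is divisible by hop_size
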
ParallelWaveGAN/egs/jsut/voc1/conf/slurm.conf ADDED
@@ -0,0 +1,12 @@
1
+ # Default configuration
2
+ command sbatch --export=PATH --ntasks-per-node=1
3
+ option time=* --time $0
4
+ option mem=* --mem-per-cpu $0
5
+ option mem=0 # Do not add anything to qsub_opts
6
+ option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
7
+ option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts
8
+ default gpu=0
9
+ option gpu=0 -p cpu
10
+ option gpu=* -p gpu --gres=gpu:$0
11
+ # note: the --max-jobs-run option is supported as a special case
12
+ # by slurm.pl and you don't have to handle it in the config file.
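
For orientation only: the recipe's cmd.sh (not shown here) can select slurm.pl, which reads this file and turns the option lines into sbatch flags. A hypothetical illustration of roughly what a GPU job would be submitted with; the exact flag set and order are assumptions about slurm.pl's behavior:

    # rough expansion for "--gpu 1 --num_threads 4 --time 10:00:00"
    echo sbatch --export=PATH --ntasks-per-node=1 --time 10:00:00 \
        --cpus-per-task 4 --ntasks-per-node=1 -p gpu --gres=gpu:1
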
ParallelWaveGAN/egs/jsut/voc1/conf/style_melgan.v1.yaml ADDED
@@ -0,0 +1,147 @@
1
+ # This is the configuration file for JSUT dataset.
2
+ # This configuration is based on StyleMelGAN paper but
3
+ # uses MSE loss instead of hinge loss. I found that a
4
+ # batch size of 8 also works well, so if you
5
+ # want to accelerate the training, you can reduce the
6
+ # batch size (e.g., 8 or 16). The upsampling scales are modified
7
+ # to fit the shift size of 300 points.
8
+
9
+ ###########################################################
10
+ # FEATURE EXTRACTION SETTING #
11
+ ###########################################################
12
+ sampling_rate: 24000 # Sampling rate.
13
+ fft_size: 2048 # FFT size.
14
+ hop_size: 300 # Hop size.
15
+ win_length: 1200 # Window length.
16
+ # If set to null, it will be the same as fft_size.
17
+ window: "hann" # Window function.
18
+ num_mels: 80 # Number of mel basis.
19
+ fmin: 80 # Minimum freq in mel basis calculation.
20
+ fmax: 7600 # Maximum frequency in mel basis calculation.
21
+ global_gain_scale: 1.0 # Will be multiplied to all of the waveforms.
22
+ trim_silence: false # Whether to trim the start and end of silence.
23
+ trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
24
+ trim_frame_size: 1024 # Frame size in trimming.
25
+ trim_hop_size: 256 # Hop size in trimming.
26
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
27
+
28
+ ###########################################################
29
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
30
+ ###########################################################
31
+ generator_type: "StyleMelGANGenerator" # Generator type.
32
+ generator_params:
33
+ in_channels: 128
34
+ aux_channels: 80
35
+ channels: 64
36
+ out_channels: 1
37
+ kernel_size: 9
38
+ dilation: 2
39
+ bias: True
40
+ noise_upsample_scales: [10, 2, 2, 2]
41
+ noise_upsample_activation: "LeakyReLU"
42
+ noise_upsample_activation_params:
43
+ negative_slope: 0.2
44
+ upsample_scales: [5, 1, 5, 1, 3, 1, 2, 2, 1]
45
+ upsample_mode: "nearest"
46
+ gated_function: "softmax"
47
+ use_weight_norm: True
48
+
49
+ ###########################################################
50
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
51
+ ###########################################################
52
+ discriminator_type: "StyleMelGANDiscriminator" # Discriminator type.
53
+ discriminator_params:
54
+ repeats: 4
55
+ window_sizes: [512, 1024, 2048, 4096]
56
+ pqmf_params:
57
+ - [1, None, None, None]
58
+ - [2, 62, 0.26700, 9.0]
59
+ - [4, 62, 0.14200, 9.0]
60
+ - [8, 62, 0.07949, 9.0]
61
+ discriminator_params:
62
+ out_channels: 1
63
+ kernel_sizes: [5, 3]
64
+ channels: 16
65
+ max_downsample_channels: 512
66
+ bias: True
67
+ downsample_scales: [4, 4, 4, 1]
68
+ nonlinear_activation: "LeakyReLU"
69
+ nonlinear_activation_params:
70
+ negative_slope: 0.2
71
+ use_weight_norm: True
72
+
73
+ ###########################################################
74
+ # STFT LOSS SETTING #
75
+ ###########################################################
76
+ stft_loss_params:
77
+ fft_sizes: [1024, 2048, 512] # List of FFT sizes for STFT-based loss.
78
+ hop_sizes: [120, 240, 50] # List of hop sizes for STFT-based loss.
79
+ win_lengths: [600, 1200, 240] # List of window lengths for STFT-based loss.
80
+ window: "hann_window" # Window function for STFT-based loss.
81
+ lambda_aux: 1.0 # Loss balancing coefficient for aux loss.
82
+
83
+ ###########################################################
84
+ # ADVERSARIAL LOSS SETTING #
85
+ ###########################################################
86
+ lambda_adv: 1.0 # Loss balancing coefficient for adv loss.
87
+ generator_adv_loss_params:
88
+ average_by_discriminators: false # Whether to average loss by #discriminators.
89
+ discriminator_adv_loss_params:
90
+ average_by_discriminators: false # Whether to average loss by #discriminators.
91
+
92
+ ###########################################################
93
+ # DATA LOADER SETTING #
94
+ ###########################################################
95
+ batch_size: 32 # Batch size.
96
+ batch_max_steps: 24000 # Length of each audio in batch. Make sure it is divisible by hop_size.
97
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
98
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
99
+ remove_short_samples: false # Whether to remove samples whose length is less than batch_max_steps.
100
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
101
+
102
+ ###########################################################
103
+ # OPTIMIZER & SCHEDULER SETTING #
104
+ ###########################################################
105
+ generator_optimizer_type: Adam
106
+ generator_optimizer_params:
107
+ lr: 1.0e-4
108
+ betas: [0.5, 0.9]
109
+ weight_decay: 0.0
110
+ generator_scheduler_type: MultiStepLR
111
+ generator_scheduler_params:
112
+ gamma: 0.5
113
+ milestones:
114
+ - 100000
115
+ - 300000
116
+ - 500000
117
+ - 700000
118
+ - 900000
119
+ generator_grad_norm: -1
120
+ discriminator_optimizer_type: Adam
121
+ discriminator_optimizer_params:
122
+ lr: 2.0e-4
123
+ betas: [0.5, 0.9]
124
+ weight_decay: 0.0
125
+ discriminator_scheduler_type: MultiStepLR
126
+ discriminator_scheduler_params:
127
+ gamma: 0.5
128
+ milestones:
129
+ - 200000
130
+ - 400000
131
+ - 600000
132
+ - 800000
133
+ discriminator_grad_norm: -1
134
+
135
+ ###########################################################
136
+ # INTERVAL SETTING #
137
+ ###########################################################
138
+ discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
139
+ train_max_steps: 1500000 # Number of training steps.
140
+ save_interval_steps: 50000 # Interval steps to save checkpoint.
141
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
142
+ log_interval_steps: 100 # Interval steps to record the training log.
143
+
144
+ ###########################################################
145
+ # OTHER SETTING #
146
+ ###########################################################
147
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
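
The scale factors here can be checked with the same shell arithmetic (a sketch; the last line only records how the values above multiply out, not a claim about the training code):

    echo $((5 * 1 * 5 * 1 * 3 * 1 * 2 * 2 * 1))   # upsample_scales product = 300 (hop size)
    echo $((10 * 2 * 2 * 2))                      # noise_upsample_scales product = 80
    echo $((80 * 300))                            # 24000, which equals batch_max_steps
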
ParallelWaveGAN/egs/jsut/voc1/local/data_download.sh ADDED
@@ -0,0 +1,39 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ download_dir=$1
7
+
8
+ # check arguments
9
+ if [ $# != 1 ]; then
10
+ echo "Usage: $0 <download_dir>"
11
+ exit 1
12
+ fi
13
+
14
+ set -euo pipefail
15
+
16
+ cwd=$(pwd)
17
+ if [ ! -e "${download_dir}/jsut_ver1.1" ]; then
18
+ mkdir -p "${download_dir}"
19
+ cd "${download_dir}" || exit 1;
20
+ wget http://ss-takashi.sakura.ne.jp/corpus/jsut_ver1.1.zip
21
+ unzip -o ./*.zip
22
+ rm ./*.zip
23
+ cd "${cwd}" || exit 1;
24
+ echo "Successfully downloaded data."
25
+ else
26
+ echo "Already exists. Skipped."
27
+ fi
28
+
29
+ if [ ! -e "${download_dir}/jsut_lab" ]; then
30
+ cd "${download_dir}" || exit 1;
31
+ git clone https://github.com/r9y9/jsut-lab
32
+ for name in loanword128 repeat500 voiceactress100 basic5000 onomatopee300 travel1000 countersuffix26 precedent130 utparaphrase512; do
33
+ cp -vr "jsut-lab/${name}" jsut_ver1.1/
34
+ done
35
+ cd - || exit 1;
36
+ echo "Successfully downloaded context label."
37
+ else
38
+ echo "Already exists. Skipped."
39
+ fi
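
A typical invocation (the download directory name is hypothetical; run.sh in this recipe normally supplies its own):

    local/data_download.sh downloads
    # -> downloads/jsut_ver1.1/... with the jsut-lab context labels copied alongside the audio
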
ParallelWaveGAN/egs/jsut/voc1/local/data_prep.sh ADDED
@@ -0,0 +1,93 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ # shellcheck disable=SC1091
7
+ . ./path.sh || exit 1;
8
+
9
+ fs=24000
10
+ num_dev=250
11
+ num_eval=250
12
+ train_set="train_nodev"
13
+ dev_set="dev"
14
+ eval_set="eval"
15
+ shuffle=false
16
+
17
+ # shellcheck disable=SC1091
18
+ . utils/parse_options.sh || exit 1;
19
+
20
+ db_root=$1
21
+ data_dir=$2
22
+
23
+ # check arguments
24
+ if [ $# != 2 ]; then
25
+ echo "Usage: $0 [Options] <db_root> <data_dir>"
26
+ echo "e.g.: $0 downloads/jsut_ver1.1 data"
27
+ echo ""
28
+ echo "Options:"
29
+ echo " --fs: target sampling rate (default=24000)."
30
+ echo " --num_dev: number of development utterances (default=250)."
31
+ echo " --num_eval: number of evaluation uttreances (default=250)."
32
+ echo " --num_eval: number of evaluation utterances (default=250)."
33
+ echo " --dev_set: name of dev set (default=dev)."
34
+ echo " --eval_set: name of eval set (default=eval)."
35
+ echo " --shuffle: whether to perform shuffle in making dev / eval set (default=false)."
36
+ exit 1
37
+ fi
38
+
39
+ set -euo pipefail
40
+
41
+ [ ! -e "${data_dir}/all" ] && mkdir -p "${data_dir}/all"
42
+
43
+ # set filenames
44
+ scp="${data_dir}/all/wav.scp"
45
+ segments="${data_dir}/all/segments"
46
+
47
+ # check file existence
48
+ [ -e "${scp}" ] && rm "${scp}"
49
+ [ -e "${segments}" ] && rm "${segments}"
50
+
51
+ # make scp
52
+ find "${db_root}" -follow -name "*.wav" | sort | while read -r filename; do
53
+ id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
54
+ echo "${id} cat ${filename} | sox -t wav - -c 1 -b 16 -t wav - rate ${fs} |" >> "${scp}"
55
+ done
56
+
57
+ # make segments
58
+ find "${db_root}" -name "*.lab" -follow | sort | while read -r filename; do
59
+ id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
60
+ start_nsec=$(head -n 1 "${filename}" | cut -d " " -f 2)
61
+ end_nsec=$(tail -n 1 "${filename}" | cut -d " " -f 1)
62
+ start_sec=$(echo "${start_nsec}*0.0000001" | bc | sed "s/^\./0./")
63
+ end_sec=$(echo "${end_nsec}*0.0000001" | bc | sed "s/^\./0./")
64
+ echo "${id} ${id} ${start_sec} ${end_sec}" >> "${segments}"
65
+ done
66
+
67
+ # check
68
+ diff -q <(awk '{print $1}' "${scp}") <(awk '{print $1}' "${segments}") > /dev/null
69
+
70
+ # split
71
+ num_all=$(wc -l < "${scp}")
72
+ num_deveval=$((num_dev + num_eval))
73
+ num_train=$((num_all - num_deveval))
74
+ utils/split_data.sh \
75
+ --num_first "${num_train}" \
76
+ --num_second "${num_deveval}" \
77
+ --shuffle "${shuffle}" \
78
+ "${data_dir}/all" \
79
+ "${data_dir}/${train_set}" \
80
+ "${data_dir}/deveval"
81
+ utils/split_data.sh \
82
+ --num_first "${num_dev}" \
83
+ --num_second "${num_eval}" \
84
+ --shuffle "${shuffle}" \
85
+ "${data_dir}/deveval" \
86
+ "${data_dir}/${dev_set}" \
87
+ "${data_dir}/${eval_set}"
88
+
89
+ # remove tmp directories
90
+ rm -rf "${data_dir}/all"
91
+ rm -rf "${data_dir}/deveval"
92
+
93
+ echo "Successfully prepared data."
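
Example usage, matching the script's own help text (--fs is shown only to make the default explicit):

    local/data_prep.sh --fs 24000 downloads/jsut_ver1.1 data
    # -> data/train_nodev, data/dev, and data/eval, each with wav.scp and segments
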
ParallelWaveGAN/egs/jsut/voc1/path.sh ADDED
@@ -0,0 +1,33 @@
1
+ # cuda related
2
+ export CUDA_HOME=/usr/local/cuda-10.0
3
+ export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
4
+
5
+ # path related
6
+ export PRJ_ROOT="${PWD}/../../.."
7
+ if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
8
+ # shellcheck disable=SC1090
9
+ . "${PRJ_ROOT}/tools/venv/bin/activate"
10
+ fi
11
+
12
+ # python related
13
+ export OMP_NUM_THREADS=1
14
+ export PYTHONIOENCODING=UTF-8
15
+ export MPL_BACKEND=Agg
16
+
17
+ # check installation
18
+ if ! command -v parallel-wavegan-train > /dev/null; then
19
+ echo "Error: It seems setup is not finished." >&2
20
+ echo "Error: Please setup your environment by following README.md" >&2
21
+ return 1
22
+ fi
23
+ if ! command -v jq > /dev/null; then
24
+ echo "Error: It seems jq is not installed." >&2
25
+ echo "Error: Please install via \`sudo apt-get install jq\`." >&2
26
+ echo "Error: If you do not have sudo, please download from https://stedolan.github.io/jq/download/." >&2
27
+ return 1
28
+ fi
29
+ if ! command -v yq > /dev/null; then
30
+ echo "Error: It seems yq is not installed." >&2
31
+ echo "Error: Please install via \`pip install yq\`." >&2
32
+ return 1
33
+ fi
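
Note that the checks above end with return rather than exit, so this file is meant to be sourced from the recipe scripts rather than executed directly; a minimal sketch:

    # from inside egs/jsut/voc1, e.g. at the top of run.sh
    . ./path.sh || exit 1
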