akhaliq3 committed 2b7bf83 ("spaces demo")
Parent(s): d25d456

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
- ParallelWaveGAN/.github/FUNDING.yml +1 -0
- ParallelWaveGAN/.github/workflows/ci.yaml +97 -0
- ParallelWaveGAN/.gitignore +36 -0
- ParallelWaveGAN/LICENSE +21 -0
- ParallelWaveGAN/egs/README.md +165 -0
- ParallelWaveGAN/egs/arctic/voc1/cmd.sh +91 -0
- ParallelWaveGAN/egs/arctic/voc1/conf/parallel_wavegan.v1.yaml +122 -0
- ParallelWaveGAN/egs/arctic/voc1/conf/slurm.conf +12 -0
- ParallelWaveGAN/egs/arctic/voc1/local/data_download.sh +40 -0
- ParallelWaveGAN/egs/arctic/voc1/local/data_prep.sh +113 -0
- ParallelWaveGAN/egs/arctic/voc1/path.sh +33 -0
- ParallelWaveGAN/egs/arctic/voc1/run.sh +167 -0
- ParallelWaveGAN/egs/arctic/voc1/utils +1 -0
- ParallelWaveGAN/egs/csmsc/voc1/cmd.sh +91 -0
- ParallelWaveGAN/egs/csmsc/voc1/conf/hifigan.v1.yaml +180 -0
- ParallelWaveGAN/egs/csmsc/voc1/conf/multi_band_melgan.v2.yaml +150 -0
- ParallelWaveGAN/egs/csmsc/voc1/conf/parallel_wavegan.v1.yaml +122 -0
- ParallelWaveGAN/egs/csmsc/voc1/conf/slurm.conf +12 -0
- ParallelWaveGAN/egs/csmsc/voc1/conf/style_melgan.v1.yaml +147 -0
- ParallelWaveGAN/egs/csmsc/voc1/local/data_download.sh +32 -0
- ParallelWaveGAN/egs/csmsc/voc1/local/data_prep.sh +94 -0
- ParallelWaveGAN/egs/csmsc/voc1/path.sh +33 -0
- ParallelWaveGAN/egs/csmsc/voc1/run.sh +164 -0
- ParallelWaveGAN/egs/csmsc/voc1/utils +1 -0
- ParallelWaveGAN/egs/jnas/voc1/cmd.sh +91 -0
- ParallelWaveGAN/egs/jnas/voc1/conf/parallel_wavegan.v1.long.yaml +123 -0
- ParallelWaveGAN/egs/jnas/voc1/conf/parallel_wavegan.v1.yaml +122 -0
- ParallelWaveGAN/egs/jnas/voc1/conf/slurm.conf +12 -0
- ParallelWaveGAN/egs/jnas/voc1/conf/train_speakers.txt +261 -0
- ParallelWaveGAN/egs/jnas/voc1/local/data_prep.sh +89 -0
- ParallelWaveGAN/egs/jnas/voc1/path.sh +33 -0
- ParallelWaveGAN/egs/jnas/voc1/run.sh +158 -0
- ParallelWaveGAN/egs/jnas/voc1/utils +1 -0
- ParallelWaveGAN/egs/jsss/voc1/cmd.sh +91 -0
- ParallelWaveGAN/egs/jsss/voc1/conf/parallel_wavegan.v1.yaml +122 -0
- ParallelWaveGAN/egs/jsss/voc1/conf/slurm.conf +12 -0
- ParallelWaveGAN/egs/jsss/voc1/local/data_download.sh +41 -0
- ParallelWaveGAN/egs/jsss/voc1/local/data_prep.sh +180 -0
- ParallelWaveGAN/egs/jsss/voc1/path.sh +33 -0
- ParallelWaveGAN/egs/jsss/voc1/run.sh +186 -0
- ParallelWaveGAN/egs/jsss/voc1/utils +1 -0
- ParallelWaveGAN/egs/jsut/voc1/cmd.sh +91 -0
- ParallelWaveGAN/egs/jsut/voc1/conf/hifigan.v1.yaml +180 -0
- ParallelWaveGAN/egs/jsut/voc1/conf/multi_band_melgan.v2.yaml +150 -0
- ParallelWaveGAN/egs/jsut/voc1/conf/parallel_wavegan.v1.yaml +122 -0
- ParallelWaveGAN/egs/jsut/voc1/conf/slurm.conf +12 -0
- ParallelWaveGAN/egs/jsut/voc1/conf/style_melgan.v1.yaml +147 -0
- ParallelWaveGAN/egs/jsut/voc1/local/data_download.sh +39 -0
- ParallelWaveGAN/egs/jsut/voc1/local/data_prep.sh +93 -0
- ParallelWaveGAN/egs/jsut/voc1/path.sh +33 -0
ParallelWaveGAN/.github/FUNDING.yml
ADDED
@@ -0,0 +1 @@
github: kan-bayashi
ParallelWaveGAN/.github/workflows/ci.yaml
ADDED
@@ -0,0 +1,97 @@
name: CI

on:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master
  schedule:
    - cron: 0 0 * * 1

jobs:
  linter_and_test:
    runs-on: ubuntu-20.04
    strategy:
      max-parallel: 5
      matrix:
        python-version: [3.6]
        # 1.6 is failed on cpu: https://github.com/kan-bayashi/ParallelWaveGAN/issues/198
        pytorch-version: [1.4, 1.5.1, 1.7.1, 1.8.1, 1.9]
    steps:
      - uses: actions/checkout@master
      - uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          architecture: 'x64'
      - uses: actions/cache@v2
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-${{ hashFiles('**/setup.py') }}
          restore-keys: |
            ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-
      - name: Install dependencies
        run: |
          sudo apt-get install libsndfile-dev
          # make python env
          cd tools; make CUDA_VERSION="" PYTHON=python${{ matrix.python-version }} PYTORCH_VERSION=${{ matrix.pytorch-version }}
          # install shell check
          wget https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.x86_64.tar.xz
          tar -xvf shellcheck-stable.linux.x86_64.tar.xz
      - name: ShellCheck
        run: |
          export PATH=shellcheck-stable:$PATH
          find egs -name "*.sh" | grep -v path.sh | while read line; do shellcheck -x --shell=bash -P $(dirname $line) ${line}; done
      - name: Black & Flake8
        run: |
          source tools/venv/bin/activate
          black --diff parallel_wavegan
          flake8 parallel_wavegan
          flake8 --extend-ignore=D test
      - name: Pytest
        run: |
          source tools/venv/bin/activate
          pytest test

  integration:
    runs-on: ubuntu-20.04
    strategy:
      max-parallel: 10
      matrix:
        python-version: [3.7]
        pytorch-version: [1.9]
        config:
          - "parallel_wavegan.v1.debug.yaml"
          - "melgan.v1.debug.yaml"
          - "melgan.v3.debug.yaml"
          - "multi_band_melgan.v1.debug.yaml"
          - "parallel_wavegan.v1.debug.npy.yaml"
          - "parallel_wavegan.v1.debug.diff_fs.yaml"
          - "hifigan.v1.debug.yaml"
          - "style_melgan.v1.debug.yaml"
    steps:
      - uses: actions/checkout@master
      - uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          architecture: 'x64'
      - uses: actions/cache@v2
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-${{ hashFiles('**/setup.py') }}
          restore-keys: |
            ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-
      - name: Install dependencies
        run: |
          sudo apt-get install libsndfile-dev jq
          # make python env
          cd tools; make CUDA_VERSION="" PYTHON=python${{ matrix.python-version }} PYTORCH_VERSION=${{ matrix.pytorch-version }}
      - name: Integration
        run: |
          cd egs/yesno/voc1 && ./run.sh --conf conf/${{ matrix.config }}
      - uses: actions/upload-artifact@v1
        if: failure()
        with:
          name: artifacts-${{ matrix.config }}
          path: egs/yesno/voc1
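The checks in this workflow can be reproduced locally before pushing. A minimal sketch, assuming the `tools/venv` environment has already been built with the same `make` invocation used in the "Install dependencies" step and that `shellcheck` is on `PATH`:

```bash
# Activate the project virtualenv built under tools/ (see the CI "Install dependencies" step).
source tools/venv/bin/activate

# Lint the recipe shell scripts, mirroring the ShellCheck step.
find egs -name "*.sh" | grep -v path.sh | while read -r line; do
    shellcheck -x --shell=bash -P "$(dirname "${line}")" "${line}"
done

# Formatting, style checks, and unit tests, mirroring the Black & Flake8 and Pytest steps.
black --diff parallel_wavegan
flake8 parallel_wavegan
flake8 --extend-ignore=D test
pytest test
```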
ParallelWaveGAN/.gitignore
ADDED
@@ -0,0 +1,36 @@
# general
*~
*.pyc
\#*\#
.\#*
*DS_Store
out.txt
parallel_wavegan.egg-info/
doc/_build
slurm-*.out
tmp*
.eggs/
.hypothesis/
.idea
.backup/
.pytest_cache/
__pycache__/
.coverage*
coverage.xml*
.vscode*
.nfs*
.ipynb_checkpoints
.d000*
*.out
*.err

# recipe related
egs/*/*/data
egs/*/*/downloads
egs/*/*/dump
egs/*/*/exp
egs/*/*/conf/tuning

# tools related
tools/venv/
tools/apex/
ParallelWaveGAN/LICENSE
ADDED
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2020 Tomoki Hayashi <[email protected]>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
ParallelWaveGAN/egs/README.md
ADDED
@@ -0,0 +1,165 @@
# Kaldi-style all-in-one recipes

This repository provides [Kaldi](https://github.com/kaldi-asr/kaldi)-style recipes, the same as [ESPnet](https://github.com/espnet/espnet).
Currently, the following recipes are supported.

- [LJSpeech](https://keithito.com/LJ-Speech-Dataset/): English female speaker
- [JSUT](https://sites.google.com/site/shinnosuketakamichi/publication/jsut): Japanese female speaker
- [JSSS](https://sites.google.com/site/shinnosuketakamichi/research-topics/jsss_corpus): Japanese female speaker
- [CSMSC](https://www.data-baker.com/open_source.html): Mandarin female speaker
- [CMU Arctic](http://www.festvox.org/cmu_arctic/): English speakers
- [JNAS](http://research.nii.ac.jp/src/en/JNAS.html): Japanese multi-speaker
- [VCTK](https://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html): English multi-speaker
- [LibriTTS](https://arxiv.org/abs/1904.02882): English multi-speaker
- [YesNo](https://arxiv.org/abs/1904.02882): English speaker (for debugging)


## How to run the recipe

```bash
# Let us move on the recipe directory
$ cd egs/ljspeech/voc1

# Run the recipe from scratch
$ ./run.sh

# You can change config via command line
$ ./run.sh --conf <your_customized_yaml_config>

# You can select the stage to start and stop
$ ./run.sh --stage 2 --stop_stage 2

# If you want to specify the gpu
$ CUDA_VISIBLE_DEVICES=1 ./run.sh --stage 2

# If you want to resume training from 10000 steps checkpoint
$ ./run.sh --stage 2 --resume <path>/<to>/checkpoint-10000steps.pkl
```

You can check the command line options in `run.sh`.

The integration with job schedulers such as [slurm](https://slurm.schedmd.com/documentation.html) can be done via `cmd.sh` and `conf/slurm.conf`.
If you want to use it, please check [this page](https://kaldi-asr.org/doc/queue.html).

All of the hyperparameters are written in a single yaml format configuration file.
Please check [this example](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/ljspeech/voc1/conf/parallel_wavegan.v1.yaml) in the ljspeech recipe.

You can monitor the training progress via tensorboard.

```bash
$ tensorboard --logdir exp
```

![](https://user-images.githubusercontent.com/22779813/68100080-58bbc500-ff09-11e9-9945-c835186fd7c2.png)

If you want to accelerate the training, you can try distributed multi-gpu training based on apex.
You need to install apex for distributed training. Please make sure you have already installed it.
Then you can run distributed multi-gpu training via the following command:

```bash
# in the case of the number of gpus = 8
$ CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" ./run.sh --stage 2 --n_gpus 8
```

In the case of distributed training, the batch size will be automatically multiplied by the number of gpus.
Please be careful.

## How to make the recipe for your own dataset

Here, I will show how to make the recipe for your own dataset.

1. Setup your dataset to be the following structure.

    ```bash
    # For single-speaker case
    $ tree /path/to/database
    /path/to/database
    ├── utt_1.wav
    ├── utt_2.wav
    │   ...
    └── utt_N.wav
    # The directory can be nested, but each filename must be unique

    # For multi-speaker case
    $ tree /path/to/database
    /path/to/database
    ├── spk_1
    │   ├── utt1.wav
    ├── spk_2
    │   ├── utt1.wav
    │   ...
    └── spk_N
        ├── utt1.wav
        ...
    # The directory under each speaker can be nested, but each filename in each speaker directory must be unique
    ```

2. Copy the template directory.

    ```bash
    cd egs

    # For single speaker case
    cp -r template_single_spk <your_dataset_name>

    # For multi speaker case
    cp -r template_multi_spk <your_dataset_name>

    # Move on your recipe
    cd egs/<your_dataset_name>/voc1
    ```

3. Modify the options in `run.sh`.
   What you need to change at least in `run.sh` is as follows:
   - `db_root`: Root path of the database.
   - `num_dev`: The number of utterances for the development set.
   - `num_eval`: The number of utterances for the evaluation set.

4. Modify the hyperparameters in `conf/parallel_wavegan.v1.yaml`.
   What you need to change at least in the config is as follows:
   - `sampling_rate`: If you specify a lower sampling rate, the audio will be downsampled by sox.

5. (Optional) Change the command backend in `cmd.sh`.
   If you are not familiar with kaldi and run in your local env, you do not need to change it.
   See more info on https://kaldi-asr.org/doc/queue.html.

6. Run your recipe.

    ```bash
    # Run all stages from the first stage
    ./run.sh

    # If you want to specify the CUDA device
    CUDA_VISIBLE_DEVICES=0 ./run.sh
    ```

If you want to try the other advanced models, please check the config files in `egs/ljspeech/voc1/conf`.

## Run training using an ESPnet2-TTS recipe within 5 minutes

Make sure you have already finished the espnet2-tts recipe experiments (at least started the training).

```bash
cd egs

# Please use the single spk template for both single and multi spk cases
cp -r template_single_spk <recipe_name>

# Move on your recipe
cd egs/<recipe_name>/voc1

# Make symlinks of the data directories (better to use absolute paths)
mkdir dump data
ln -s /path/to/espnet/egs2/<recipe_name>/tts1/dump/raw dump/
ln -s /path/to/espnet/egs2/<recipe_name>/tts1/dump/raw/tr_no_dev data/train_nodev
ln -s /path/to/espnet/egs2/<recipe_name>/tts1/dump/raw/dev data/dev
ln -s /path/to/espnet/egs2/<recipe_name>/tts1/dump/raw/eval1 data/eval

# Edit the config to match the TTS model setting
vim conf/parallel_wavegan.v1.yaml

# Run from stage 1
./run.sh --stage 1 --conf conf/parallel_wavegan.v1.yaml
```

That's it!
ParallelWaveGAN/egs/arctic/voc1/cmd.sh
ADDED
@@ -0,0 +1,91 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
#     run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
#   --time <time>: Limit the maximum time to execute.
#   --mem <mem>: Limit the maximum memory usage.
#   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
#   --num-threads <ngpu>: Specify the number of CPU cores.
#   --gpu <ngpu>: Specify the number of GPU devices.
#   --config: Change the configuration file from the default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The left string of "=", i.e. "JOB", is replaced by <N> (the Nth job) in the command and the log file name,
# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for the 8th job respectively.
# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
#
# run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface, not depending on the backend.
# These options are mapped to specific options for each backend and
# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs fail, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
#   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================


# Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
cmd_backend="local"

# Local machine, without any job scheduling system
if [ "${cmd_backend}" = local ]; then

    # The other usage
    export train_cmd="utils/run.pl"
    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
    export cuda_cmd="utils/run.pl"
    # Used for "*_recog.py"
    export decode_cmd="utils/run.pl"

# Local machine, without any job scheduling system
elif [ "${cmd_backend}" = stdout ]; then

    # The other usage
    export train_cmd="utils/stdout.pl"
    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
    export cuda_cmd="utils/stdout.pl"
    # Used for "*_recog.py"
    export decode_cmd="utils/stdout.pl"

# "qsub" (SGE, Torque, PBS, etc.)
elif [ "${cmd_backend}" = sge ]; then
    # The default setting is written in conf/queue.conf.
    # You must change "-q g.q" to the "queue" for your environment.
    # To know the "queue" names, type "qhost -q"
    # Note that to use "--gpu *", you have to set up "complex_value" for the system scheduler.

    export train_cmd="utils/queue.pl"
    export cuda_cmd="utils/queue.pl"
    export decode_cmd="utils/queue.pl"

# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
    # The default setting is written in conf/slurm.conf.
    # You must change "-p cpu" and "-p gpu" to the "partition" for your environment.
    # To know the "partition" names, type "sinfo".
    # You can use "--gpu *" by default for slurm and it is interpreted as "--gres gpu:*"
    # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

    export train_cmd="utils/slurm.pl"
    export cuda_cmd="utils/slurm.pl"
    export decode_cmd="utils/slurm.pl"

elif [ "${cmd_backend}" = ssh ]; then
    # You have to create ".queue/machines" to specify the hosts to execute jobs on.
    # e.g. .queue/machines
    #   host1
    #   host2
    #   host3
    # Assuming you can log in to them without any password, i.e. you have to set up ssh keys.

    export train_cmd="utils/ssh.pl"
    export cuda_cmd="utils/ssh.pl"
    export decode_cmd="utils/ssh.pl"

else
    echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
    return 1
fi
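As a concrete illustration of the unified interface documented in the header comment above (the log path is chosen only for the example):

```bash
# With cmd_backend="local", ${train_cmd} points at utils/run.pl, so an array job such as
utils/run.pl --mem 4G JOB=1:10 exp/log/echo.JOB.log echo JOB
# launches 10 local processes; for the 3rd job the command becomes `echo 3` and its output
# is written to exp/log/echo.3.log. Switching cmd_backend to "slurm" keeps this call
# unchanged and submits it through sbatch using the mapping in conf/slurm.conf.
```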
ParallelWaveGAN/egs/arctic/voc1/conf/parallel_wavegan.v1.yaml
ADDED
@@ -0,0 +1,122 @@
# This is the hyperparameter configuration file for Parallel WaveGAN.
# Please make sure this is adjusted for the Arctic dataset. If you want to
# apply it to another dataset, you might need to carefully change some parameters.
# This configuration requires 12 GB GPU memory and takes ~3 days on TITAN V.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
sampling_rate: 16000     # Sampling rate.
fft_size: 1024           # FFT size.
hop_size: 256            # Hop size.
win_length: null         # Window length.
                         # If set to null, it will be the same as fft_size.
window: "hann"           # Window function.
num_mels: 80             # Number of mel basis.
fmin: 80                 # Minimum freq in mel basis calculation.
fmax: 7600               # Maximum frequency in mel basis calculation.
global_gain_scale: 1.0   # Will be multiplied to all of waveform.
trim_silence: false      # Whether to trim the start and end of silence.
trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
trim_frame_size: 2048    # Frame size in trimming.
trim_hop_size: 512       # Hop size in trimming.
format: "hdf5"           # Feature file format. "npy" or "hdf5" is supported.

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_params:
    in_channels: 1        # Number of input channels.
    out_channels: 1       # Number of output channels.
    kernel_size: 3        # Kernel size of dilated convolution.
    layers: 30            # Number of residual block layers.
    stacks: 3             # Number of stacks i.e., dilation cycles.
    residual_channels: 64 # Number of channels in residual conv.
    gate_channels: 128    # Number of channels in gated conv.
    skip_channels: 64     # Number of channels in skip conv.
    aux_channels: 80      # Number of channels for auxiliary feature conv.
                          # Must be the same as num_mels.
    aux_context_window: 2 # Context window size for auxiliary feature.
                          # If set to 2, previous 2 and future 2 frames will be considered.
    dropout: 0.0          # Dropout rate. 0.0 means no dropout applied.
    use_weight_norm: true # Whether to use weight norm.
                          # If set to true, it will be applied to all of the conv layers.
    upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
    upsample_params:                      # Upsampling network parameters.
        upsample_scales: [4, 4, 4, 4]     # Upsampling scales. Product of these must be the same as hop size.

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_params:
    in_channels: 1        # Number of input channels.
    out_channels: 1       # Number of output channels.
    kernel_size: 3        # Kernel size of conv layers.
    layers: 10            # Number of conv layers.
    conv_channels: 64     # Number of channels in conv layers.
    bias: true            # Whether to use bias parameter in conv.
    use_weight_norm: true # Whether to use weight norm.
                          # If set to true, it will be applied to all of the conv layers.
    nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
    nonlinear_activation_params:      # Nonlinear function parameters
        negative_slope: 0.2           # Alpha in LeakyReLU.

###########################################################
#                   STFT LOSS SETTING                     #
###########################################################
stft_loss_params:
    fft_sizes: [1024, 2048, 512]  # List of FFT size for STFT-based loss.
    hop_sizes: [120, 240, 50]     # List of hop size for STFT-based loss
    win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
    window: "hann_window"         # Window function for STFT-based loss

###########################################################
#               ADVERSARIAL LOSS SETTING                  #
###########################################################
lambda_adv: 4.0  # Loss balancing coefficient.

###########################################################
#                  DATA LOADER SETTING                    #
###########################################################
batch_size: 10             # Batch size.
batch_max_steps: 15360     # Length of each audio in batch. Make sure dividable by hop_size.
pin_memory: true           # Whether to pin memory in Pytorch DataLoader.
num_workers: 2             # Number of workers in Pytorch DataLoader.
remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
allow_cache: true          # Whether to allow cache in dataset. If true, it requires cpu memory.

###########################################################
#             OPTIMIZER & SCHEDULER SETTING               #
###########################################################
generator_optimizer_params:
    lr: 0.0001             # Generator's learning rate.
    eps: 1.0e-6            # Generator's epsilon.
    weight_decay: 0.0      # Generator's weight decay coefficient.
generator_scheduler_params:
    step_size: 200000      # Generator's scheduler step size.
    gamma: 0.5             # Generator's scheduler gamma.
                           # At each step size, lr will be multiplied by this parameter.
generator_grad_norm: 10    # Generator's gradient norm.
discriminator_optimizer_params:
    lr: 0.00005            # Discriminator's learning rate.
    eps: 1.0e-6            # Discriminator's epsilon.
    weight_decay: 0.0      # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
    step_size: 200000      # Discriminator's scheduler step size.
    gamma: 0.5             # Discriminator's scheduler gamma.
                           # At each step size, lr will be multiplied by this parameter.
discriminator_grad_norm: 1 # Discriminator's gradient norm.

###########################################################
#                    INTERVAL SETTING                     #
###########################################################
discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
train_max_steps: 400000                 # Number of training steps.
save_interval_steps: 5000               # Interval steps to save checkpoint.
eval_interval_steps: 1000               # Interval steps to evaluate the network.
log_interval_steps: 100                 # Interval steps to record the training log.

###########################################################
#                     OTHER SETTING                       #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
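Two constraints stated in the comments above are worth checking whenever this file is edited: the product of `upsample_scales` must equal `hop_size` (4 * 4 * 4 * 4 = 256 here), and `batch_max_steps` must be divisible by `hop_size` (15360 / 256 = 60). A small sketch of the first check using the `yq` wrapper that `path.sh` already requires (it passes jq filters through):

```bash
# Verify that the upsampling factors multiply out to the feature hop size.
conf=conf/parallel_wavegan.v1.yaml
hop=$(yq '.hop_size' "${conf}")
prod=$(yq '.generator_params.upsample_params.upsample_scales | reduce .[] as $s (1; . * $s)' "${conf}")
[ "${hop}" -eq "${prod}" ] && echo "OK: hop_size=${hop}" || echo "Mismatch: hop_size=${hop} product=${prod}"
```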
ParallelWaveGAN/egs/arctic/voc1/conf/slurm.conf
ADDED
@@ -0,0 +1,12 @@
# Default configuration
command sbatch --export=PATH --ntasks-per-node=1
option time=* --time $0
option mem=* --mem-per-cpu $0
option mem=0   # Do not add anything to qsub_opts
option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1  # Do not add anything to qsub_opts
default gpu=0
option gpu=0 -p cpu
option gpu=* -p gpu --gres=gpu:$0
# note: the --max-jobs-run option is supported as a special case
# by slurm.pl and you don't have to handle it in the config file.
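For orientation, a sketch of how this mapping is applied when the slurm backend is selected; the exact sbatch command that `slurm.pl` assembles may include additional bookkeeping options:

```bash
# With cmd_backend="slurm" in cmd.sh, a recipe call such as
#   ${cuda_cmd} --gpu 1 --mem 16G exp/train.log parallel-wavegan-train --config conf/parallel_wavegan.v1.yaml
# is translated via the option lines above into roughly
#   sbatch --export=PATH --ntasks-per-node=1 -p gpu --gres=gpu:1 --mem-per-cpu 16G ...
# while "--gpu 0" (the default) selects the cpu partition ("-p cpu") instead.
```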
ParallelWaveGAN/egs/arctic/voc1/local/data_download.sh
ADDED
@@ -0,0 +1,40 @@
#!/bin/bash

# Copyright 2019 Tomoki Hayashi
#  MIT License (https://opensource.org/licenses/MIT)

download_dir=$1
spk=$2

available_spks=(
    "slt" "clb" "bdl" "rms" "jmk" "awb" "ksp"
)

# check arguments
if [ $# != 2 ]; then
    echo "Usage: $0 <download_dir> <spk>"
    echo "Available speakers: ${available_spks[*]}"
    exit 1
fi

set -euo pipefail

# check speakers
if ! echo "${available_spks[*]}" | grep -q "${spk}"; then
    echo "Specified spk (${spk}) is not available or not supported." >&2
    exit 1
fi

# download dataset
cwd=$(pwd)
if [ ! -e "${download_dir}/cmu_us_${spk}_arctic" ]; then
    mkdir -p "${download_dir}"
    cd "${download_dir}"
    wget "http://festvox.org/cmu_arctic/cmu_arctic/packed/cmu_us_${spk}_arctic-0.95-release.tar.bz2"
    tar xf "cmu_us_${spk}_arctic-0.95-release.tar.bz2"
    rm "cmu_us_${spk}_arctic-0.95-release.tar.bz2"
    cd "${cwd}"
    echo "Successfully finished download."
else
    echo "Already exists. Skip download."
fi
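A typical invocation, matching what `run.sh` does in stage -1 with its default settings:

```bash
# Fetch and unpack the CMU Arctic "slt" speaker into downloads/.
local/data_download.sh downloads slt
```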
ParallelWaveGAN/egs/arctic/voc1/local/data_prep.sh
ADDED
@@ -0,0 +1,113 @@
#!/bin/bash

# Copyright 2019 Tomoki Hayashi
#  MIT License (https://opensource.org/licenses/MIT)

# shellcheck disable=SC1091
. ./path.sh || exit 1;

num_dev=100
num_eval=100
train_set="train_nodev"
dev_set="dev"
eval_set="eval"
shuffle=false

# shellcheck disable=SC1091
. utils/parse_options.sh || exit 1;

db_root=$1
spk=$2
data_dir=$3

# check arguments
if [ $# != 3 ]; then
    echo "Usage: $0 <db_root> <spk> <data_dir>"
    echo "e.g.: $0 downloads/cmu_us_slt_arctic slt data"
    echo ""
    echo "Options:"
    echo "    --num_dev: number of development utterances (default=100)."
    echo "    --num_eval: number of evaluation utterances (default=100)."
    echo "    --train_set: name of train set (default=train_nodev)."
    echo "    --dev_set: name of dev set (default=dev)."
    echo "    --eval_set: name of eval set (default=eval)."
    echo "    --shuffle: whether to perform shuffle in making dev / eval set (default=false)."
    exit 1
fi

set -euo pipefail

# check speaker
available_spks=(
    "slt" "clb" "bdl" "rms" "jmk" "awb" "ksp"
)
if ! echo "${available_spks[*]}" | grep -q "${spk}"; then
    echo "Specified speaker ${spk} is not available."
    echo "Available speakers: ${available_spks[*]}"
    exit 1
fi

[ ! -e "${data_dir}/all" ] && mkdir -p "${data_dir}/all"

# set filenames
scp="${data_dir}/all/wav.scp"
segments="${data_dir}/all/segments"

# check file existence
[ -e "${scp}" ] && rm "${scp}"
[ -e "${segments}" ] && rm "${segments}"

# make scp
find "${db_root}" -name "*.wav" -follow | sort | while read -r filename; do
    id="${spk}_$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")"
    echo "${id} ${filename}" >> "${scp}"
done

# make segments
find "${db_root}/lab" -name "*.lab" -follow | sort | while read -r filename; do
    # get start time
    while read -r line; do
        phn=$(echo "${line}" | cut -d " " -f 3)
        if [ "${phn}" != "pau" ]; then
            break
        fi
        start=$(echo "${line}" | cut -d " " -f 1)
    done < <(tail -n +2 "$filename")
    # get end time
    while read -r line; do
        end=$(echo "${line}" | cut -d " " -f 1)
        phn=$(echo "${line}" | cut -d " " -f 3)
        if [ "${phn}" != "pau" ]; then
            break
        fi
    done < <(tail -n +2 "$filename" | tac)
    echo "${spk}_$(basename "${filename}" .lab) ${spk}_$(basename "${filename}" .lab) ${start} ${end}" >> "${segments}"
done

# check
diff -q <(awk '{print $1}' "${scp}") <(awk '{print $1}' "${segments}") > /dev/null

# split
num_all=$(wc -l < "${scp}")
num_deveval=$((num_dev + num_eval))
num_train=$((num_all - num_deveval))
utils/split_data.sh \
    --num_first "${num_train}" \
    --num_second "${num_deveval}" \
    --shuffle "${shuffle}" \
    "${data_dir}/all" \
    "${data_dir}/${train_set}" \
    "${data_dir}/deveval"
utils/split_data.sh \
    --num_first "${num_dev}" \
    --num_second "${num_eval}" \
    --shuffle "${shuffle}" \
    "${data_dir}/deveval" \
    "${data_dir}/${dev_set}" \
    "${data_dir}/${eval_set}"

# remove tmp directories
rm -rf "${data_dir}/all"
rm -rf "${data_dir}/deveval"

echo "Successfully prepared data."
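For reference, the entries this script produces look roughly as follows; utterance IDs follow the `<spk>_<basename>` rule above, while the exact paths and times shown here are only illustrative:

```bash
# data/<set>/wav.scp : "<utt_id> <wav_path>"
#   slt_arctic_a0001 downloads/cmu_us_slt_arctic/wav/arctic_a0001.wav
#
# data/<set>/segments : "<utt_id> <recording_id> <start> <end>", with leading/trailing "pau" trimmed via the .lab files
#   slt_arctic_a0001 slt_arctic_a0001 0.65 3.09
```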
ParallelWaveGAN/egs/arctic/voc1/path.sh
ADDED
@@ -0,0 +1,33 @@
# cuda related
export CUDA_HOME=/usr/local/cuda-10.0
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"

# path related
export PRJ_ROOT="${PWD}/../../.."
if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
    # shellcheck disable=SC1090
    . "${PRJ_ROOT}/tools/venv/bin/activate"
fi

# python related
export OMP_NUM_THREADS=1
export PYTHONIOENCODING=UTF-8
export MPL_BACKEND=Agg

# check installation
if ! command -v parallel-wavegan-train > /dev/null; then
    echo "Error: It seems setup is not finished." >&2
    echo "Error: Please setup your environment by following README.md" >&2
    return 1
fi
if ! command -v jq > /dev/null; then
    echo "Error: It seems jq is not installed." >&2
    echo "Error: Please install via \`sudo apt-get install jq\`." >&2
    echo "Error: If you do not have sudo, please download from https://stedolan.github.io/jq/download/." >&2
    return 1
fi
if ! command -v yq > /dev/null; then
    echo "Error: It seems yq is not installed." >&2
    echo "Error: Please install via \`pip install yq\`." >&2
    return 1
fi
ParallelWaveGAN/egs/arctic/voc1/run.sh
ADDED
@@ -0,0 +1,167 @@
#!/bin/bash

# Copyright 2019 Tomoki Hayashi
#  MIT License (https://opensource.org/licenses/MIT)

. ./cmd.sh || exit 1;
. ./path.sh || exit 1;

# basic settings
stage=-1       # stage to start
stop_stage=100 # stage to stop
verbose=1      # verbosity level (lower is less info)
n_gpus=1       # number of gpus in training
n_jobs=16      # number of parallel jobs in feature extraction

# NOTE(kan-bayashi): renamed to conf to avoid conflict in parse_options.sh
conf=conf/parallel_wavegan.v1.yaml

# directory path setting
download_dir=downloads # directory to save downloaded files
dumpdir=dump           # directory to dump features

# target speaker setting
spk=slt # you can select from slt, clb, bdl, rms, awb, jmk, ksp

# training related setting
tag=""     # tag for directory to save model
resume=""  # checkpoint path to resume training
           # (e.g. <path>/<to>/checkpoint-10000steps.pkl)

# decoding related setting
checkpoint="" # checkpoint path to be used for decoding
              # if not provided, the latest one will be used
              # (e.g. <path>/<to>/checkpoint-400000steps.pkl)

# shellcheck disable=SC1091
. utils/parse_options.sh || exit 1;

train_set="train_nodev_${spk}" # name of training data directory
dev_set="dev_${spk}"           # name of development data directory
eval_set="eval_${spk}"         # name of evaluation data directory

set -euo pipefail

if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
    echo "Stage -1: Data download"
    local/data_download.sh "${download_dir}" "${spk}"
fi

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    echo "Stage 0: Data preparation"
    local/data_prep.sh \
        --train_set "${train_set}" \
        --dev_set "${dev_set}" \
        --eval_set "${eval_set}" \
        "${download_dir}/cmu_us_${spk}_arctic" "${spk}" data
fi

stats_ext=$(grep -q "hdf5" <(yq ".format" "${conf}") && echo "h5" || echo "npy")
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "Stage 1: Feature extraction"
    # extract raw features
    pids=()
    for name in "${train_set}" "${dev_set}" "${eval_set}"; do
    (
        [ ! -e "${dumpdir}/${name}/raw" ] && mkdir -p "${dumpdir}/${name}/raw"
        echo "Feature extraction start. See the progress via ${dumpdir}/${name}/raw/preprocessing.*.log."
        utils/make_subset_data.sh "data/${name}" "${n_jobs}" "${dumpdir}/${name}/raw"
        ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/raw/preprocessing.JOB.log" \
            parallel-wavegan-preprocess \
                --config "${conf}" \
                --scp "${dumpdir}/${name}/raw/wav.JOB.scp" \
                --segments "${dumpdir}/${name}/raw/segments.JOB" \
                --dumpdir "${dumpdir}/${name}/raw/dump.JOB" \
                --verbose "${verbose}"
        echo "Successfully finished feature extraction of ${name} set."
    ) &
    pids+=($!)
    done
    i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
    [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
    echo "Successfully finished feature extraction."

    # calculate statistics for normalization
    echo "Statistics computation start. See the progress via ${dumpdir}/${train_set}/compute_statistics.log."
    ${train_cmd} "${dumpdir}/${train_set}/compute_statistics.log" \
        parallel-wavegan-compute-statistics \
            --config "${conf}" \
            --rootdir "${dumpdir}/${train_set}/raw" \
            --dumpdir "${dumpdir}/${train_set}" \
            --verbose "${verbose}"
    echo "Successfully finished calculation of statistics."

    # normalize and dump them
    pids=()
    for name in "${train_set}" "${dev_set}" "${eval_set}"; do
    (
        [ ! -e "${dumpdir}/${name}/norm" ] && mkdir -p "${dumpdir}/${name}/norm"
        echo "Normalization start. See the progress via ${dumpdir}/${name}/norm/normalize.*.log."
        ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/norm/normalize.JOB.log" \
            parallel-wavegan-normalize \
                --config "${conf}" \
                --stats "${dumpdir}/${train_set}/stats.${stats_ext}" \
                --rootdir "${dumpdir}/${name}/raw/dump.JOB" \
                --dumpdir "${dumpdir}/${name}/norm/dump.JOB" \
                --verbose "${verbose}"
        echo "Successfully finished normalization of ${name} set."
    ) &
    pids+=($!)
    done
    i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
    [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
    echo "Successfully finished normalization."
fi

if [ -z "${tag}" ]; then
    expdir="exp/${train_set}_arctic_$(basename "${conf}" .yaml)"
else
    expdir="exp/${train_set}_arctic_${tag}"
fi
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    echo "Stage 2: Network training"
    [ ! -e "${expdir}" ] && mkdir -p "${expdir}"
    cp "${dumpdir}/${train_set}/stats.${stats_ext}" "${expdir}"
    if [ "${n_gpus}" -gt 1 ]; then
        train="python -m parallel_wavegan.distributed.launch --nproc_per_node ${n_gpus} -c parallel-wavegan-train"
    else
        train="parallel-wavegan-train"
    fi
    echo "Training start. See the progress via ${expdir}/train.log."
    ${cuda_cmd} --gpu "${n_gpus}" "${expdir}/train.log" \
        ${train} \
            --config "${conf}" \
            --train-dumpdir "${dumpdir}/${train_set}/norm" \
            --dev-dumpdir "${dumpdir}/${dev_set}/norm" \
            --outdir "${expdir}" \
            --resume "${resume}" \
            --verbose "${verbose}"
    echo "Successfully finished training."
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    echo "Stage 3: Network decoding"
    # shellcheck disable=SC2012
    [ -z "${checkpoint}" ] && checkpoint="$(ls -dt "${expdir}"/*.pkl | head -1 || true)"
    outdir="${expdir}/wav/$(basename "${checkpoint}" .pkl)"
    pids=()
    for name in "${dev_set}" "${eval_set}"; do
    (
        [ ! -e "${outdir}/${name}" ] && mkdir -p "${outdir}/${name}"
        [ "${n_gpus}" -gt 1 ] && n_gpus=1
        echo "Decoding start. See the progress via ${outdir}/${name}/decode.log."
        ${cuda_cmd} --gpu "${n_gpus}" "${outdir}/${name}/decode.log" \
            parallel-wavegan-decode \
                --dumpdir "${dumpdir}/${name}/norm" \
                --checkpoint "${checkpoint}" \
                --outdir "${outdir}/${name}" \
                --verbose "${verbose}"
        echo "Successfully finished decoding of ${name} set."
    ) &
    pids+=($!)
    done
    i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
    [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
    echo "Successfully finished decoding."
fi
echo "Finished."
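Typical partial invocations of this script, with example checkpoint paths derived from the default `expdir` naming above (`exp/<train_set>_arctic_<config-name>`):

```bash
# Decode only, using an explicit checkpoint (path is an example).
./run.sh --stage 3 --stop_stage 3 \
    --checkpoint exp/train_nodev_slt_arctic_parallel_wavegan.v1/checkpoint-400000steps.pkl

# Resume interrupted training from a saved checkpoint.
./run.sh --stage 2 --resume exp/train_nodev_slt_arctic_parallel_wavegan.v1/checkpoint-10000steps.pkl
```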
ParallelWaveGAN/egs/arctic/voc1/utils
ADDED
@@ -0,0 +1 @@
../../../utils
ParallelWaveGAN/egs/csmsc/voc1/cmd.sh
ADDED
@@ -0,0 +1,91 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
#     run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
#   --time <time>: Limit the maximum time to execute.
#   --mem <mem>: Limit the maximum memory usage.
#   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
#   --num-threads <ngpu>: Specify the number of CPU cores.
#   --gpu <ngpu>: Specify the number of GPU devices.
#   --config: Change the configuration file from the default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The left string of "=", i.e. "JOB", is replaced by <N> (the Nth job) in the command and the log file name,
# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for the 8th job respectively.
# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
#
# run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface, not depending on the backend.
# These options are mapped to specific options for each backend and
# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs fail, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
#   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================


# Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
cmd_backend="local"

# Local machine, without any job scheduling system
if [ "${cmd_backend}" = local ]; then

    # The other usage
    export train_cmd="utils/run.pl"
    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
    export cuda_cmd="utils/run.pl"
    # Used for "*_recog.py"
    export decode_cmd="utils/run.pl"

# Local machine, without any job scheduling system
elif [ "${cmd_backend}" = stdout ]; then

    # The other usage
    export train_cmd="utils/stdout.pl"
    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
    export cuda_cmd="utils/stdout.pl"
    # Used for "*_recog.py"
    export decode_cmd="utils/stdout.pl"

# "qsub" (SGE, Torque, PBS, etc.)
elif [ "${cmd_backend}" = sge ]; then
    # The default setting is written in conf/queue.conf.
    # You must change "-q g.q" to the "queue" for your environment.
    # To know the "queue" names, type "qhost -q"
    # Note that to use "--gpu *", you have to set up "complex_value" for the system scheduler.

    export train_cmd="utils/queue.pl"
    export cuda_cmd="utils/queue.pl"
    export decode_cmd="utils/queue.pl"

# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
    # The default setting is written in conf/slurm.conf.
    # You must change "-p cpu" and "-p gpu" to the "partition" for your environment.
    # To know the "partition" names, type "sinfo".
    # You can use "--gpu *" by default for slurm and it is interpreted as "--gres gpu:*"
    # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

    export train_cmd="utils/slurm.pl"
    export cuda_cmd="utils/slurm.pl"
    export decode_cmd="utils/slurm.pl"

elif [ "${cmd_backend}" = ssh ]; then
    # You have to create ".queue/machines" to specify the hosts to execute jobs on.
    # e.g. .queue/machines
    #   host1
    #   host2
    #   host3
    # Assuming you can log in to them without any password, i.e. you have to set up ssh keys.

    export train_cmd="utils/ssh.pl"
    export cuda_cmd="utils/ssh.pl"
    export decode_cmd="utils/ssh.pl"

else
    echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
    return 1
fi
ParallelWaveGAN/egs/csmsc/voc1/conf/hifigan.v1.yaml
ADDED
@@ -0,0 +1,180 @@
# This is the configuration file for CSMSC dataset.
# This configuration is based on HiFiGAN V1, which is
# an official configuration. But I found that the optimizer
# setting does not work well with my implementation.
# So I changed optimizer settings as follows:
#   - AdamW -> Adam
#   - betas: [0.8, 0.99] -> betas: [0.5, 0.9]
#   - Scheduler: ExponentialLR -> MultiStepLR
# To match the shift size difference, the upsample scales
# are also modified from the original 256 shift setting.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
sampling_rate: 24000     # Sampling rate.
fft_size: 2048           # FFT size.
hop_size: 300            # Hop size.
win_length: 1200         # Window length.
                         # If set to null, it will be the same as fft_size.
window: "hann"           # Window function.
num_mels: 80             # Number of mel basis.
fmin: 80                 # Minimum freq in mel basis calculation.
fmax: 7600               # Maximum frequency in mel basis calculation.
global_gain_scale: 1.0   # Will be multiplied to all of waveform.
trim_silence: false      # Whether to trim the start and end of silence.
trim_threshold_in_db: 20 # Need to tune carefully if the recording is not good.
trim_frame_size: 1024    # Frame size in trimming.
trim_hop_size: 256       # Hop size in trimming.
format: "hdf5"           # Feature file format. "npy" or "hdf5" is supported.

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_type: HiFiGANGenerator
generator_params:
    in_channels: 80                       # Number of input channels.
    out_channels: 1                       # Number of output channels.
    channels: 512                         # Number of initial channels.
    kernel_size: 7                        # Kernel size of initial and final conv layers.
    upsample_scales: [5, 5, 4, 3]         # Upsampling scales.
    upsample_kernel_sizes: [10, 10, 8, 6] # Kernel size for upsampling layers.
    resblock_kernel_sizes: [3, 7, 11]     # Kernel size for residual blocks.
    resblock_dilations:                   # Dilations for residual blocks.
        - [1, 3, 5]
        - [1, 3, 5]
        - [1, 3, 5]
    use_additional_convs: true            # Whether to use additional conv layer in residual blocks.
    bias: true                            # Whether to use bias parameter in conv.
    nonlinear_activation: "LeakyReLU"     # Nonlinear activation type.
    nonlinear_activation_params:          # Nonlinear activation parameters.
        negative_slope: 0.1
    use_weight_norm: true                 # Whether to apply weight normalization.

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
discriminator_params:
    scales: 3                              # Number of multi-scale discriminator.
    scale_downsample_pooling: "AvgPool1d"  # Pooling operation for scale discriminator.
    scale_downsample_pooling_params:
        kernel_size: 4                     # Pooling kernel size.
        stride: 2                          # Pooling stride.
        padding: 2                         # Padding size.
    scale_discriminator_params:
        in_channels: 1                     # Number of input channels.
        out_channels: 1                    # Number of output channels.
        kernel_sizes: [15, 41, 5, 3]       # List of kernel sizes.
        channels: 128                      # Initial number of channels.
        max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.
        max_groups: 16                     # Maximum number of groups in downsampling conv layers.
        bias: true
        downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
        nonlinear_activation: "LeakyReLU"  # Nonlinear activation.
        nonlinear_activation_params:
            negative_slope: 0.1
    follow_official_norm: true             # Whether to follow the official norm setting.
    periods: [2, 3, 5, 7, 11]              # List of period for multi-period discriminator.
    period_discriminator_params:
        in_channels: 1                     # Number of input channels.
        out_channels: 1                    # Number of output channels.
        kernel_sizes: [5, 3]               # List of kernel sizes.
        channels: 32                       # Initial number of channels.
        downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
        max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.
        bias: true                         # Whether to use bias parameter in conv layer.
        nonlinear_activation: "LeakyReLU"  # Nonlinear activation.
        nonlinear_activation_params:       # Nonlinear activation parameters.
            negative_slope: 0.1
        use_weight_norm: true              # Whether to apply weight normalization.
        use_spectral_norm: false           # Whether to apply spectral normalization.

###########################################################
#                   STFT LOSS SETTING                     #
###########################################################
use_stft_loss: false # Whether to use multi-resolution STFT loss.
use_mel_loss: true   # Whether to use Mel-spectrogram loss.
mel_loss_params:
    fs: 24000
    fft_size: 2048
    hop_size: 300
    win_length: 1200
    window: "hann"
    num_mels: 80
    fmin: 0
    fmax: 12000
    log_base: null
generator_adv_loss_params:
    average_by_discriminators: false # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
    average_by_discriminators: false # Whether to average loss by #discriminators.
use_feat_match_loss: true
feat_match_loss_params:
    average_by_discriminators: false # Whether to average loss by #discriminators.
    average_by_layers: false         # Whether to average loss by #layers in each discriminator.
    include_final_outputs: false     # Whether to include final outputs in feat match loss calculation.

###########################################################
#               ADVERSARIAL LOSS SETTING                  #
###########################################################
lambda_aux: 45.0       # Loss balancing coefficient for STFT loss.
lambda_adv: 1.0        # Loss balancing coefficient for adversarial loss.
lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.

###########################################################
#                  DATA LOADER SETTING                    #
###########################################################
batch_size: 16              # Batch size.
batch_max_steps: 8400       # Length of each audio in batch. Make sure dividable by hop_size.
pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
num_workers: 2              # Number of workers in Pytorch DataLoader.
remove_short_samples: false # Whether to remove samples the length of which are less than batch_max_steps.
allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.

###########################################################
#             OPTIMIZER & SCHEDULER SETTING               #
###########################################################
generator_optimizer_type: Adam
generator_optimizer_params:
    lr: 2.0e-4
    betas: [0.5, 0.9]
    weight_decay: 0.0
generator_scheduler_type: MultiStepLR
generator_scheduler_params:
    gamma: 0.5
    milestones:
        - 200000
        - 400000
        - 600000
        - 800000
generator_grad_norm: -1
discriminator_optimizer_type: Adam
discriminator_optimizer_params:
    lr: 2.0e-4
    betas: [0.5, 0.9]
    weight_decay: 0.0
discriminator_scheduler_type: MultiStepLR
discriminator_scheduler_params:
    gamma: 0.5
    milestones:
        - 200000
        - 400000
        - 600000
        - 800000
discriminator_grad_norm: -1

###########################################################
#                    INTERVAL SETTING                     #
###########################################################
generator_train_start_steps: 1     # Number of steps to start to train generator.
discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
train_max_steps: 2500000           # Number of training steps.
save_interval_steps: 10000         # Interval steps to save checkpoint.
eval_interval_steps: 1000          # Interval steps to evaluate the network.
log_interval_steps: 100            # Interval steps to record the training log.
|
176 |
+
|
177 |
+
###########################################################
|
178 |
+
# OTHER SETTING #
|
179 |
+
###########################################################
|
180 |
+
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
|
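A note on the HiFi-GAN settings above: the generator's upsample_scales must multiply out to the analysis hop size (5 * 5 * 4 * 3 = 300 samples for this 24 kHz recipe), and batch_max_steps must be divisible by that hop size, as the inline comments require. The following is a minimal sanity-check sketch, not part of the recipe; the config path and the use of PyYAML are assumptions for illustration only.

# Minimal consistency check for a vocoder config (illustrative; path is assumed).
import math
import yaml

with open("conf/hifigan.v1.yaml") as f:
    cfg = yaml.safe_load(f)

hop_size = cfg["mel_loss_params"]["hop_size"]        # 300 in this config
scales = cfg["generator_params"]["upsample_scales"]  # [5, 5, 4, 3]
assert math.prod(scales) == hop_size                 # total upsampling == hop size
assert cfg["batch_max_steps"] % hop_size == 0        # 8400 is a multiple of 300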
ParallelWaveGAN/egs/csmsc/voc1/conf/multi_band_melgan.v2.yaml
ADDED
@@ -0,0 +1,150 @@
# This is the hyperparameter configuration file for MelGAN.
# Please make sure this is adjusted for the CSMSC dataset. If you want to
# apply to the other dataset, you might need to carefully change some parameters.
# This configuration requires ~8GB memory and will finish within 4 days on Titan V.

# This configuration is based on full-band MelGAN but the hop size and sampling
# rate is different from the paper (16kHz vs 24kHz). The number of iterations
# is not shown in the paper, so currently we train 1M iterations (not sure enough
# to converge). The optimizer setting is based on @dathudeptrai's advice.
# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
sampling_rate: 24000 # Sampling rate.
fft_size: 2048 # FFT size.
hop_size: 300 # Hop size.
win_length: 1200 # Window length.
                 # If set to null, it will be the same as fft_size.
window: "hann" # Window function.
num_mels: 80 # Number of mel basis.
fmin: 80 # Minimum freq in mel basis calculation.
fmax: 7600 # Maximum frequency in mel basis calculation.
global_gain_scale: 1.0 # Will be multiplied to all of waveform.
trim_silence: false # Whether to trim the start and end of silence.
trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
trim_frame_size: 2048 # Frame size in trimming.
trim_hop_size: 512 # Hop size in trimming.
format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_type: "MelGANGenerator" # Generator type.
generator_params:
    in_channels: 80 # Number of input channels.
    out_channels: 4 # Number of output channels.
    kernel_size: 7 # Kernel size of initial and final conv layers.
    channels: 384 # Initial number of channels for conv layers.
    upsample_scales: [5, 5, 3] # List of upsampling scales.
    stack_kernel_size: 3 # Kernel size of dilated conv layers in residual stack.
    stacks: 4 # Number of stacks in a single residual stack module.
    use_weight_norm: True # Whether to use weight normalization.
    use_causal_conv: False # Whether to use causal convolution.

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_type: "MelGANMultiScaleDiscriminator" # Discriminator type.
discriminator_params:
    in_channels: 1 # Number of input channels.
    out_channels: 1 # Number of output channels.
    scales: 3 # Number of multi-scales.
    downsample_pooling: "AvgPool1d" # Pooling type for the input downsampling.
    downsample_pooling_params: # Parameters of the above pooling function.
        kernel_size: 4
        stride: 2
        padding: 1
        count_include_pad: False
    kernel_sizes: [5, 3] # List of kernel sizes.
    channels: 16 # Number of channels of the initial conv layer.
    max_downsample_channels: 512 # Maximum number of channels of downsampling layers.
    downsample_scales: [4, 4, 4] # List of downsampling scales.
    nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
    nonlinear_activation_params: # Parameters of nonlinear activation function.
        negative_slope: 0.2
    use_weight_norm: True # Whether to use weight norm.

###########################################################
#                   STFT LOSS SETTING                     #
###########################################################
stft_loss_params:
    fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
    hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss.
    win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
    window: "hann_window" # Window function for STFT-based loss.
use_subband_stft_loss: true
subband_stft_loss_params:
    fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
    hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss.
    win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
    window: "hann_window" # Window function for STFT-based loss.

###########################################################
#               ADVERSARIAL LOSS SETTING                  #
###########################################################
use_feat_match_loss: false # Whether to use feature matching loss.
lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss.

###########################################################
#                  DATA LOADER SETTING                    #
###########################################################
batch_size: 64 # Batch size.
batch_max_steps: 16200 # Length of each audio in batch. Make sure dividable by hop_size.
pin_memory: true # Whether to pin memory in Pytorch DataLoader.
num_workers: 4 # Number of workers in Pytorch DataLoader.
remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.

###########################################################
#             OPTIMIZER & SCHEDULER SETTING               #
###########################################################
generator_optimizer_type: "Adam" # Generator's optimizer type.
generator_optimizer_params:
    lr: 1.0e-3 # Generator's learning rate.
    eps: 1.0e-7 # Generator's epsilon.
    weight_decay: 0.0 # Generator's weight decay coefficient.
    amsgrad: true
generator_grad_norm: -1 # Generator's gradient norm.
generator_scheduler_type: "MultiStepLR" # Generator's scheduler type.
generator_scheduler_params:
    gamma: 0.5 # Generator's scheduler gamma.
    milestones: # At each milestone, lr will be multiplied by gamma.
        - 100000
        - 200000
        - 300000
        - 400000
        - 500000
        - 600000
discriminator_optimizer_type: "Adam" # Discriminator's optimizer type.
discriminator_optimizer_params:
    lr: 1.0e-3 # Discriminator's learning rate.
    eps: 1.0e-7 # Discriminator's epsilon.
    weight_decay: 0.0 # Discriminator's weight decay coefficient.
    amsgrad: true
discriminator_grad_norm: -1 # Discriminator's gradient norm.
discriminator_scheduler_type: "MultiStepLR" # Discriminator's scheduler type.
discriminator_scheduler_params:
    gamma: 0.5 # Discriminator's scheduler gamma.
    milestones: # At each milestone, lr will be multiplied by gamma.
        - 100000
        - 200000
        - 300000
        - 400000
        - 500000
        - 600000

###########################################################
#                    INTERVAL SETTING                     #
###########################################################
discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator.
train_max_steps: 1000000 # Number of training steps.
save_interval_steps: 50000 # Interval steps to save checkpoint.
eval_interval_steps: 1000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.

###########################################################
#                     OTHER SETTING                       #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
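In the multi-band setup above, out_channels: 4 means the generator predicts four PQMF subband signals rather than the full-band waveform, so each subband only needs an upsampling factor of hop_size / 4 = 75, which is exactly the product of upsample_scales (5 * 5 * 3); the subband STFT loss supervises those intermediate signals. A small illustrative check of that relationship follows (config path assumed, not part of the recipe).

# Illustrative check: subband upsampling times the number of subbands equals the hop size.
import math
import yaml

with open("conf/multi_band_melgan.v2.yaml") as f:
    cfg = yaml.safe_load(f)

subbands = cfg["generator_params"]["out_channels"]                # 4 PQMF subbands
per_band = math.prod(cfg["generator_params"]["upsample_scales"])  # 5 * 5 * 3 = 75
assert per_band * subbands == cfg["hop_size"]                     # 75 * 4 == 300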
ParallelWaveGAN/egs/csmsc/voc1/conf/parallel_wavegan.v1.yaml
ADDED
@@ -0,0 +1,122 @@
# This is the hyperparameter configuration file for Parallel WaveGAN.
# Please make sure this is adjusted for the CSMSC dataset. If you want to
# apply to the other dataset, you might need to carefully change some parameters.
# This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
sampling_rate: 24000 # Sampling rate.
fft_size: 2048 # FFT size.
hop_size: 300 # Hop size.
win_length: 1200 # Window length.
                 # If set to null, it will be the same as fft_size.
window: "hann" # Window function.
num_mels: 80 # Number of mel basis.
fmin: 80 # Minimum freq in mel basis calculation.
fmax: 7600 # Maximum frequency in mel basis calculation.
global_gain_scale: 1.0 # Will be multiplied to all of waveform.
trim_silence: false # Whether to trim the start and end of silence.
trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
trim_frame_size: 2048 # Frame size in trimming.
trim_hop_size: 512 # Hop size in trimming.
format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_params:
    in_channels: 1 # Number of input channels.
    out_channels: 1 # Number of output channels.
    kernel_size: 3 # Kernel size of dilated convolution.
    layers: 30 # Number of residual block layers.
    stacks: 3 # Number of stacks, i.e., dilation cycles.
    residual_channels: 64 # Number of channels in residual conv.
    gate_channels: 128 # Number of channels in gated conv.
    skip_channels: 64 # Number of channels in skip conv.
    aux_channels: 80 # Number of channels for auxiliary feature conv.
                     # Must be the same as num_mels.
    aux_context_window: 2 # Context window size for auxiliary feature.
                          # If set to 2, previous 2 and future 2 frames will be considered.
    dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
    use_weight_norm: true # Whether to use weight norm.
                          # If set to true, it will be applied to all of the conv layers.
    upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
    upsample_params: # Upsampling network parameters.
        upsample_scales: [4, 5, 3, 5] # Upsampling scales. Product of these must be the same as hop size.

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_params:
    in_channels: 1 # Number of input channels.
    out_channels: 1 # Number of output channels.
    kernel_size: 3 # Kernel size of conv layers.
    layers: 10 # Number of conv layers.
    conv_channels: 64 # Number of conv channels.
    bias: true # Whether to use bias parameter in conv.
    use_weight_norm: true # Whether to use weight norm.
                          # If set to true, it will be applied to all of the conv layers.
    nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
    nonlinear_activation_params: # Nonlinear function parameters.
        negative_slope: 0.2 # Alpha in LeakyReLU.

###########################################################
#                   STFT LOSS SETTING                     #
###########################################################
stft_loss_params:
    fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
    hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss.
    win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
    window: "hann_window" # Window function for STFT-based loss.

###########################################################
#               ADVERSARIAL LOSS SETTING                  #
###########################################################
lambda_adv: 4.0 # Loss balancing coefficient.

###########################################################
#                  DATA LOADER SETTING                    #
###########################################################
batch_size: 6 # Batch size.
batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by hop_size.
pin_memory: true # Whether to pin memory in Pytorch DataLoader.
num_workers: 2 # Number of workers in Pytorch DataLoader.
remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.

###########################################################
#             OPTIMIZER & SCHEDULER SETTING               #
###########################################################
generator_optimizer_params:
    lr: 0.0001 # Generator's learning rate.
    eps: 1.0e-6 # Generator's epsilon.
    weight_decay: 0.0 # Generator's weight decay coefficient.
generator_scheduler_params:
    step_size: 200000 # Generator's scheduler step size.
    gamma: 0.5 # Generator's scheduler gamma.
               # At each step size, lr will be multiplied by this parameter.
generator_grad_norm: 10 # Generator's gradient norm.
discriminator_optimizer_params:
    lr: 0.00005 # Discriminator's learning rate.
    eps: 1.0e-6 # Discriminator's epsilon.
    weight_decay: 0.0 # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
    step_size: 200000 # Discriminator's scheduler step size.
    gamma: 0.5 # Discriminator's scheduler gamma.
               # At each step size, lr will be multiplied by this parameter.
discriminator_grad_norm: 1 # Discriminator's gradient norm.

###########################################################
#                    INTERVAL SETTING                     #
###########################################################
discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
train_max_steps: 400000 # Number of training steps.
save_interval_steps: 5000 # Interval steps to save checkpoint.
eval_interval_steps: 1000 # Interval steps to evaluate the network.
log_interval_steps: 100 # Interval steps to record the training log.

###########################################################
#                     OTHER SETTING                       #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
ParallelWaveGAN/egs/csmsc/voc1/conf/slurm.conf
ADDED
@@ -0,0 +1,12 @@
# Default configuration
command sbatch --export=PATH --ntasks-per-node=1
option time=* --time $0
option mem=* --mem-per-cpu $0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts
default gpu=0
option gpu=0 -p cpu
option gpu=* -p gpu --gres=gpu:$0
# note: the --max-jobs-run option is supported as a special case
# by slurm.pl and you don't have to handle it in the config file.
ParallelWaveGAN/egs/csmsc/voc1/conf/style_melgan.v1.yaml
ADDED
@@ -0,0 +1,147 @@
# This is the configuration file for the CSMSC dataset.
# This configuration is based on the StyleMelGAN paper but
# uses MSE loss instead of Hinge loss. And I found that
# batch_size = 8 is also working well. So maybe if you
# want to accelerate the training, you can reduce the
# batch size (e.g. 8 or 16). Upsampling scales are modified
# to fit the shift size 300 pt.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
sampling_rate: 24000 # Sampling rate.
fft_size: 2048 # FFT size.
hop_size: 300 # Hop size.
win_length: 1200 # Window length.
                 # If set to null, it will be the same as fft_size.
window: "hann" # Window function.
num_mels: 80 # Number of mel basis.
fmin: 80 # Minimum freq in mel basis calculation.
fmax: 7600 # Maximum frequency in mel basis calculation.
global_gain_scale: 1.0 # Will be multiplied to all of waveform.
trim_silence: false # Whether to trim the start and end of silence.
trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
trim_frame_size: 1024 # Frame size in trimming.
trim_hop_size: 256 # Hop size in trimming.
format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_type: "StyleMelGANGenerator" # Generator type.
generator_params:
    in_channels: 128
    aux_channels: 80
    channels: 64
    out_channels: 1
    kernel_size: 9
    dilation: 2
    bias: True
    noise_upsample_scales: [10, 2, 2, 2]
    noise_upsample_activation: "LeakyReLU"
    noise_upsample_activation_params:
        negative_slope: 0.2
    upsample_scales: [5, 1, 5, 1, 3, 1, 2, 2, 1]
    upsample_mode: "nearest"
    gated_function: "softmax"
    use_weight_norm: True

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_type: "StyleMelGANDiscriminator" # Discriminator type.
discriminator_params:
    repeats: 4
    window_sizes: [512, 1024, 2048, 4096]
    pqmf_params:
        - [1, None, None, None]
        - [2, 62, 0.26700, 9.0]
        - [4, 62, 0.14200, 9.0]
        - [8, 62, 0.07949, 9.0]
    discriminator_params:
        out_channels: 1
        kernel_sizes: [5, 3]
        channels: 16
        max_downsample_channels: 512
        bias: True
        downsample_scales: [4, 4, 4, 1]
        nonlinear_activation: "LeakyReLU"
        nonlinear_activation_params:
            negative_slope: 0.2
    use_weight_norm: True

###########################################################
#                   STFT LOSS SETTING                     #
###########################################################
stft_loss_params:
    fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
    hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss.
    win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
    window: "hann_window" # Window function for STFT-based loss.
lambda_aux: 1.0 # Loss balancing coefficient for aux loss.

###########################################################
#               ADVERSARIAL LOSS SETTING                  #
###########################################################
lambda_adv: 1.0 # Loss balancing coefficient for adv loss.
generator_adv_loss_params:
    average_by_discriminators: false # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
    average_by_discriminators: false # Whether to average loss by #discriminators.

###########################################################
#                  DATA LOADER SETTING                    #
###########################################################
batch_size: 32 # Batch size.
batch_max_steps: 24000 # Length of each audio in batch. Make sure dividable by hop_size.
pin_memory: true # Whether to pin memory in Pytorch DataLoader.
num_workers: 2 # Number of workers in Pytorch DataLoader.
remove_short_samples: false # Whether to remove samples the length of which are less than batch_max_steps.
allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.

###########################################################
#             OPTIMIZER & SCHEDULER SETTING               #
###########################################################
generator_optimizer_type: Adam
generator_optimizer_params:
    lr: 1.0e-4
    betas: [0.5, 0.9]
    weight_decay: 0.0
generator_scheduler_type: MultiStepLR
generator_scheduler_params:
    gamma: 0.5
    milestones:
        - 100000
        - 300000
        - 500000
        - 700000
        - 900000
generator_grad_norm: -1
discriminator_optimizer_type: Adam
discriminator_optimizer_params:
    lr: 2.0e-4
    betas: [0.5, 0.9]
    weight_decay: 0.0
discriminator_scheduler_type: MultiStepLR
discriminator_scheduler_params:
    gamma: 0.5
    milestones:
        - 200000
        - 400000
        - 600000
        - 800000
discriminator_grad_norm: -1

###########################################################
#                    INTERVAL SETTING                     #
###########################################################
discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
train_max_steps: 1500000 # Number of training steps.
save_interval_steps: 50000 # Interval steps to save checkpoint.
eval_interval_steps: 1000 # Interval steps to evaluate the network.
log_interval_steps: 100 # Interval steps to record the training log.

###########################################################
#                     OTHER SETTING                       #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
ParallelWaveGAN/egs/csmsc/voc1/local/data_download.sh
ADDED
@@ -0,0 +1,32 @@
#!/bin/bash

# Copyright 2019 Tomoki Hayashi
#  MIT License (https://opensource.org/licenses/MIT)

download_dir=$1

# check arguments
if [ $# != 1 ]; then
    echo "Usage: $0 <download_dir>"
    exit 1
fi

set -euo pipefail

# download dataset
cwd=$(pwd)
if [ ! -e "${download_dir}/CSMSC" ]; then
    mkdir -p "${download_dir}"
    cd "${download_dir}"
    wget https://weixinxcxdb.oss-cn-beijing.aliyuncs.com/gwYinPinKu/BZNSYP.rar
    mkdir CSMSC && cd CSMSC && unrar x ../BZNSYP.rar
    # convert new line code
    find ./PhoneLabeling -name "*.interval" | while read -r line; do
        nkf -Lu --overwrite "${line}"
    done
    rm ../BZNSYP.rar
    cd "${cwd}"
    echo "Successfully finished download."
else
    echo "Already exists. Skip download."
fi
ParallelWaveGAN/egs/csmsc/voc1/local/data_prep.sh
ADDED
@@ -0,0 +1,94 @@
#!/bin/bash

# Copyright 2019 Tomoki Hayashi
#  MIT License (https://opensource.org/licenses/MIT)

# shellcheck disable=SC1091
. ./path.sh || exit 1;

fs=24000
num_dev=100
num_eval=100
train_set="train_nodev"
dev_set="dev"
eval_set="eval"
shuffle=false

# shellcheck disable=SC1091
. utils/parse_options.sh || exit 1;

db_root=$1
data_dir=$2

# check arguments
if [ $# != 2 ]; then
    echo "Usage: $0 [Options] <db_root> <data_dir>"
    echo "e.g.: $0 downloads/CSMSC data"
    echo ""
    echo "Options:"
    echo "    --fs: target sampling rate (default=24000)."
    echo "    --num_dev: number of development utterances (default=100)."
    echo "    --num_eval: number of evaluation utterances (default=100)."
    echo "    --train_set: name of train set (default=train_nodev)."
    echo "    --dev_set: name of dev set (default=dev)."
    echo "    --eval_set: name of eval set (default=eval)."
    echo "    --shuffle: whether to perform shuffle in making dev / eval set (default=false)."
    exit 1
fi

set -euo pipefail

[ ! -e "${data_dir}/all" ] && mkdir -p "${data_dir}/all"

# set filenames
scp="${data_dir}/all/wav.scp"
segments="${data_dir}/all/segments"

# check file existence
[ -e "${scp}" ] && rm "${scp}"
[ -e "${segments}" ] && rm "${segments}"

# make wav.scp
find "${db_root}/Wave" -name "*.wav" -follow | sort | while read -r filename; do
    id="$(basename "${filename}" .wav)"
    echo "csmsc_${id} cat ${filename} | sox -t wav - -c 1 -b 16 -t wav - rate ${fs} |" >> "${scp}"
done

# make segments
find "${db_root}/PhoneLabeling" -name "*.interval" -follow | sort | while read -r filename; do
    nkf -Lu --overwrite "${filename}"
    id="$(basename "${filename}" .interval)"
    start_sec=$(tail -n +14 "${filename}" | head -n 1)
    end_sec=$(head -n -2 "${filename}" | tail -n 1)
    [ -z "${start_sec}" ] && echo "Start second is missing (utt_id=${id})." >&2 && exit 1;
    [ -z "${end_sec}" ] && echo "End second is missing (utt_id=${id})." >&2 && exit 1;
    echo "csmsc_${id} csmsc_${id} ${start_sec} ${end_sec}" >> "${segments}"
done

# check
diff -q <(awk '{print $1}' "${scp}") <(awk '{print $1}' "${segments}") > /dev/null

# split
num_all=$(wc -l < "${scp}")
num_deveval=$((num_dev + num_eval))
num_train=$((num_all - num_deveval))
utils/split_data.sh \
    --num_first "${num_train}" \
    --num_second "${num_deveval}" \
    --shuffle "${shuffle}" \
    "${data_dir}/all" \
    "${data_dir}/${train_set}" \
    "${data_dir}/deveval"
utils/split_data.sh \
    --num_first "${num_dev}" \
    --num_second "${num_eval}" \
    --shuffle "${shuffle}" \
    "${data_dir}/deveval" \
    "${data_dir}/${dev_set}" \
    "${data_dir}/${eval_set}"

# remove tmp directories
rm -rf "${data_dir}/all"
rm -rf "${data_dir}/deveval"

echo "Successfully prepared data."
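The segments file written by this script follows the Kaldi convention "<utt_id> <recording_id> <start_sec> <end_sec>", with the start and end times pulled from the .interval phone-label files so that leading and trailing silence is excluded. A hypothetical helper (not part of the recipe) that parses such a file into per-utterance durations:

# Hypothetical helper: compute utterance durations from a Kaldi-style segments file.
def read_durations(path):
    durations = {}
    with open(path) as f:
        for line in f:
            utt_id, _rec_id, start, end = line.split()
            durations[utt_id] = float(end) - float(start)
    return durations

# Example (path assumed): read_durations("data/train_nodev/segments")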
ParallelWaveGAN/egs/csmsc/voc1/path.sh
ADDED
@@ -0,0 +1,33 @@
# cuda related
export CUDA_HOME=/usr/local/cuda-10.0
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"

# path related
export PRJ_ROOT="${PWD}/../../.."
if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
    # shellcheck disable=SC1090
    . "${PRJ_ROOT}/tools/venv/bin/activate"
fi

# python related
export OMP_NUM_THREADS=1
export PYTHONIOENCODING=UTF-8
export MPL_BACKEND=Agg

# check installation
if ! command -v parallel-wavegan-train > /dev/null; then
    echo "Error: It seems setup is not finished." >&2
    echo "Error: Please setup your environment by following README.md" >&2
    return 1
fi
if ! command -v jq > /dev/null; then
    echo "Error: It seems jq is not installed." >&2
    echo "Error: Please install via \`sudo apt-get install jq\`." >&2
    echo "Error: If you do not have sudo, please download from https://stedolan.github.io/jq/download/." >&2
    return 1
fi
if ! command -v yq > /dev/null; then
    echo "Error: It seems yq is not installed." >&2
    echo "Error: Please install via \`pip install yq\`." >&2
    return 1
fi
ParallelWaveGAN/egs/csmsc/voc1/run.sh
ADDED
@@ -0,0 +1,164 @@
#!/bin/bash

# Copyright 2019 Tomoki Hayashi
#  MIT License (https://opensource.org/licenses/MIT)

. ./cmd.sh || exit 1;
. ./path.sh || exit 1;

# basic settings
stage=-1       # stage to start
stop_stage=100 # stage to stop
verbose=1      # verbosity level (lower is less info)
n_gpus=1       # number of gpus in training
n_jobs=16      # number of parallel jobs in feature extraction

# NOTE(kan-bayashi): renamed to conf to avoid conflict in parse_options.sh
conf=conf/parallel_wavegan.v1.yaml

# directory path setting
download_dir=downloads # directory to save downloaded files
dumpdir=dump           # directory to dump features

# training related setting
tag=""    # tag for directory to save model
resume="" # checkpoint path to resume training
          # (e.g. <path>/<to>/checkpoint-10000steps.pkl)

# decoding related setting
checkpoint="" # checkpoint path to be used for decoding
              # if not provided, the latest one will be used
              # (e.g. <path>/<to>/checkpoint-400000steps.pkl)

# shellcheck disable=SC1091
. utils/parse_options.sh || exit 1;

train_set="train_nodev" # name of training data directory
dev_set="dev"           # name of development data directory
eval_set="eval"         # name of evaluation data directory

set -euo pipefail

if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
    echo "Stage -1: Data download"
    local/data_download.sh "${download_dir}"
fi

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    echo "Stage 0: Data preparation"
    local/data_prep.sh \
        --train_set "${train_set}" \
        --dev_set "${dev_set}" \
        --eval_set "${eval_set}" \
        "${download_dir}/CSMSC" data
fi

stats_ext=$(grep -q "hdf5" <(yq ".format" "${conf}") && echo "h5" || echo "npy")
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "Stage 1: Feature extraction"
    # extract raw features
    pids=()
    for name in "${train_set}" "${dev_set}" "${eval_set}"; do
    (
        [ ! -e "${dumpdir}/${name}/raw" ] && mkdir -p "${dumpdir}/${name}/raw"
        echo "Feature extraction start. See the progress via ${dumpdir}/${name}/raw/preprocessing.*.log."
        utils/make_subset_data.sh "data/${name}" "${n_jobs}" "${dumpdir}/${name}/raw"
        ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/raw/preprocessing.JOB.log" \
            parallel-wavegan-preprocess \
                --config "${conf}" \
                --scp "${dumpdir}/${name}/raw/wav.JOB.scp" \
                --segments "${dumpdir}/${name}/raw/segments.JOB" \
                --dumpdir "${dumpdir}/${name}/raw/dump.JOB" \
                --verbose "${verbose}"
        echo "Successfully finished feature extraction of ${name} set."
    ) &
    pids+=($!)
    done
    i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
    [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
    echo "Successfully finished feature extraction."

    # calculate statistics for normalization
    echo "Statistics computation start. See the progress via ${dumpdir}/${train_set}/compute_statistics.log."
    ${train_cmd} "${dumpdir}/${train_set}/compute_statistics.log" \
        parallel-wavegan-compute-statistics \
            --config "${conf}" \
            --rootdir "${dumpdir}/${train_set}/raw" \
            --dumpdir "${dumpdir}/${train_set}" \
            --verbose "${verbose}"
    echo "Successfully finished calculation of statistics."

    # normalize and dump them
    pids=()
    for name in "${train_set}" "${dev_set}" "${eval_set}"; do
    (
        [ ! -e "${dumpdir}/${name}/norm" ] && mkdir -p "${dumpdir}/${name}/norm"
        echo "Normalization start. See the progress via ${dumpdir}/${name}/norm/normalize.*.log."
        ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/norm/normalize.JOB.log" \
            parallel-wavegan-normalize \
                --config "${conf}" \
                --stats "${dumpdir}/${train_set}/stats.${stats_ext}" \
                --rootdir "${dumpdir}/${name}/raw/dump.JOB" \
                --dumpdir "${dumpdir}/${name}/norm/dump.JOB" \
                --verbose "${verbose}"
        echo "Successfully finished normalization of ${name} set."
    ) &
    pids+=($!)
    done
    i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
    [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
    echo "Successfully finished normalization."
fi

if [ -z "${tag}" ]; then
    expdir="exp/${train_set}_csmsc_$(basename "${conf}" .yaml)"
else
    expdir="exp/${train_set}_csmsc_${tag}"
fi
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    echo "Stage 2: Network training"
    [ ! -e "${expdir}" ] && mkdir -p "${expdir}"
    cp "${dumpdir}/${train_set}/stats.${stats_ext}" "${expdir}"
    if [ "${n_gpus}" -gt 1 ]; then
        train="python -m parallel_wavegan.distributed.launch --nproc_per_node ${n_gpus} -c parallel-wavegan-train"
    else
        train="parallel-wavegan-train"
    fi
    echo "Training start. See the progress via ${expdir}/train.log."
    ${cuda_cmd} --gpu "${n_gpus}" "${expdir}/train.log" \
        ${train} \
            --config "${conf}" \
            --train-dumpdir "${dumpdir}/${train_set}/norm" \
            --dev-dumpdir "${dumpdir}/${dev_set}/norm" \
            --outdir "${expdir}" \
            --resume "${resume}" \
            --verbose "${verbose}"
    echo "Successfully finished training."
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    echo "Stage 3: Network decoding"
    # shellcheck disable=SC2012
    [ -z "${checkpoint}" ] && checkpoint="$(ls -dt "${expdir}"/*.pkl | head -1 || true)"
    outdir="${expdir}/wav/$(basename "${checkpoint}" .pkl)"
    pids=()
    for name in "${dev_set}" "${eval_set}"; do
    (
        [ ! -e "${outdir}/${name}" ] && mkdir -p "${outdir}/${name}"
        [ "${n_gpus}" -gt 1 ] && n_gpus=1
        echo "Decoding start. See the progress via ${outdir}/${name}/decode.log."
        ${cuda_cmd} --gpu "${n_gpus}" "${outdir}/${name}/decode.log" \
            parallel-wavegan-decode \
                --dumpdir "${dumpdir}/${name}/norm" \
                --checkpoint "${checkpoint}" \
                --outdir "${outdir}/${name}" \
                --verbose "${verbose}"
        echo "Successfully finished decoding of ${name} set."
    ) &
    pids+=($!)
    done
    i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
    [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
    echo "Successfully finished decoding."
fi
echo "Finished."
ParallelWaveGAN/egs/csmsc/voc1/utils
ADDED
@@ -0,0 +1 @@
../../../utils
ParallelWaveGAN/egs/jnas/voc1/cmd.sh
ADDED
@@ -0,0 +1,91 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
#   --time <time>: Limit the maximum time to execute.
#   --mem <mem>: Limit the maximum memory usage.
#   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
#   --num-threads <ngpu>: Specify the number of CPU cores.
#   --gpu <ngpu>: Specify the number of GPU devices.
#   --config: Change the configuration file from default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The left string of "=", i.e. "JOB", is replaced by <N> (Nth job) in the command and the log file name,
# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for the 8th job respectively.
# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
#
# run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface, not depending on the backend.
# These options are mapped to specific options for each backend and
# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs failed, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
#   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================


# Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
cmd_backend="local"

# Local machine, without any job scheduling system
if [ "${cmd_backend}" = local ]; then

    # The other usage
    export train_cmd="utils/run.pl"
    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
    export cuda_cmd="utils/run.pl"
    # Used for "*_recog.py"
    export decode_cmd="utils/run.pl"

# Local machine, without any job scheduling system
elif [ "${cmd_backend}" = stdout ]; then

    # The other usage
    export train_cmd="utils/stdout.pl"
    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
    export cuda_cmd="utils/stdout.pl"
    # Used for "*_recog.py"
    export decode_cmd="utils/stdout.pl"

# "qsub" (SGE, Torque, PBS, etc.)
elif [ "${cmd_backend}" = sge ]; then
    # The default setting is written in conf/queue.conf.
    # You must change "-q g.q" to the "queue" for your environment.
    # To know the "queue" names, type "qhost -q"
    # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.

    export train_cmd="utils/queue.pl"
    export cuda_cmd="utils/queue.pl"
    export decode_cmd="utils/queue.pl"

# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
    # The default setting is written in conf/slurm.conf.
    # You must change "-p cpu" and "-p gpu" to the "partition" for your environment.
    # To know the "partition" names, type "sinfo".
    # You can use "--gpu *" by default for slurm and it is interpreted as "--gres gpu:*"
    # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

    export train_cmd="utils/slurm.pl"
    export cuda_cmd="utils/slurm.pl"
    export decode_cmd="utils/slurm.pl"

elif [ "${cmd_backend}" = ssh ]; then
    # You have to create ".queue/machines" to specify the hosts to execute jobs.
    # e.g. .queue/machines
    #   host1
    #   host2
    #   host3
    # Assuming you can log in to them without any password, i.e. you have to set up ssh keys.

    export train_cmd="utils/ssh.pl"
    export cuda_cmd="utils/ssh.pl"
    export decode_cmd="utils/ssh.pl"

else
    echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
    return 1
fi
ParallelWaveGAN/egs/jnas/voc1/conf/parallel_wavegan.v1.long.yaml
ADDED
@@ -0,0 +1,123 @@
# This is the hyperparameter configuration file for Parallel WaveGAN.
# Please make sure this is adjusted for the JNAS dataset. If you want to
# apply to the other dataset, you might need to carefully change some parameters.
# This configuration trains for more steps, up to 1000k, compared to the v1 config.
# It requires 12 GB GPU memory and takes ~7 days on TITAN V.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
sampling_rate: 16000 # Sampling rate.
fft_size: 1024 # FFT size.
hop_size: 256 # Hop size.
win_length: null # Window length.
                 # If set to null, it will be the same as fft_size.
window: "hann" # Window function.
num_mels: 80 # Number of mel basis.
fmin: 80 # Minimum freq in mel basis calculation.
fmax: 7600 # Maximum frequency in mel basis calculation.
global_gain_scale: 1.0 # Will be multiplied to all of waveform.
trim_silence: true # Whether to trim the start and end of silence.
trim_threshold_in_db: 20 # Need to tune carefully if the recording is not good.
trim_frame_size: 1024 # Frame size in trimming.
trim_hop_size: 256 # Hop size in trimming.
format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_params:
    in_channels: 1 # Number of input channels.
    out_channels: 1 # Number of output channels.
    kernel_size: 3 # Kernel size of dilated convolution.
    layers: 30 # Number of residual block layers.
    stacks: 3 # Number of stacks, i.e., dilation cycles.
    residual_channels: 64 # Number of channels in residual conv.
    gate_channels: 128 # Number of channels in gated conv.
    skip_channels: 64 # Number of channels in skip conv.
    aux_channels: 80 # Number of channels for auxiliary feature conv.
                     # Must be the same as num_mels.
    aux_context_window: 2 # Context window size for auxiliary feature.
                          # If set to 2, previous 2 and future 2 frames will be considered.
    dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
    use_weight_norm: true # Whether to use weight norm.
                          # If set to true, it will be applied to all of the conv layers.
    upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
    upsample_params: # Upsampling network parameters.
        upsample_scales: [4, 4, 4, 4] # Upsampling scales. Product of these must be the same as hop size.

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_params:
    in_channels: 1 # Number of input channels.
    out_channels: 1 # Number of output channels.
    kernel_size: 3 # Kernel size of conv layers.
    layers: 10 # Number of conv layers.
    conv_channels: 64 # Number of conv channels.
    bias: true # Whether to use bias parameter in conv.
    use_weight_norm: true # Whether to use weight norm.
                          # If set to true, it will be applied to all of the conv layers.
    nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
    nonlinear_activation_params: # Nonlinear function parameters.
        negative_slope: 0.2 # Alpha in LeakyReLU.

###########################################################
#                   STFT LOSS SETTING                     #
###########################################################
stft_loss_params:
    fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
    hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss.
    win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
    window: "hann_window" # Window function for STFT-based loss.

###########################################################
#               ADVERSARIAL LOSS SETTING                  #
###########################################################
lambda_adv: 4.0 # Loss balancing coefficient.

###########################################################
#                  DATA LOADER SETTING                    #
###########################################################
batch_size: 10 # Batch size.
batch_max_steps: 15360 # Length of each audio in batch. Make sure dividable by hop_size.
pin_memory: true # Whether to pin memory in Pytorch DataLoader.
num_workers: 2 # Number of workers in Pytorch DataLoader.
remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.

###########################################################
#             OPTIMIZER & SCHEDULER SETTING               #
###########################################################
generator_optimizer_params:
    lr: 0.0001 # Generator's learning rate.
    eps: 1.0e-6 # Generator's epsilon.
    weight_decay: 0.0 # Generator's weight decay coefficient.
generator_scheduler_params:
    step_size: 200000 # Generator's scheduler step size.
    gamma: 0.5 # Generator's scheduler gamma.
               # At each step size, lr will be multiplied by this parameter.
generator_grad_norm: 10 # Generator's gradient norm.
discriminator_optimizer_params:
    lr: 0.00005 # Discriminator's learning rate.
    eps: 1.0e-6 # Discriminator's epsilon.
    weight_decay: 0.0 # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
    step_size: 200000 # Discriminator's scheduler step size.
    gamma: 0.5 # Discriminator's scheduler gamma.
               # At each step size, lr will be multiplied by this parameter.
discriminator_grad_norm: 1 # Discriminator's gradient norm.

###########################################################
#                    INTERVAL SETTING                     #
###########################################################
discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
train_max_steps: 1000000 # Number of training steps.
save_interval_steps: 5000 # Interval steps to save checkpoint.
eval_interval_steps: 1000 # Interval steps to evaluate the network.
log_interval_steps: 100 # Interval steps to record the training log.

###########################################################
#                     OTHER SETTING                       #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
ParallelWaveGAN/egs/jnas/voc1/conf/parallel_wavegan.v1.yaml
ADDED
@@ -0,0 +1,122 @@
# This is the hyperparameter configuration file for Parallel WaveGAN.
# Please make sure this is adjusted for the JNAS dataset. If you want to
# apply to the other dataset, you might need to carefully change some parameters.
# This configuration requires 12 GB GPU memory and takes ~3 days on TITAN V.

###########################################################
#                FEATURE EXTRACTION SETTING                #
###########################################################
sampling_rate: 16000     # Sampling rate.
fft_size: 1024           # FFT size.
hop_size: 256            # Hop size.
win_length: null         # Window length.
                         # If set to null, it will be the same as fft_size.
window: "hann"           # Window function.
num_mels: 80             # Number of mel basis.
fmin: 80                 # Minimum frequency in mel basis calculation.
fmax: 7600               # Maximum frequency in mel basis calculation.
global_gain_scale: 1.0   # Will be multiplied to all of waveform.
trim_silence: true       # Whether to trim the start and end of silence.
trim_threshold_in_db: 20 # Need to tune carefully if the recording is not good.
trim_frame_size: 1024    # Frame size in trimming.
trim_hop_size: 256       # Hop size in trimming.
format: "hdf5"           # Feature file format. "npy" or "hdf5" is supported.

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING           #
###########################################################
generator_params:
    in_channels: 1        # Number of input channels.
    out_channels: 1       # Number of output channels.
    kernel_size: 3        # Kernel size of dilated convolution.
    layers: 30            # Number of residual block layers.
    stacks: 3             # Number of stacks i.e., dilation cycles.
    residual_channels: 64 # Number of channels in residual conv.
    gate_channels: 128    # Number of channels in gated conv.
    skip_channels: 64     # Number of channels in skip conv.
    aux_channels: 80      # Number of channels for auxiliary feature conv.
                          # Must be the same as num_mels.
    aux_context_window: 2 # Context window size for auxiliary feature.
                          # If set to 2, previous 2 and future 2 frames will be considered.
    dropout: 0.0          # Dropout rate. 0.0 means no dropout applied.
    use_weight_norm: true # Whether to use weight norm.
                          # If set to true, it will be applied to all of the conv layers.
    upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
    upsample_params:                      # Upsampling network parameters.
        upsample_scales: [4, 4, 4, 4]     # Upsampling scales. Product of these must be the same as hop size.

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING         #
###########################################################
discriminator_params:
    in_channels: 1        # Number of input channels.
    out_channels: 1       # Number of output channels.
    kernel_size: 3        # Kernel size of conv layers.
    layers: 10            # Number of conv layers.
    conv_channels: 64     # Number of channels in conv layers.
    bias: true            # Whether to use bias parameter in conv.
    use_weight_norm: true # Whether to use weight norm.
                          # If set to true, it will be applied to all of the conv layers.
    nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
    nonlinear_activation_params:      # Nonlinear function parameters
        negative_slope: 0.2           # Alpha in LeakyReLU.

###########################################################
#                   STFT LOSS SETTING                      #
###########################################################
stft_loss_params:
    fft_sizes: [1024, 2048, 512]  # List of FFT size for STFT-based loss.
    hop_sizes: [120, 240, 50]     # List of hop size for STFT-based loss
    win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
    window: "hann_window"         # Window function for STFT-based loss

###########################################################
#                ADVERSARIAL LOSS SETTING                  #
###########################################################
lambda_adv: 4.0  # Loss balancing coefficient.

###########################################################
#                  DATA LOADER SETTING                     #
###########################################################
batch_size: 10              # Batch size.
batch_max_steps: 15360      # Length of each audio in batch. Make sure it is divisible by hop_size.
pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
num_workers: 2              # Number of workers in Pytorch DataLoader.
remove_short_samples: true  # Whether to remove samples the length of which are less than batch_max_steps.
allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.

###########################################################
#             OPTIMIZER & SCHEDULER SETTING                #
###########################################################
generator_optimizer_params:
    lr: 0.0001             # Generator's learning rate.
    eps: 1.0e-6            # Generator's epsilon.
    weight_decay: 0.0      # Generator's weight decay coefficient.
generator_scheduler_params:
    step_size: 200000      # Generator's scheduler step size.
    gamma: 0.5             # Generator's scheduler gamma.
                           # At each step size, lr will be multiplied by this parameter.
generator_grad_norm: 10    # Generator's gradient norm.
discriminator_optimizer_params:
    lr: 0.00005            # Discriminator's learning rate.
    eps: 1.0e-6            # Discriminator's epsilon.
    weight_decay: 0.0      # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
    step_size: 200000      # Discriminator's scheduler step size.
    gamma: 0.5             # Discriminator's scheduler gamma.
                           # At each step size, lr will be multiplied by this parameter.
discriminator_grad_norm: 1 # Discriminator's gradient norm.

###########################################################
#                    INTERVAL SETTING                      #
###########################################################
discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
train_max_steps: 400000                 # Number of training steps.
save_interval_steps: 5000               # Interval steps to save checkpoint.
eval_interval_steps: 1000               # Interval steps to evaluate the network.
log_interval_steps: 100                 # Interval steps to record the training log.

###########################################################
#                     OTHER SETTING                        #
###########################################################
num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
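Note on the config above: the comments require that the product of upsample_scales equals hop_size and that aux_channels equals num_mels. Below is a minimal pre-training sanity check, a sketch that assumes the Python "yq" wrapper required by path.sh is installed; the config path is only an example.

    conf=conf/parallel_wavegan.v1.yaml
    # hop_size must equal the product of the upsampling scales (4*4*4*4 = 256 here).
    hop_size=$(yq ".hop_size" "${conf}")
    scale_prod=$(yq ".generator_params.upsample_params.upsample_scales | reduce .[] as \$s (1; . * \$s)" "${conf}")
    # aux_channels must match num_mels, since the generator conditions on the mel spectrogram.
    num_mels=$(yq ".num_mels" "${conf}")
    aux_channels=$(yq ".generator_params.aux_channels" "${conf}")
    [ "${hop_size}" -eq "${scale_prod}" ] || echo "upsample_scales product ${scale_prod} != hop_size ${hop_size}"
    [ "${num_mels}" -eq "${aux_channels}" ] || echo "aux_channels ${aux_channels} != num_mels ${num_mels}"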
ParallelWaveGAN/egs/jnas/voc1/conf/slurm.conf
ADDED
@@ -0,0 +1,12 @@
# Default configuration
command sbatch --export=PATH --ntasks-per-node=1
option time=* --time $0
option mem=* --mem-per-cpu $0
option mem=0          # Do not add anything to qsub_opts
option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1  # Do not add anything to qsub_opts
default gpu=0
option gpu=0 -p cpu
option gpu=* -p gpu --gres=gpu:$0
# note: the --max-jobs-run option is supported as a special case
# by slurm.pl and you don't have to handle it in the config file.
ParallelWaveGAN/egs/jnas/voc1/conf/train_speakers.txt
ADDED
@@ -0,0 +1,261 @@
F001
F002
F003
F004
F007
F008
F009
F010
F012
F013
F015
F016
F018
F019
F020
F022
F023
F024
F025
F026
F027
F028
F029
F030
F031
F032
F033
F034
F035
F036
F037
F038
F039
F040
F041
F042
F043
F044
F045
F046
F047
F049
F050
F051
F052
F053
F054
F055
F056
F057
F058
F059
F061
F062
F063
F065
F066
F067
F069
F070
F071
F073
F074
F076
F077
F079
F081
F083
F084
F085
F087
F090
F091
F092
F093
F094
F095
F096
F097
F098
F099
F100
F101
F103
F104
F105
F106
F107
F108
F110
F111
F112
F113
F114
F115
F116
F117
F118
F119
F120
F121
F123
F124
F125
F126
F127
F128
F130
F131
F132
F133
F134
F135
F136
F137
F138
F143A
F143B
F144A
F145A
F145B
F146A
F146B
F149
F150
FP01
FP02
FP03
FP04
FP05
M001
M002
M003
M004
M007
M008
M009
M010
M011
M012
M013
M015
M016
M018
M019
M020
M021
M022
M023
M024
M025
M027
M028
M029
M030
M031
M032
M033
M034
M035
M036
M037
M038
M039
M040
M041
M042
M043
M044
M045
M046
M047
M049
M050
M051
M052
M053
M054
M055
M056
M057
M058
M059
M061
M062
M063
M065
M066
M067
M069
M070
M071
M073
M074
M076
M077
M079
M081
M083
M084
M085
M087
M090
M091
M092
M093
M094
M095
M096
M097
M098
M099
M100
M101
M103
M104
M105
M106
M107
M108
M110
M111
M112
M113
M114
M116
M117
M118
M119
M120
M121
M123
M124
M125
M126
M127
M128
M130
M131
M132
M133
M134
M135
M136
M137
M138
M139A
M139B
M140A
M140B
M141A
M141B
M142A
M142B
M147
M148
MP01
MP02
MP03
MP04
MP05
ParallelWaveGAN/egs/jnas/voc1/local/data_prep.sh
ADDED
@@ -0,0 +1,89 @@
#!/bin/bash

# Copyright 2019 Tomoki Hayashi
#  MIT License (https://opensource.org/licenses/MIT)

# shellcheck disable=SC1091
. ./path.sh || exit 1;

num_dev=500
train_set="train_nodev"
dev_set="dev"
eval_set="eval"
shuffle=false

# shellcheck disable=SC1091
. utils/parse_options.sh || exit 1;

# check arguments
if [ $# != 3 ]; then
    echo "Usage: $0 <db_root> <data_dir> <spk_list>"
    echo "e.g.: $0 /database/JNAS data conf/train_speakers.txt"
    echo ""
    echo "Options:"
    echo "    --num_dev: number of development utterances (default=500)."
    echo "    --train_set: name of train set (default=train_nodev)."
    echo "    --dev_set: name of dev set (default=dev)."
    echo "    --eval_set: name of eval set (default=eval)."
    echo "    --shuffle: whether to perform shuffle in making dev / eval set (default=false)."
    exit 1
fi

set -euo pipefail

db_root=$1  # database root directory
data_dir=$2
spk_list=$3

eval_db_root=${db_root}/DOCS/Test_set
wav_type=HS  # DT or HS

# make directories
for name in train "${eval_set}"; do
    [ ! -e "${data_dir}/${name}" ] && mkdir -p "${data_dir}/${name}"
done

# make training & development data
scp="${data_dir}/train/wav.scp"

# check file existence
[ -e "${scp}" ] && rm "${scp}"

# shellcheck disable=SC2013
for spk in $(cat "${spk_list}"); do
    wavdir=${db_root}/WAVES_${wav_type}/${spk}
    [ ! -e "${wavdir}" ] && echo "There is no such directory (${wavdir})." && exit 1
    find "${wavdir}" -follow -name "*.wav" | sort | while read -r filename; do
        id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
        echo "${spk}_${id} ${filename}" >> "${scp}"
    done
done

# shuffle
cp "${scp}" "${scp}.tmp"
sort -R "${scp}.tmp" > "${scp}"
rm -r "${scp}.tmp"

# split
utils/split_data.sh \
    --num_second ${num_dev} \
    --shuffle "${shuffle}" \
    "${data_dir}/train" \
    "${data_dir}/${train_set}" \
    "${data_dir}/${dev_set}"

# make evaluation data
scp="${data_dir}/${eval_set}/wav.scp"

# check file existence
[ -e "${scp}" ] && rm "${scp}"

for name in JNAS_testset_100 JNAS_testset_500; do
    find "${eval_db_root}/${name}/WAVES" -follow -name "*.wav" | sort | while read -r filename; do
        id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
        dirname=$(basename "$(dirname "${filename}")")
        echo "${name}_${dirname}_${id} ${filename}" >> "${scp}"
    done
done

echo "Successfully prepared data."
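For reference, each line the script appends to wav.scp follows the Kaldi convention "<utt_id> <wav_path>", with the utterance id prefixed by the speaker. The entry below is purely hypothetical; the speaker/utterance names and path are illustrative, not taken from the corpus.

    # <spk>_<wav basename>   <path under WAVES_HS>
    F001_EXAMPLE001 /database/JNAS/WAVES_HS/F001/EXAMPLE001.wav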
ParallelWaveGAN/egs/jnas/voc1/path.sh
ADDED
@@ -0,0 +1,33 @@
# cuda related
export CUDA_HOME=/usr/local/cuda-10.0
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"

# path related
export PRJ_ROOT="${PWD}/../../.."
if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
    # shellcheck disable=SC1090
    . "${PRJ_ROOT}/tools/venv/bin/activate"
fi

# python related
export OMP_NUM_THREADS=1
export PYTHONIOENCODING=UTF-8
export MPL_BACKEND=Agg

# check installation
if ! command -v parallel-wavegan-train > /dev/null; then
    echo "Error: It seems setup is not finished." >&2
    echo "Error: Please setup your environment by following README.md" >&2
    return 1
fi
if ! command -v jq > /dev/null; then
    echo "Error: It seems jq is not installed." >&2
    echo "Error: Please install via \`sudo apt-get install jq\`." >&2
    echo "Error: If you do not have sudo, please download from https://stedolan.github.io/jq/download/." >&2
    return 1
fi
if ! command -v yq > /dev/null; then
    echo "Error: It seems yq is not installed." >&2
    echo "Error: Please install via \`pip install yq\`." >&2
    return 1
fi
ParallelWaveGAN/egs/jnas/voc1/run.sh
ADDED
@@ -0,0 +1,158 @@
#!/bin/bash

# Copyright 2019 Tomoki Hayashi
#  MIT License (https://opensource.org/licenses/MIT)

. ./cmd.sh || exit 1;
. ./path.sh || exit 1;

# basic settings
stage=0        # stage to start
stop_stage=100 # stage to stop
verbose=1      # verbosity level (lower is less info)
n_gpus=1       # number of gpus in training
n_jobs=16      # number of parallel jobs in feature extraction

# NOTE(kan-bayashi): renamed to conf to avoid conflict in parse_options.sh
conf=conf/parallel_wavegan.v1.yaml

# directory path setting
db_root=/database/JNAS # database directory
dumpdir=dump           # directory to dump features

# training related setting
tag=""    # tag for directory to save model
resume="" # checkpoint path to resume training
          # (e.g. <path>/<to>/checkpoint-10000steps.pkl)

# decoding related setting
checkpoint="" # checkpoint path to be used for decoding
              # if not provided, the latest one will be used
              # (e.g. <path>/<to>/checkpoint-400000steps.pkl)

# shellcheck disable=SC1091
. utils/parse_options.sh || exit 1;

train_set="train_nodev" # name of training data directory
dev_set="dev"           # name of development data directory
eval_set="eval"         # name of evaluation data directory

set -euo pipefail

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    echo "Stage 0: Data preparation"
    local/data_prep.sh \
        --train_set "${train_set}" \
        --dev_set "${dev_set}" \
        --eval_set "${eval_set}" \
        "${db_root}" data conf/train_speakers.txt
fi

stats_ext=$(grep -q "hdf5" <(yq ".format" "${conf}") && echo "h5" || echo "npy")
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "Stage 1: Feature extraction"
    # extract raw features
    pids=()
    for name in "${train_set}" "${dev_set}" "${eval_set}"; do
    (
        [ ! -e "${dumpdir}/${name}/raw" ] && mkdir -p "${dumpdir}/${name}/raw"
        echo "Feature extraction start. See the progress via ${dumpdir}/${name}/raw/preprocessing.*.log."
        utils/make_subset_data.sh "data/${name}" "${n_jobs}" "${dumpdir}/${name}/raw"
        ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/raw/preprocessing.JOB.log" \
            parallel-wavegan-preprocess \
                --config "${conf}" \
                --scp "${dumpdir}/${name}/raw/wav.JOB.scp" \
                --dumpdir "${dumpdir}/${name}/raw/dump.JOB" \
                --verbose "${verbose}"
        echo "Successfully finished feature extraction of ${name} set."
    ) &
    pids+=($!)
    done
    i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
    [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs failed." && exit 1;
    echo "Successfully finished feature extraction."

    # calculate statistics for normalization
    echo "Statistics computation start. See the progress via ${dumpdir}/${train_set}/compute_statistics.log."
    ${train_cmd} "${dumpdir}/${train_set}/compute_statistics.log" \
        parallel-wavegan-compute-statistics \
            --config "${conf}" \
            --rootdir "${dumpdir}/${train_set}/raw" \
            --dumpdir "${dumpdir}/${train_set}" \
            --verbose "${verbose}"
    echo "Successfully finished calculation of statistics."

    # normalize and dump them
    pids=()
    for name in "${train_set}" "${dev_set}" "${eval_set}"; do
    (
        [ ! -e "${dumpdir}/${name}/norm" ] && mkdir -p "${dumpdir}/${name}/norm"
        echo "Normalization start. See the progress via ${dumpdir}/${name}/norm/normalize.*.log."
        ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/norm/normalize.JOB.log" \
            parallel-wavegan-normalize \
                --config "${conf}" \
                --stats "${dumpdir}/${train_set}/stats.${stats_ext}" \
                --rootdir "${dumpdir}/${name}/raw/dump.JOB" \
                --dumpdir "${dumpdir}/${name}/norm/dump.JOB" \
                --verbose "${verbose}"
        echo "Successfully finished normalization of ${name} set."
    ) &
    pids+=($!)
    done
    i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
    [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs failed." && exit 1;
    echo "Successfully finished normalization."
fi

if [ -z "${tag}" ]; then
    expdir="exp/${train_set}_jnas_$(basename "${conf}" .yaml)"
else
    expdir="exp/${train_set}_jnas_${tag}"
fi
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    echo "Stage 2: Network training"
    [ ! -e "${expdir}" ] && mkdir -p "${expdir}"
    cp "${dumpdir}/${train_set}/stats.${stats_ext}" "${expdir}"
    if [ "${n_gpus}" -gt 1 ]; then
        train="python -m parallel_wavegan.distributed.launch --nproc_per_node ${n_gpus} -c parallel-wavegan-train"
    else
        train="parallel-wavegan-train"
    fi
    echo "Training start. See the progress via ${expdir}/train.log."
    ${cuda_cmd} --gpu "${n_gpus}" "${expdir}/train.log" \
        ${train} \
            --config "${conf}" \
            --train-dumpdir "${dumpdir}/${train_set}/norm" \
            --dev-dumpdir "${dumpdir}/${dev_set}/norm" \
            --outdir "${expdir}" \
            --resume "${resume}" \
            --verbose "${verbose}"
    echo "Successfully finished training."
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    echo "Stage 3: Network decoding"
    # shellcheck disable=SC2012
    [ -z "${checkpoint}" ] && checkpoint="$(ls -dt "${expdir}"/*.pkl | head -1 || true)"
    outdir="${expdir}/wav/$(basename "${checkpoint}" .pkl)"
    pids=()
    for name in "${dev_set}" "${eval_set}"; do
    (
        [ ! -e "${outdir}/${name}" ] && mkdir -p "${outdir}/${name}"
        [ "${n_gpus}" -gt 1 ] && n_gpus=1
        echo "Decoding start. See the progress via ${outdir}/${name}/decode.log."
        ${cuda_cmd} --gpu "${n_gpus}" "${outdir}/${name}/decode.log" \
            parallel-wavegan-decode \
                --dumpdir "${dumpdir}/${name}/norm" \
                --checkpoint "${checkpoint}" \
                --outdir "${outdir}/${name}" \
                --verbose "${verbose}"
        echo "Successfully finished decoding of ${name} set."
    ) &
    pids+=($!)
    done
    i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
    [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs failed." && exit 1;
    echo "Successfully finished decoding."
fi
echo "Finished."
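Typical invocations of the recipe above, as a rough sketch; the checkpoint path only illustrates the naming pattern used by this run.sh and is not a file shipped in this diff.

    ./run.sh                        # run all stages with conf/parallel_wavegan.v1.yaml
    ./run.sh --stage 2 --n_gpus 2   # reuse extracted features and train on 2 GPUs
    ./run.sh --stage 3 --checkpoint exp/train_nodev_jnas_parallel_wavegan.v1/checkpoint-400000steps.pkl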
ParallelWaveGAN/egs/jnas/voc1/utils
ADDED
@@ -0,0 +1 @@
../../../utils
ParallelWaveGAN/egs/jsss/voc1/cmd.sh
ADDED
@@ -0,0 +1,91 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
#     run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
#   --time <time>: Limit the maximum time to execute.
#   --mem <mem>: Limit the maximum memory usage.
#   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
#   --num-threads <ngpu>: Specify the number of CPU cores.
#   --gpu <ngpu>: Specify the number of GPU devices.
#   --config: Change the configuration file from default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
#
# run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface, not depending on the backend.
# These options are mapped to specific options for each backend and
# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs fail, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
#   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================


# Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
cmd_backend="local"

# Local machine, without any Job scheduling system
if [ "${cmd_backend}" = local ]; then

    # The other usage
    export train_cmd="utils/run.pl"
    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
    export cuda_cmd="utils/run.pl"
    # Used for "*_recog.py"
    export decode_cmd="utils/run.pl"

# Local machine, without any Job scheduling system
elif [ "${cmd_backend}" = stdout ]; then

    # The other usage
    export train_cmd="utils/stdout.pl"
    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
    export cuda_cmd="utils/stdout.pl"
    # Used for "*_recog.py"
    export decode_cmd="utils/stdout.pl"

# "qsub" (SGE, Torque, PBS, etc.)
elif [ "${cmd_backend}" = sge ]; then
    # The default setting is written in conf/queue.conf.
    # You must change "-q g.q" for the "queue" for your environment.
    # To know the "queue" names, type "qhost -q"
    # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.

    export train_cmd="utils/queue.pl"
    export cuda_cmd="utils/queue.pl"
    export decode_cmd="utils/queue.pl"

# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
    # The default setting is written in conf/slurm.conf.
    # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
    # To know the "partition" names, type "sinfo".
    # You can use "--gpu *" by default for slurm and it is interpreted as "--gres gpu:*"
    # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

    export train_cmd="utils/slurm.pl"
    export cuda_cmd="utils/slurm.pl"
    export decode_cmd="utils/slurm.pl"

elif [ "${cmd_backend}" = ssh ]; then
    # You have to create ".queue/machines" to specify the host to execute jobs.
    # e.g. .queue/machines
    #   host1
    #   host2
    #   host3
    # Assuming you can login them without any password, i.e. you have to set ssh keys.

    export train_cmd="utils/ssh.pl"
    export cuda_cmd="utils/ssh.pl"
    export decode_cmd="utils/ssh.pl"

else
    echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
    return 1
fi
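To submit the stages through Slurm instead of running them locally, the only change needed in cmd.sh is the backend selector; this is a sketch, and the partition names must then be adjusted in conf/slurm.conf for your cluster.

    # in cmd.sh
    cmd_backend="slurm"   # was "local"; maps train_cmd/cuda_cmd/decode_cmd to utils/slurm.pl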
ParallelWaveGAN/egs/jsss/voc1/conf/parallel_wavegan.v1.yaml
ADDED
@@ -0,0 +1,122 @@
# This is the hyperparameter configuration file for Parallel WaveGAN.
# Please make sure this is adjusted for the JSSS dataset. If you want to
# apply to the other dataset, you might need to carefully change some parameters.
# This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN.

###########################################################
#                FEATURE EXTRACTION SETTING                #
###########################################################
sampling_rate: 24000     # Sampling rate.
fft_size: 2048           # FFT size.
hop_size: 300            # Hop size.
win_length: 1200         # Window length.
                         # If set to null, it will be the same as fft_size.
window: "hann"           # Window function.
num_mels: 80             # Number of mel basis.
fmin: 80                 # Minimum frequency in mel basis calculation.
fmax: 7600               # Maximum frequency in mel basis calculation.
global_gain_scale: 1.0   # Will be multiplied to all of waveform.
trim_silence: false      # Whether to trim the start and end of silence.
trim_threshold_in_db: 40 # Need to tune carefully if the recording is not good.
trim_frame_size: 2048    # Frame size in trimming.
trim_hop_size: 512       # Hop size in trimming.
format: "hdf5"           # Feature file format. "npy" or "hdf5" is supported.

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING           #
###########################################################
generator_params:
    in_channels: 1        # Number of input channels.
    out_channels: 1       # Number of output channels.
    kernel_size: 3        # Kernel size of dilated convolution.
    layers: 30            # Number of residual block layers.
    stacks: 3             # Number of stacks i.e., dilation cycles.
    residual_channels: 64 # Number of channels in residual conv.
    gate_channels: 128    # Number of channels in gated conv.
    skip_channels: 64     # Number of channels in skip conv.
    aux_channels: 80      # Number of channels for auxiliary feature conv.
                          # Must be the same as num_mels.
    aux_context_window: 2 # Context window size for auxiliary feature.
                          # If set to 2, previous 2 and future 2 frames will be considered.
    dropout: 0.0          # Dropout rate. 0.0 means no dropout applied.
    use_weight_norm: true # Whether to use weight norm.
                          # If set to true, it will be applied to all of the conv layers.
    upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
    upsample_params:                      # Upsampling network parameters.
        upsample_scales: [4, 5, 3, 5]     # Upsampling scales. Product of these must be the same as hop size.

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING         #
###########################################################
discriminator_params:
    in_channels: 1        # Number of input channels.
    out_channels: 1       # Number of output channels.
    kernel_size: 3        # Kernel size of conv layers.
    layers: 10            # Number of conv layers.
    conv_channels: 64     # Number of channels in conv layers.
    bias: true            # Whether to use bias parameter in conv.
    use_weight_norm: true # Whether to use weight norm.
                          # If set to true, it will be applied to all of the conv layers.
    nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
    nonlinear_activation_params:      # Nonlinear function parameters
        negative_slope: 0.2           # Alpha in LeakyReLU.

###########################################################
#                   STFT LOSS SETTING                      #
###########################################################
stft_loss_params:
    fft_sizes: [1024, 2048, 512]  # List of FFT size for STFT-based loss.
    hop_sizes: [120, 240, 50]     # List of hop size for STFT-based loss
    win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
    window: "hann_window"         # Window function for STFT-based loss

###########################################################
#                ADVERSARIAL LOSS SETTING                  #
###########################################################
lambda_adv: 4.0  # Loss balancing coefficient.

###########################################################
#                  DATA LOADER SETTING                     #
###########################################################
batch_size: 6               # Batch size.
batch_max_steps: 25500      # Length of each audio in batch. Make sure it is divisible by hop_size.
pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
num_workers: 2              # Number of workers in Pytorch DataLoader.
remove_short_samples: true  # Whether to remove samples the length of which are less than batch_max_steps.
allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.

###########################################################
#             OPTIMIZER & SCHEDULER SETTING                #
###########################################################
generator_optimizer_params:
    lr: 0.0001             # Generator's learning rate.
    eps: 1.0e-6            # Generator's epsilon.
    weight_decay: 0.0      # Generator's weight decay coefficient.
generator_scheduler_params:
    step_size: 200000      # Generator's scheduler step size.
    gamma: 0.5             # Generator's scheduler gamma.
                           # At each step size, lr will be multiplied by this parameter.
generator_grad_norm: 10    # Generator's gradient norm.
discriminator_optimizer_params:
    lr: 0.00005            # Discriminator's learning rate.
    eps: 1.0e-6            # Discriminator's epsilon.
    weight_decay: 0.0      # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
    step_size: 200000      # Discriminator's scheduler step size.
    gamma: 0.5             # Discriminator's scheduler gamma.
                           # At each step size, lr will be multiplied by this parameter.
discriminator_grad_norm: 1 # Discriminator's gradient norm.

###########################################################
#                    INTERVAL SETTING                      #
###########################################################
discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
train_max_steps: 400000                 # Number of training steps.
save_interval_steps: 5000               # Interval steps to save checkpoint.
eval_interval_steps: 1000               # Interval steps to evaluate the network.
log_interval_steps: 100                 # Interval steps to record the training log.

###########################################################
#                     OTHER SETTING                        #
###########################################################
num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
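Quick arithmetic check on the 24 kHz settings above (shell sketch): the upsampling scales must multiply to hop_size, and batch_max_steps must be divisible by hop_size.

    echo $((4 * 5 * 3 * 5))   # 300 -> equals hop_size
    echo $((25500 % 300))     # 0   -> batch_max_steps is divisible by hop_size (25500 / 24000 ~= 1.06 s per batch item)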
ParallelWaveGAN/egs/jsss/voc1/conf/slurm.conf
ADDED
@@ -0,0 +1,12 @@
# Default configuration
command sbatch --export=PATH --ntasks-per-node=1
option time=* --time $0
option mem=* --mem-per-cpu $0
option mem=0          # Do not add anything to qsub_opts
option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1  # Do not add anything to qsub_opts
default gpu=0
option gpu=0 -p cpu
option gpu=* -p gpu --gres=gpu:$0
# note: the --max-jobs-run option is supported as a special case
# by slurm.pl and you don't have to handle it in the config file.
ParallelWaveGAN/egs/jsss/voc1/local/data_download.sh
ADDED
@@ -0,0 +1,41 @@
#!/bin/bash

# Copyright 2020 Tomoki Hayashi
#  MIT License (https://opensource.org/licenses/MIT)

# Download JSSS Corpus

# shellcheck disable=SC1091
. ./path.sh || exit 1

download_dir=$1

# check arguments
if [ $# != 1 ]; then
    echo "Usage: $0 <download_dir>"
    exit 1
fi

set -euo pipefail

url="https://drive.google.com/a/g.sp.m.is.nagoya-u.ac.jp/uc?id=1NyiZCXkYTdYBNtD1B-IMAYCVa-0SQsKX"
if [ ! -e "${download_dir}/jsss_ver1" ]; then
    utils/download_from_google_drive.sh "${url}" "${download_dir}" zip
    echo "Successfully downloaded JSSS corpus."
else
    echo "Already exists. Skipped."
fi

cwd=$(pwd)
if [ ! -e "${download_dir}/JSSSLabel" ]; then
    echo "Downloading full-context labels for jsss_ver1..."
    cd "${download_dir}"
    git clone https://github.com/kan-bayashi/JSSSLabel
    for name in long-form short-form simplification summarization; do
        cp -vr JSSSLabel/${name} jsss_ver1
    done
    cd "${cwd}"
    echo "Successfully downloaded JSSS label."
else
    echo "Already exists. Skipped."
fi
ParallelWaveGAN/egs/jsss/voc1/local/data_prep.sh
ADDED
@@ -0,0 +1,180 @@
#!/bin/bash

# Copyright 2020 Tomoki Hayashi
#  MIT License (https://opensource.org/licenses/MIT)

# Prepare kaldi-style data directory for JSSS corpus

fs=24000
num_dev=50
num_eval=50
train_set="train_nodev"
dev_set="dev"
eval_set="eval"
shuffle=false

# shellcheck disable=SC1091
. utils/parse_options.sh || exit 1;

db=$1
data_dir_root=$2

# check arguments
if [ $# != 2 ]; then
    echo "Usage: $0 [Options] <db> <data_dir>"
    echo "e.g.: $0 downloads/jsss_ver1 data"
    echo ""
    echo "Options:"
    echo "    --fs: target sampling rate (default=24000)."
    echo "    --num_dev: number of development utterances (default=50)."
    echo "    --num_eval: number of evaluation utterances (default=50)."
    echo "    --train_set: name of train set (default=train_nodev)."
    echo "    --dev_set: name of dev set (default=dev)."
    echo "    --eval_set: name of eval set (default=eval)."
    echo "    --shuffle: whether to perform shuffle in making dev / eval set (default=false)."
    exit 1
fi

set -euo pipefail

######################################
#    process data without segments   #
######################################
dsets_without_segments="
short-form/basic5000
short-form/onomatopee300
short-form/voiceactress100
simplification
"
for dset in ${dsets_without_segments}; do
    # check directory existence
    _data_dir=${data_dir_root}/$(basename "${dset}")
    [ ! -e "${_data_dir}" ] && mkdir -p "${_data_dir}"

    # set filenames
    scp=${_data_dir}/wav.scp
    segments=${_data_dir}/segments

    # check file existence
    [ -e "${scp}" ] && rm "${scp}"
    [ -e "${segments}" ] && rm "${segments}"

    # make wav.scp and segments
    find "${db}/${dset}/wav24kHz16bit" -name "*.wav" | sort | while read -r filename; do
        utt_id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
        lab_filename="${db}/${dset}/lab/$(basename "${filename}" .wav).lab"
        if [ ! -e "${lab_filename}" ]; then
            echo "${lab_filename} does not exist. Skipped."
            continue
        fi
        start_sec=$(head -n 1 "${lab_filename}" | cut -d " " -f 2)
        end_sec=$(tail -n 1 "${lab_filename}" | cut -d " " -f 1)
        echo "${utt_id} ${utt_id} ${start_sec} ${end_sec}" >> "${segments}"
        if [ "${fs}" -eq 24000 ]; then
            # default sampling rate
            echo "${utt_id} ${filename}" >> "${scp}"
        else
            echo "${utt_id} sox ${filename} -t wav -r $fs - |" >> "${scp}"
        fi
    done
    echo "Successfully prepared ${dset}."
done

######################################
#     process data with segments     #
######################################
dsets_with_segments="
long-form/katsura-masakazu
long-form/udon
long-form/washington-dc
summarization
"
for dset in ${dsets_with_segments}; do
    # check directory existence
    _data_dir=${data_dir_root}/$(basename "${dset}")
    [ ! -e "${_data_dir}" ] && mkdir -p "${_data_dir}"

    # set filenames
    scp=${_data_dir}/wav.scp
    segments=${_data_dir}/segments

    # check file existence
    [ -e "${scp}" ] && rm "${scp}"
    [ -e "${segments}" ] && rm "${segments}"

    # make wav.scp
    find "${db}/${dset}/wav24kHz16bit" -name "*.wav" | sort | while read -r filename; do
        wav_id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
        if [ "${fs}" -eq 24000 ]; then
            # default sampling rate
            echo "${wav_id} ${filename}" >> "${scp}"
        else
            echo "${wav_id} sox ${filename} -t wav -r $fs - |" >> "${scp}"
        fi
    done

    # make segments
    find "${db}/${dset}/transcript_utf8" -name "*.txt" | sort | while read -r filename; do
        wav_id=$(basename "${filename}" .txt)
        while read -r line; do
            start_sec=$(echo "${line}" | cut -f 1)
            end_sec=$(echo "${line}" | cut -f 2)
            utt_id=${wav_id}
            utt_id+="_$(printf %010d "$(echo "${start_sec}" | tr -d "." | sed -e "s/^[0]*//g")")"
            utt_id+="_$(printf %010d "$(echo "${end_sec}" | tr -d "." | sed -e "s/^[0]*//g")")"

            # modify segment information with force alignment results
            lab_filename=${db}/${dset}/lab/${utt_id}.lab
            if [ ! -e "${lab_filename}" ]; then
                echo "${lab_filename} does not exist. Skipped."
                continue
            fi
            start_sec_offset=$(head -n 1 "${lab_filename}" | cut -d " " -f 2)
            end_sec_offset=$(tail -n 1 "${lab_filename}" | cut -d " " -f 1)
            start_sec=$(python -c "print(${start_sec} + ${start_sec_offset})")
            end_sec=$(python -c "print(${start_sec} + ${end_sec_offset} - ${start_sec_offset})")
            echo "${utt_id} ${wav_id} ${start_sec} ${end_sec}" >> "${segments}"
        done < "${filename}"
    done

    # fix
    echo "Successfully prepared ${dset}."
done

######################################
#      combine and split data        #
######################################
# combine all data
combined_data_dirs=""
for dset in ${dsets_without_segments} ${dsets_with_segments}; do
    combined_data_dirs+="${data_dir_root}/$(basename "${dset}") "
done
# shellcheck disable=SC2086
utils/combine_data.sh "${data_dir_root}/all" ${combined_data_dirs}
# shellcheck disable=SC2086
rm -rf ${combined_data_dirs}

# split
num_all=$(wc -l < "${data_dir_root}/all/segments")
num_deveval=$((num_dev + num_eval))
num_train=$((num_all - num_deveval))
utils/split_data.sh \
    --num_first "${num_deveval}" \
    --num_second "${num_train}" \
    --shuffle "${shuffle}" \
    "${data_dir_root}/all" \
    "${data_dir_root}/deveval" \
    "${data_dir_root}/${train_set}"
utils/split_data.sh \
    --num_first "${num_eval}" \
    --num_second "${num_dev}" \
    --shuffle "${shuffle}" \
    "${data_dir_root}/deveval" \
    "${data_dir_root}/${eval_set}" \
    "${data_dir_root}/${dev_set}"

# remove tmp directories
rm -rf "${data_dir_root}/all"
rm -rf "${data_dir_root}/deveval"

echo "Successfully prepared data."
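The segments file written above uses the Kaldi format "<utt_id> <wav_id> <start_sec> <end_sec>"; for the long-form and summarization sets the utterance id embeds zero-padded start/end times. A hypothetical line (the ids and times below are made up for illustration):

    # <utt_id>                        <wav_id>  <start> <end>
    example01_0000012345_0000067890 example01 1.2345 6.7890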
ParallelWaveGAN/egs/jsss/voc1/path.sh
ADDED
@@ -0,0 +1,33 @@
# cuda related
export CUDA_HOME=/usr/local/cuda-10.0
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"

# path related
export PRJ_ROOT="${PWD}/../../.."
if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
    # shellcheck disable=SC1090
    . "${PRJ_ROOT}/tools/venv/bin/activate"
fi

# python related
export OMP_NUM_THREADS=1
export PYTHONIOENCODING=UTF-8
export MPL_BACKEND=Agg

# check installation
if ! command -v parallel-wavegan-train > /dev/null; then
    echo "Error: It seems setup is not finished." >&2
    echo "Error: Please setup your environment by following README.md" >&2
    return 1
fi
if ! command -v jq > /dev/null; then
    echo "Error: It seems jq is not installed." >&2
    echo "Error: Please install via \`sudo apt-get install jq\`." >&2
    echo "Error: If you do not have sudo, please download from https://stedolan.github.io/jq/download/." >&2
    return 1
fi
if ! command -v yq > /dev/null; then
    echo "Error: It seems yq is not installed." >&2
    echo "Error: Please install via \`pip install yq\`." >&2
    return 1
fi
ParallelWaveGAN/egs/jsss/voc1/run.sh
ADDED
@@ -0,0 +1,186 @@
1 |
+
#!/bin/bash
|
2 |
+
|
3 |
+
# Copyright 2020 Tomoki Hayashi
|
4 |
+
# MIT License (https://opensource.org/licenses/MIT)
|
5 |
+
|
6 |
+
. ./cmd.sh || exit 1;
|
7 |
+
. ./path.sh || exit 1;
|
8 |
+
|
9 |
+
# basic settings
|
10 |
+
stage=-1 # stage to start
|
11 |
+
stop_stage=100 # stage to stop
|
12 |
+
verbose=1 # verbosity level (lower is less info)
|
13 |
+
n_gpus=1 # number of gpus in training
|
14 |
+
n_jobs=4 # number of parallel jobs in feature extraction
|
15 |
+
|
16 |
+
# NOTE(kan-bayashi): renamed to conf to avoid conflict in parse_options.sh
|
17 |
+
conf=conf/parallel_wavegan.v1.yaml
|
18 |
+
|
19 |
+
# directory path setting
|
20 |
+
download_dir=downloads # direcotry to save downloaded files
|
21 |
+
dumpdir=dump # directory to dump features
|
22 |
+
|
23 |
+
# subset setting
|
24 |
+
shuffle=false # whether to shuffle the data to create subset
|
25 |
+
num_dev=50 # the number of development data
|
26 |
+
num_eval=50 # the number of evaluation data
|
27 |
+
# (if set to 0, the same dev set is used as eval set)
|
28 |
+
|
29 |
+
# training related setting
|
30 |
+
tag="" # tag for directory to save model
|
31 |
+
resume="" # checkpoint path to resume training
|
32 |
+
# (e.g. <path>/<to>/checkpoint-10000steps.pkl)
|
33 |
+
pretrain="" # checkpoint path to load pretrained parameters
|
34 |
+
# (e.g. ../../jsut/<path>/<to>/checkpoint-400000steps.pkl)
|
35 |
+
|
36 |
+
# decoding related setting
|
37 |
+
checkpoint="" # checkpoint path to be used for decoding
|
38 |
+
# if not provided, the latest one will be used
|
39 |
+
# (e.g. <path>/<to>/checkpoint-400000steps.pkl)
|
40 |
+
|
41 |
+
# shellcheck disable=SC1091
|
42 |
+
. utils/parse_options.sh || exit 1;
|
43 |
+
|
44 |
+
train_set="train_nodev" # name of training data directory
|
45 |
+
dev_set="dev" # name of development data direcotry
|
46 |
+
eval_set="eval" # name of evaluation data direcotry
|
47 |
+
|
48 |
+
set -euo pipefail
|
49 |
+
|
50 |
+
if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
|
51 |
+
echo "Stage -1: Data download"
|
52 |
+
local/data_download.sh "${download_dir}"
|
53 |
+
fi
|
54 |
+
|
55 |
+
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
|
56 |
+
echo "Stage 0: Data preparation"
|
57 |
+
local/data_prep.sh \
|
58 |
+
--fs "$(yq ".sampling_rate" "${conf}")" \
|
59 |
+
--num_dev "${num_dev}" \
|
60 |
+
--num_eval "${num_eval}" \
|
61 |
+
--train_set "${train_set}" \
|
62 |
+
--dev_set "${dev_set}" \
|
63 |
+
--eval_set "${eval_set}" \
|
64 |
+
--shuffle "${shuffle}" \
|
65 |
+
"${download_dir}/jsss_ver1" data
|
66 |
+
fi
|
67 |
+
|
68 |
+
stats_ext=$(grep -q "hdf5" <(yq ".format" "${conf}") && echo "h5" || echo "npy")
|
69 |
+
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
|
70 |
+
echo "Stage 1: Feature extraction"
|
71 |
+
# extract raw features
|
72 |
+
pids=()
|
73 |
+
for name in "${train_set}" "${dev_set}" "${eval_set}"; do
|
74 |
+
(
|
75 |
+
[ ! -e "${dumpdir}/${name}/raw" ] && mkdir -p "${dumpdir}/${name}/raw"
|
76 |
+
echo "Feature extraction start. See the progress via ${dumpdir}/${name}/raw/preprocessing.*.log."
|
77 |
+
        utils/make_subset_data.sh "data/${name}" "${n_jobs}" "${dumpdir}/${name}/raw"
        ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/raw/preprocessing.JOB.log" \
            parallel-wavegan-preprocess \
                --config "${conf}" \
                --scp "${dumpdir}/${name}/raw/wav.JOB.scp" \
                --dumpdir "${dumpdir}/${name}/raw/dump.JOB" \
                --verbose "${verbose}"
        echo "Successfully finished feature extraction of ${name} set."
    ) &
    pids+=($!)
    done
    i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
    [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
    echo "Successfully finished feature extraction."

    # calculate statistics for normalization
    if [ -z "${pretrain}" ]; then
        # calculate statistics for normalization
        echo "Statistics computation start. See the progress via ${dumpdir}/${train_set}/compute_statistics.log."
        ${train_cmd} "${dumpdir}/${train_set}/compute_statistics.log" \
            parallel-wavegan-compute-statistics \
                --config "${conf}" \
                --rootdir "${dumpdir}/${train_set}/raw" \
                --dumpdir "${dumpdir}/${train_set}" \
                --verbose "${verbose}"
        echo "Successfully finished calculation of statistics."
    else
        echo "Use statistics of pretrained model. Skip statistics computation."
        cp "$(dirname "${pretrain}")/stats.${stats_ext}" "${dumpdir}/${train_set}"
    fi

    # normalize and dump them
    pids=()
    for name in "${train_set}" "${dev_set}" "${eval_set}"; do
    (
        [ ! -e "${dumpdir}/${name}/norm" ] && mkdir -p "${dumpdir}/${name}/norm"
        echo "Normalization start. See the progress via ${dumpdir}/${name}/norm/normalize.*.log."
        ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/norm/normalize.JOB.log" \
            parallel-wavegan-normalize \
                --config "${conf}" \
                --stats "${dumpdir}/${train_set}/stats.${stats_ext}" \
                --rootdir "${dumpdir}/${name}/raw/dump.JOB" \
                --dumpdir "${dumpdir}/${name}/norm/dump.JOB" \
                --verbose "${verbose}"
        echo "Successfully finished normalization of ${name} set."
    ) &
    pids+=($!)
    done
    i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
    [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
    echo "Successfully finished normalization."
fi

if [ -z "${tag}" ]; then
    expdir="exp/${train_set}_jsss_$(basename "${conf}" .yaml)"
    if [ -n "${pretrain}" ]; then
        pretrain_tag=$(basename "$(dirname "${pretrain}")")
        expdir+="_${pretrain_tag}"
    fi
else
    expdir="exp/${train_set}_jsss_${tag}"
fi
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    echo "Stage 2: Network training"
    [ ! -e "${expdir}" ] && mkdir -p "${expdir}"
    cp "${dumpdir}/${train_set}/stats.${stats_ext}" "${expdir}"
    if [ "${n_gpus}" -gt 1 ]; then
        train="python -m parallel_wavegan.distributed.launch --nproc_per_node ${n_gpus} -c parallel-wavegan-train"
    else
        train="parallel-wavegan-train"
    fi
    echo "Training start. See the progress via ${expdir}/train.log."
    ${cuda_cmd} --gpu "${n_gpus}" "${expdir}/train.log" \
        ${train} \
            --config "${conf}" \
            --train-dumpdir "${dumpdir}/${train_set}/norm" \
            --dev-dumpdir "${dumpdir}/${dev_set}/norm" \
            --outdir "${expdir}" \
            --resume "${resume}" \
            --pretrain "${pretrain}" \
            --verbose "${verbose}"
    echo "Successfully finished training."
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    echo "Stage 3: Network decoding"
    # shellcheck disable=SC2012
    [ -z "${checkpoint}" ] && checkpoint="$(ls -dt "${expdir}"/*.pkl | head -1 || true)"
    outdir="${expdir}/wav/$(basename "${checkpoint}" .pkl)"
    pids=()
    for name in "${dev_set}" "${eval_set}"; do
    (
        [ ! -e "${outdir}/${name}" ] && mkdir -p "${outdir}/${name}"
        [ "${n_gpus}" -gt 1 ] && n_gpus=1
        echo "Decoding start. See the progress via ${outdir}/${name}/decode.log."
        ${cuda_cmd} --gpu "${n_gpus}" "${outdir}/${name}/decode.log" \
            parallel-wavegan-decode \
                --dumpdir "${dumpdir}/${name}/norm" \
                --checkpoint "${checkpoint}" \
                --outdir "${outdir}/${name}" \
                --verbose "${verbose}"
        echo "Successfully finished decoding of ${name} set."
    ) &
    pids+=($!)
    done
    i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
    [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
    echo "Successfully finished decoding."
fi
echo "Finished."
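The stages above follow the usual pattern of these recipes: stage 1 extracts, statistics-normalizes, and dumps features, stage 2 trains the network, and stage 3 decodes the dev and eval sets. A minimal usage sketch, assuming the stage and option variables defined near the top of this run.sh (exposed as command-line flags by utils/parse_options.sh):

    # feature extraction through decoding on a single GPU
    # (flag names assumed from the option block of run.sh)
    ./run.sh --stage 1 --stop_stage 3 --n_gpus 1 --conf conf/parallel_wavegan.v1.yaml

    # resume stage 2 training from an existing checkpoint (path is illustrative only)
    ./run.sh --stage 2 --stop_stage 2 --resume exp/<expdir>/checkpoint-100000steps.pkl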
ParallelWaveGAN/egs/jsss/voc1/utils
ADDED
@@ -0,0 +1 @@
../../../utils
ParallelWaveGAN/egs/jsut/voc1/cmd.sh
ADDED
@@ -0,0 +1,91 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
#     run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
#     --time <time>: Limit the maximum time to execute.
#     --mem <mem>: Limit the maximum memory usage.
#     --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
#     --num-threads <ngpu>: Specify the number of CPU cores.
#     --gpu <ngpu>: Specify the number of GPU devices.
#     --config: Change the configuration file from default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
#
# run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface, not depending on the backend.
# These options are mapped to specific options for each backend and
# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs failed, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
#     "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================


# Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
cmd_backend="local"

# Local machine, without any Job scheduling system
if [ "${cmd_backend}" = local ]; then

    # The other usage
    export train_cmd="utils/run.pl"
    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
    export cuda_cmd="utils/run.pl"
    # Used for "*_recog.py"
    export decode_cmd="utils/run.pl"

# Local machine, without any Job scheduling system
elif [ "${cmd_backend}" = stdout ]; then

    # The other usage
    export train_cmd="utils/stdout.pl"
    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
    export cuda_cmd="utils/stdout.pl"
    # Used for "*_recog.py"
    export decode_cmd="utils/stdout.pl"

# "qsub" (SGE, Torque, PBS, etc.)
elif [ "${cmd_backend}" = sge ]; then
    # The default setting is written in conf/queue.conf.
    # You must change "-q g.q" for the "queue" for your environment.
    # To know the "queue" names, type "qhost -q"
    # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.

    export train_cmd="utils/queue.pl"
    export cuda_cmd="utils/queue.pl"
    export decode_cmd="utils/queue.pl"

# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
    # The default setting is written in conf/slurm.conf.
    # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
    # To know the "partition" names, type "sinfo".
    # You can use "--gpu *" by default for slurm and it is interpreted as "--gres gpu:*"
    # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

    export train_cmd="utils/slurm.pl"
    export cuda_cmd="utils/slurm.pl"
    export decode_cmd="utils/slurm.pl"

elif [ "${cmd_backend}" = ssh ]; then
    # You have to create ".queue/machines" to specify the host to execute jobs.
    # e.g. .queue/machines
    #     host1
    #     host2
    #     host3
    # Assuming you can login them without any password, i.e. You have to set ssh keys.

    export train_cmd="utils/ssh.pl"
    export cuda_cmd="utils/ssh.pl"
    export decode_cmd="utils/ssh.pl"

else
    echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
    return 1
fi
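run.sh never calls the schedulers directly; it prepends ${train_cmd} or ${cuda_cmd} to every command, so switching between a workstation and a cluster only requires editing cmd_backend above. A sketch of the unified <cmd>.pl interface as it is used by these recipes (paths follow the dump layout of run.sh and are illustrative only):

    # run 4 array jobs locally, logging to preprocessing.1.log ... preprocessing.4.log
    utils/run.pl JOB=1:4 dump/train_nodev/raw/preprocessing.JOB.log \
        parallel-wavegan-preprocess \
            --config conf/parallel_wavegan.v1.yaml \
            --scp dump/train_nodev/raw/wav.JOB.scp \
            --dumpdir dump/train_nodev/raw/dump.JOB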
ParallelWaveGAN/egs/jsut/voc1/conf/hifigan.v1.yaml
ADDED
@@ -0,0 +1,180 @@
# This is the configuration file for JSUT dataset.
# This configuration is based on HiFiGAN V1, which is
# an official configuration. But I found that the optimizer
# setting does not work well with my implementation.
# So I changed optimizer settings as follows:
# - AdamW -> Adam
# - betas: [0.8, 0.99] -> betas: [0.5, 0.9]
# - Scheduler: ExponentialLR -> MultiStepLR
# To match the shift size difference, the upsample scales
# are also modified from the original 256 shift setting.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
sampling_rate: 24000     # Sampling rate.
fft_size: 2048           # FFT size.
hop_size: 300            # Hop size.
win_length: 1200         # Window length.
                         # If set to null, it will be the same as fft_size.
window: "hann"           # Window function.
num_mels: 80             # Number of mel basis.
fmin: 80                 # Minimum freq in mel basis calculation.
fmax: 7600               # Maximum frequency in mel basis calculation.
global_gain_scale: 1.0   # Will be multiplied to all of waveform.
trim_silence: false      # Whether to trim the start and end of silence.
trim_threshold_in_db: 20 # Need to tune carefully if the recording is not good.
trim_frame_size: 1024    # Frame size in trimming.
trim_hop_size: 256       # Hop size in trimming.
format: "hdf5"           # Feature file format. "npy" or "hdf5" is supported.

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_type: HiFiGANGenerator
generator_params:
    in_channels: 80                       # Number of input channels.
    out_channels: 1                       # Number of output channels.
    channels: 512                         # Number of initial channels.
    kernel_size: 7                        # Kernel size of initial and final conv layers.
    upsample_scales: [5, 5, 4, 3]         # Upsampling scales.
    upsample_kernel_sizes: [10, 10, 8, 6] # Kernel size for upsampling layers.
    resblock_kernel_sizes: [3, 7, 11]     # Kernel size for residual blocks.
    resblock_dilations:                   # Dilations for residual blocks.
        - [1, 3, 5]
        - [1, 3, 5]
        - [1, 3, 5]
    use_additional_convs: true            # Whether to use additional conv layer in residual blocks.
    bias: true                            # Whether to use bias parameter in conv.
    nonlinear_activation: "LeakyReLU"     # Nonlinear activation type.
    nonlinear_activation_params:          # Nonlinear activation parameters.
        negative_slope: 0.1
    use_weight_norm: true                 # Whether to apply weight normalization.

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
discriminator_params:
    scales: 3                              # Number of multi-scale discriminator.
    scale_downsample_pooling: "AvgPool1d"  # Pooling operation for scale discriminator.
    scale_downsample_pooling_params:
        kernel_size: 4                     # Pooling kernel size.
        stride: 2                          # Pooling stride.
        padding: 2                         # Padding size.
    scale_discriminator_params:
        in_channels: 1                     # Number of input channels.
        out_channels: 1                    # Number of output channels.
        kernel_sizes: [15, 41, 5, 3]       # List of kernel sizes.
        channels: 128                      # Initial number of channels.
        max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.
        max_groups: 16                     # Maximum number of groups in downsampling conv layers.
        bias: true
        downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
        nonlinear_activation: "LeakyReLU"  # Nonlinear activation.
        nonlinear_activation_params:
            negative_slope: 0.1
    follow_official_norm: true             # Whether to follow the official norm setting.
    periods: [2, 3, 5, 7, 11]              # List of period for multi-period discriminator.
    period_discriminator_params:
        in_channels: 1                     # Number of input channels.
        out_channels: 1                    # Number of output channels.
        kernel_sizes: [5, 3]               # List of kernel sizes.
        channels: 32                       # Initial number of channels.
        downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
        max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.
        bias: true                         # Whether to use bias parameter in conv layer.
        nonlinear_activation: "LeakyReLU"  # Nonlinear activation.
        nonlinear_activation_params:       # Nonlinear activation parameters.
            negative_slope: 0.1
        use_weight_norm: true              # Whether to apply weight normalization.
        use_spectral_norm: false           # Whether to apply spectral normalization.

###########################################################
#                    STFT LOSS SETTING                    #
###########################################################
use_stft_loss: false                 # Whether to use multi-resolution STFT loss.
use_mel_loss: true                   # Whether to use Mel-spectrogram loss.
mel_loss_params:
    fs: 24000
    fft_size: 2048
    hop_size: 300
    win_length: 1200
    window: "hann"
    num_mels: 80
    fmin: 0
    fmax: 12000
    log_base: null
generator_adv_loss_params:
    average_by_discriminators: false # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
    average_by_discriminators: false # Whether to average loss by #discriminators.
use_feat_match_loss: true
feat_match_loss_params:
    average_by_discriminators: false # Whether to average loss by #discriminators.
    average_by_layers: false         # Whether to average loss by #layers in each discriminator.
    include_final_outputs: false     # Whether to include final outputs in feat match loss calculation.

###########################################################
#                 ADVERSARIAL LOSS SETTING                #
###########################################################
lambda_aux: 45.0       # Loss balancing coefficient for STFT loss.
lambda_adv: 1.0        # Loss balancing coefficient for adversarial loss.
lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.

###########################################################
#                   DATA LOADER SETTING                   #
###########################################################
batch_size: 16              # Batch size.
batch_max_steps: 8400       # Length of each audio in batch. Make sure dividable by hop_size.
pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
num_workers: 2              # Number of workers in Pytorch DataLoader.
remove_short_samples: false # Whether to remove samples the length of which are less than batch_max_steps.
allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.

###########################################################
#             OPTIMIZER & SCHEDULER SETTING               #
###########################################################
generator_optimizer_type: Adam
generator_optimizer_params:
    lr: 2.0e-4
    betas: [0.5, 0.9]
    weight_decay: 0.0
generator_scheduler_type: MultiStepLR
generator_scheduler_params:
    gamma: 0.5
    milestones:
        - 200000
        - 400000
        - 600000
        - 800000
generator_grad_norm: -1
discriminator_optimizer_type: Adam
discriminator_optimizer_params:
    lr: 2.0e-4
    betas: [0.5, 0.9]
    weight_decay: 0.0
discriminator_scheduler_type: MultiStepLR
discriminator_scheduler_params:
    gamma: 0.5
    milestones:
        - 200000
        - 400000
        - 600000
        - 800000
discriminator_grad_norm: -1

###########################################################
#                     INTERVAL SETTING                    #
###########################################################
generator_train_start_steps: 1     # Number of steps to start to train generator.
discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
train_max_steps: 2500000           # Number of training steps.
save_interval_steps: 10000         # Interval steps to save checkpoint.
eval_interval_steps: 1000          # Interval steps to evaluate the network.
log_interval_steps: 100            # Interval steps to record the training log.

###########################################################
#                       OTHER SETTING                     #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
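One consistency constraint to keep in mind when editing this file: the generator turns one mel frame into hop_size waveform samples, so the product of upsample_scales must equal hop_size. A quick sanity check against the values above (the official 256-shift HiFi-GAN V1 uses [8, 8, 2, 2] instead, as the header comment alludes to):

    echo $(( 5 * 5 * 4 * 3 ))   # -> 300, equals hop_size: 300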
ParallelWaveGAN/egs/jsut/voc1/conf/multi_band_melgan.v2.yaml
ADDED
@@ -0,0 +1,150 @@
# This is the hyperparameter configuration file for MelGAN.
# Please make sure this is adjusted for the JSUT dataset. If you want to
# apply to the other dataset, you might need to carefully change some parameters.
# This configuration requires ~ 8GB memory and will finish within 4 days on Titan V.

# This configuration is based on full-band MelGAN but the hop size and sampling
# rate is different from the paper (16kHz vs 24kHz). The number of iterations
# is not shown in the paper so currently we train 1M iterations (not sure enough
# to converge). The optimizer setting is based on @dathudeptrai's advice.
# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
sampling_rate: 24000     # Sampling rate.
fft_size: 2048           # FFT size.
hop_size: 300            # Hop size.
win_length: 1200         # Window length.
                         # If set to null, it will be the same as fft_size.
window: "hann"           # Window function.
num_mels: 80             # Number of mel basis.
fmin: 80                 # Minimum freq in mel basis calculation.
fmax: 7600               # Maximum frequency in mel basis calculation.
global_gain_scale: 1.0   # Will be multiplied to all of waveform.
trim_silence: false      # Whether to trim the start and end of silence.
trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
trim_frame_size: 2048    # Frame size in trimming.
trim_hop_size: 512       # Hop size in trimming.
format: "hdf5"           # Feature file format. "npy" or "hdf5" is supported.

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_type: "MelGANGenerator" # Generator type.
generator_params:
    in_channels: 80            # Number of input channels.
    out_channels: 4            # Number of output channels.
    kernel_size: 7             # Kernel size of initial and final conv layers.
    channels: 384              # Initial number of channels for conv layers.
    upsample_scales: [5, 5, 3] # List of Upsampling scales.
    stack_kernel_size: 3       # Kernel size of dilated conv layers in residual stack.
    stacks: 4                  # Number of stacks in a single residual stack module.
    use_weight_norm: True      # Whether to use weight normalization.
    use_causal_conv: False     # Whether to use causal convolution.

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_type: "MelGANMultiScaleDiscriminator" # Discriminator type.
discriminator_params:
    in_channels: 1                    # Number of input channels.
    out_channels: 1                   # Number of output channels.
    scales: 3                         # Number of multi-scales.
    downsample_pooling: "AvgPool1d"   # Pooling type for the input downsampling.
    downsample_pooling_params:        # Parameters of the above pooling function.
        kernel_size: 4
        stride: 2
        padding: 1
        count_include_pad: False
    kernel_sizes: [5, 3]              # List of kernel size.
    channels: 16                      # Number of channels of the initial conv layer.
    max_downsample_channels: 512      # Maximum number of channels of downsampling layers.
    downsample_scales: [4, 4, 4]      # List of downsampling scales.
    nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
    nonlinear_activation_params:      # Parameters of nonlinear activation function.
        negative_slope: 0.2
    use_weight_norm: True             # Whether to use weight norm.

###########################################################
#                    STFT LOSS SETTING                    #
###########################################################
stft_loss_params:
    fft_sizes: [1024, 2048, 512]  # List of FFT size for STFT-based loss.
    hop_sizes: [120, 240, 50]     # List of hop size for STFT-based loss
    win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
    window: "hann_window"         # Window function for STFT-based loss
use_subband_stft_loss: true
subband_stft_loss_params:
    fft_sizes: [384, 683, 171]    # List of FFT size for STFT-based loss.
    hop_sizes: [30, 60, 10]       # List of hop size for STFT-based loss
    win_lengths: [150, 300, 60]   # List of window length for STFT-based loss.
    window: "hann_window"         # Window function for STFT-based loss

###########################################################
#                 ADVERSARIAL LOSS SETTING                #
###########################################################
use_feat_match_loss: false # Whether to use feature matching loss.
lambda_adv: 2.5            # Loss balancing coefficient for adversarial loss.

###########################################################
#                   DATA LOADER SETTING                   #
###########################################################
batch_size: 64             # Batch size.
batch_max_steps: 16200     # Length of each audio in batch. Make sure dividable by hop_size.
pin_memory: true           # Whether to pin memory in Pytorch DataLoader.
num_workers: 4             # Number of workers in Pytorch DataLoader.
remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
allow_cache: true          # Whether to allow cache in dataset. If true, it requires cpu memory.

###########################################################
#             OPTIMIZER & SCHEDULER SETTING               #
###########################################################
generator_optimizer_type: "Adam" # Generator's optimizer type.
generator_optimizer_params:
    lr: 1.0e-3                   # Generator's learning rate.
    eps: 1.0e-7                  # Generator's epsilon.
    weight_decay: 0.0            # Generator's weight decay coefficient.
    amsgrad: true
generator_grad_norm: -1          # Generator's gradient norm.
generator_scheduler_type: "MultiStepLR" # Generator's scheduler type.
generator_scheduler_params:
    gamma: 0.5                   # Generator's scheduler gamma.
    milestones:                  # At each milestone, lr will be multiplied by gamma.
        - 100000
        - 200000
        - 300000
        - 400000
        - 500000
        - 600000
discriminator_optimizer_type: "Adam" # Discriminator's optimizer type.
discriminator_optimizer_params:
    lr: 1.0e-3                   # Discriminator's learning rate.
    eps: 1.0e-7                  # Discriminator's epsilon.
    weight_decay: 0.0            # Discriminator's weight decay coefficient.
    amsgrad: true
discriminator_grad_norm: -1      # Discriminator's gradient norm.
discriminator_scheduler_type: "MultiStepLR" # Discriminator's scheduler type.
discriminator_scheduler_params:
    gamma: 0.5                   # Discriminator's scheduler gamma.
    milestones:                  # At each milestone, lr will be multiplied by gamma.
        - 100000
        - 200000
        - 300000
        - 400000
        - 500000
        - 600000

###########################################################
#                     INTERVAL SETTING                    #
###########################################################
discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator.
train_max_steps: 1000000                # Number of training steps.
save_interval_steps: 50000              # Interval steps to save checkpoint.
eval_interval_steps: 1000               # Interval steps to evaluate the network.
log_interval_steps: 1000                # Interval steps to record the training log.

###########################################################
#                       OTHER SETTING                     #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
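In this multi-band setup the generator emits out_channels: 4 subband signals, each upsampled by the product of upsample_scales, and PQMF synthesis of the 4 bands restores the full-band waveform, so the effective upsampling still has to reach hop_size. A quick check against the values above (the subband interpretation is an editorial reading of the config, not stated in the file itself):

    echo $(( 5 * 5 * 3 ))       # -> 75 samples per frame in each subband
    echo $(( 5 * 5 * 3 * 4 ))   # -> 300, equals hop_size: 300 after 4-band PQMF synthesis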
ParallelWaveGAN/egs/jsut/voc1/conf/parallel_wavegan.v1.yaml
ADDED
@@ -0,0 +1,122 @@
# This is the hyperparameter configuration file for Parallel WaveGAN.
# Please make sure this is adjusted for the JSUT dataset. If you want to
# apply to the other dataset, you might need to carefully change some parameters.
# This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
sampling_rate: 24000     # Sampling rate.
fft_size: 2048           # FFT size.
hop_size: 300            # Hop size.
win_length: 1200         # Window length.
                         # If set to null, it will be the same as fft_size.
window: "hann"           # Window function.
num_mels: 80             # Number of mel basis.
fmin: 80                 # Minimum freq in mel basis calculation.
fmax: 7600               # Maximum frequency in mel basis calculation.
global_gain_scale: 1.0   # Will be multiplied to all of waveform.
trim_silence: false      # Whether to trim the start and end of silence.
trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
trim_frame_size: 2048    # Frame size in trimming.
trim_hop_size: 512       # Hop size in trimming.
format: "hdf5"           # Feature file format. "npy" or "hdf5" is supported.

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_params:
    in_channels: 1        # Number of input channels.
    out_channels: 1       # Number of output channels.
    kernel_size: 3        # Kernel size of dilated convolution.
    layers: 30            # Number of residual block layers.
    stacks: 3             # Number of stacks i.e., dilation cycles.
    residual_channels: 64 # Number of channels in residual conv.
    gate_channels: 128    # Number of channels in gated conv.
    skip_channels: 64     # Number of channels in skip conv.
    aux_channels: 80      # Number of channels for auxiliary feature conv.
                          # Must be the same as num_mels.
    aux_context_window: 2 # Context window size for auxiliary feature.
                          # If set to 2, previous 2 and future 2 frames will be considered.
    dropout: 0.0          # Dropout rate. 0.0 means no dropout applied.
    use_weight_norm: true # Whether to use weight norm.
                          # If set to true, it will be applied to all of the conv layers.
    upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
    upsample_params:                      # Upsampling network parameters.
        upsample_scales: [4, 5, 3, 5]     # Upsampling scales. Product of these must be the same as hop size.

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_params:
    in_channels: 1        # Number of input channels.
    out_channels: 1       # Number of output channels.
    kernel_size: 3        # Kernel size of conv layers.
    layers: 10            # Number of conv layers.
    conv_channels: 64     # Number of channels in conv layers.
    bias: true            # Whether to use bias parameter in conv.
    use_weight_norm: true # Whether to use weight norm.
                          # If set to true, it will be applied to all of the conv layers.
    nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
    nonlinear_activation_params:      # Nonlinear function parameters
        negative_slope: 0.2           # Alpha in LeakyReLU.

###########################################################
#                    STFT LOSS SETTING                    #
###########################################################
stft_loss_params:
    fft_sizes: [1024, 2048, 512]  # List of FFT size for STFT-based loss.
    hop_sizes: [120, 240, 50]     # List of hop size for STFT-based loss
    win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
    window: "hann_window"         # Window function for STFT-based loss

###########################################################
#                 ADVERSARIAL LOSS SETTING                #
###########################################################
lambda_adv: 4.0 # Loss balancing coefficient.

###########################################################
#                   DATA LOADER SETTING                   #
###########################################################
batch_size: 6              # Batch size.
batch_max_steps: 25500     # Length of each audio in batch. Make sure dividable by hop_size.
pin_memory: true           # Whether to pin memory in Pytorch DataLoader.
num_workers: 2             # Number of workers in Pytorch DataLoader.
remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
allow_cache: true          # Whether to allow cache in dataset. If true, it requires cpu memory.

###########################################################
#             OPTIMIZER & SCHEDULER SETTING               #
###########################################################
generator_optimizer_params:
    lr: 0.0001             # Generator's learning rate.
    eps: 1.0e-6            # Generator's epsilon.
    weight_decay: 0.0      # Generator's weight decay coefficient.
generator_scheduler_params:
    step_size: 200000      # Generator's scheduler step size.
    gamma: 0.5             # Generator's scheduler gamma.
                           # At each step size, lr will be multiplied by this parameter.
generator_grad_norm: 10    # Generator's gradient norm.
discriminator_optimizer_params:
    lr: 0.00005            # Discriminator's learning rate.
    eps: 1.0e-6            # Discriminator's epsilon.
    weight_decay: 0.0      # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
    step_size: 200000      # Discriminator's scheduler step size.
    gamma: 0.5             # Discriminator's scheduler gamma.
                           # At each step size, lr will be multiplied by this parameter.
discriminator_grad_norm: 1 # Discriminator's gradient norm.

###########################################################
#                     INTERVAL SETTING                    #
###########################################################
discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
train_max_steps: 400000                 # Number of training steps.
save_interval_steps: 5000               # Interval steps to save checkpoint.
eval_interval_steps: 1000               # Interval steps to evaluate the network.
log_interval_steps: 100                 # Interval steps to record the training log.

###########################################################
#                       OTHER SETTING                     #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
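As the data loader comment says, batch_max_steps must be dividable by hop_size so that every training chunk covers a whole number of mel frames. A quick check with the values above:

    echo $(( 25500 % 300 ))   # -> 0, so the constraint holds
    echo $(( 25500 / 300 ))   # -> 85 mel frames per training example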
ParallelWaveGAN/egs/jsut/voc1/conf/slurm.conf
ADDED
@@ -0,0 +1,12 @@
# Default configuration
command sbatch --export=PATH --ntasks-per-node=1
option time=* --time $0
option mem=* --mem-per-cpu $0
option mem=0          # Do not add anything to qsub_opts
option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1  # Do not add anything to qsub_opts
default gpu=0
option gpu=0 -p cpu
option gpu=* -p gpu --gres=gpu:$0
# note: the --max-jobs-run option is supported as a special case
# by slurm.pl and you don't have to handle it in the config file.
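Each "option" line maps a generic <cmd>.pl flag onto sbatch arguments; with this file, --gpu 1 becomes "-p gpu --gres=gpu:1" and --mem 4G becomes "--mem-per-cpu 4G". A sketch of how it would be invoked once cmd_backend=slurm is selected in cmd.sh (log path and config name are illustrative):

    utils/slurm.pl --gpu 1 train.log \
        parallel-wavegan-train --config conf/parallel_wavegan.v1.yaml --verbose 1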
ParallelWaveGAN/egs/jsut/voc1/conf/style_melgan.v1.yaml
ADDED
@@ -0,0 +1,147 @@
# This is the configuration file for JSUT dataset.
# This configuration is based on StyleMelGAN paper but
# uses MSE loss instead of Hinge loss. And I found that
# batch_size = 8 is also working good. So maybe if you
# want to accelerate the training, you can reduce the
# batch size (e.g. 8 or 16). Upsampling scales are modified
# to fit the shift size 300 pt.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
sampling_rate: 24000     # Sampling rate.
fft_size: 2048           # FFT size.
hop_size: 300            # Hop size.
win_length: 1200         # Window length.
                         # If set to null, it will be the same as fft_size.
window: "hann"           # Window function.
num_mels: 80             # Number of mel basis.
fmin: 80                 # Minimum freq in mel basis calculation.
fmax: 7600               # Maximum frequency in mel basis calculation.
global_gain_scale: 1.0   # Will be multiplied to all of waveform.
trim_silence: false      # Whether to trim the start and end of silence.
trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
trim_frame_size: 1024    # Frame size in trimming.
trim_hop_size: 256       # Hop size in trimming.
format: "hdf5"           # Feature file format. "npy" or "hdf5" is supported.

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_type: "StyleMelGANGenerator" # Generator type.
generator_params:
    in_channels: 128
    aux_channels: 80
    channels: 64
    out_channels: 1
    kernel_size: 9
    dilation: 2
    bias: True
    noise_upsample_scales: [10, 2, 2, 2]
    noise_upsample_activation: "LeakyReLU"
    noise_upsample_activation_params:
        negative_slope: 0.2
    upsample_scales: [5, 1, 5, 1, 3, 1, 2, 2, 1]
    upsample_mode: "nearest"
    gated_function: "softmax"
    use_weight_norm: True

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_type: "StyleMelGANDiscriminator" # Discriminator type.
discriminator_params:
    repeats: 4
    window_sizes: [512, 1024, 2048, 4096]
    pqmf_params:
        - [1, None, None, None]
        - [2, 62, 0.26700, 9.0]
        - [4, 62, 0.14200, 9.0]
        - [8, 62, 0.07949, 9.0]
    discriminator_params:
        out_channels: 1
        kernel_sizes: [5, 3]
        channels: 16
        max_downsample_channels: 512
        bias: True
        downsample_scales: [4, 4, 4, 1]
        nonlinear_activation: "LeakyReLU"
        nonlinear_activation_params:
            negative_slope: 0.2
    use_weight_norm: True

###########################################################
#                    STFT LOSS SETTING                    #
###########################################################
stft_loss_params:
    fft_sizes: [1024, 2048, 512]  # List of FFT size for STFT-based loss.
    hop_sizes: [120, 240, 50]     # List of hop size for STFT-based loss
    win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
    window: "hann_window"         # Window function for STFT-based loss
lambda_aux: 1.0                   # Loss balancing coefficient for aux loss.

###########################################################
#                 ADVERSARIAL LOSS SETTING                #
###########################################################
lambda_adv: 1.0 # Loss balancing coefficient for adv loss.
generator_adv_loss_params:
    average_by_discriminators: false # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
    average_by_discriminators: false # Whether to average loss by #discriminators.

###########################################################
#                   DATA LOADER SETTING                   #
###########################################################
batch_size: 32              # Batch size.
batch_max_steps: 24000      # Length of each audio in batch. Make sure dividable by hop_size.
pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
num_workers: 2              # Number of workers in Pytorch DataLoader.
remove_short_samples: false # Whether to remove samples the length of which are less than batch_max_steps.
allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.

###########################################################
#             OPTIMIZER & SCHEDULER SETTING               #
###########################################################
generator_optimizer_type: Adam
generator_optimizer_params:
    lr: 1.0e-4
    betas: [0.5, 0.9]
    weight_decay: 0.0
generator_scheduler_type: MultiStepLR
generator_scheduler_params:
    gamma: 0.5
    milestones:
        - 100000
        - 300000
        - 500000
        - 700000
        - 900000
generator_grad_norm: -1
discriminator_optimizer_type: Adam
discriminator_optimizer_params:
    lr: 2.0e-4
    betas: [0.5, 0.9]
    weight_decay: 0.0
discriminator_scheduler_type: MultiStepLR
discriminator_scheduler_params:
    gamma: 0.5
    milestones:
        - 200000
        - 400000
        - 600000
        - 800000
discriminator_grad_norm: -1

###########################################################
#                     INTERVAL SETTING                    #
###########################################################
discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
train_max_steps: 1500000                # Number of training steps.
save_interval_steps: 50000              # Interval steps to save checkpoint.
eval_interval_steps: 1000               # Interval steps to evaluate the network.
log_interval_steps: 100                 # Interval steps to record the training log.

###########################################################
#                       OTHER SETTING                     #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
ParallelWaveGAN/egs/jsut/voc1/local/data_download.sh
ADDED
@@ -0,0 +1,39 @@
#!/bin/bash

# Copyright 2019 Tomoki Hayashi
#  MIT License (https://opensource.org/licenses/MIT)

download_dir=$1

# check arguments
if [ $# != 1 ]; then
    echo "Usage: $0 <download_dir>"
    exit 1
fi

set -euo pipefail

cwd=$(pwd)
if [ ! -e "${download_dir}/jsut_ver1.1" ]; then
    mkdir -p "${download_dir}"
    cd "${download_dir}" || exit 1;
    wget http://ss-takashi.sakura.ne.jp/corpus/jsut_ver1.1.zip
    unzip -o ./*.zip
    rm ./*.zip
    cd "${cwd}" || exit 1;
    echo "Successfully downloaded data."
else
    echo "Already exists. Skipped."
fi

if [ ! -e "${download_dir}/jsut_lab" ]; then
    cd "${download_dir}" || exit 1;
    git clone https://github.com/r9y9/jsut-lab
    for name in loanword128 repeat500 voiceactress100 basic5000 onomatopee300 travel1000 countersuffix26 precedent130 utparaphrase512; do
        cp -vr "jsut-lab/${name}" jsut_ver1.1/
    done
    cd - || exit 1;
    echo "Successfully downloaded context label."
else
    echo "Already exists. Skipped."
fi
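A typical invocation, assuming the download directory used elsewhere in this recipe (the data_prep.sh usage message below refers to downloads/jsut_ver1.1):

    # fetch JSUT ver. 1.1 and the r9y9/jsut-lab context labels into ./downloads
    local/data_download.sh downloads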
ParallelWaveGAN/egs/jsut/voc1/local/data_prep.sh
ADDED
@@ -0,0 +1,93 @@
#!/bin/bash

# Copyright 2019 Tomoki Hayashi
#  MIT License (https://opensource.org/licenses/MIT)

# shellcheck disable=SC1091
. ./path.sh || exit 1;

fs=24000
num_dev=250
num_eval=250
train_set="train_nodev"
dev_set="dev"
eval_set="eval"
shuffle=false

# shellcheck disable=SC1091
. utils/parse_options.sh || exit 1;

db_root=$1
data_dir=$2

# check arguments
if [ $# != 2 ]; then
    echo "Usage: $0 [Options] <db_root> <data_dir>"
    echo "e.g.: $0 downloads/jsut_ver1.1 data"
    echo ""
    echo "Options:"
    echo "    --fs: target sampling rate (default=24000)."
    echo "    --num_dev: number of development utterances (default=250)."
    echo "    --num_eval: number of evaluation utterances (default=250)."
    echo "    --train_set: name of train set (default=train_nodev)."
    echo "    --dev_set: name of dev set (default=dev)."
    echo "    --eval_set: name of eval set (default=eval)."
    echo "    --shuffle: whether to perform shuffle in making dev / eval set (default=false)."
    exit 1
fi

set -euo pipefail

[ ! -e "${data_dir}/all" ] && mkdir -p "${data_dir}/all"

# set filenames
scp="${data_dir}/all/wav.scp"
segments="${data_dir}/all/segments"

# check file existence
[ -e "${scp}" ] && rm "${scp}"
[ -e "${segments}" ] && rm "${segments}"

# make scp
find "${db_root}" -follow -name "*.wav" | sort | while read -r filename; do
    id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
    echo "${id} cat ${filename} | sox -t wav - -c 1 -b 16 -t wav - rate ${fs} |" >> "${scp}"
done

# make segments
find "${db_root}" -name "*.lab" -follow | sort | while read -r filename; do
    id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
    start_nsec=$(head -n 1 "${filename}" | cut -d " " -f 2)
    end_nsec=$(tail -n 1 "${filename}" | cut -d " " -f 1)
    start_sec=$(echo "${start_nsec}*0.0000001" | bc | sed "s/^\./0./")
    end_sec=$(echo "${end_nsec}*0.0000001" | bc | sed "s/^\./0./")
    echo "${id} ${id} ${start_sec} ${end_sec}" >> "${segments}"
done

# check
diff -q <(awk '{print $1}' "${scp}") <(awk '{print $1}' "${segments}") > /dev/null

# split
num_all=$(wc -l < "${scp}")
num_deveval=$((num_dev + num_eval))
num_train=$((num_all - num_deveval))
utils/split_data.sh \
    --num_first "${num_train}" \
    --num_second "${num_deveval}" \
    --shuffle "${shuffle}" \
    "${data_dir}/all" \
    "${data_dir}/${train_set}" \
    "${data_dir}/deveval"
utils/split_data.sh \
    --num_first "${num_dev}" \
    --num_second "${num_eval}" \
    --shuffle "${shuffle}" \
    "${data_dir}/deveval" \
    "${data_dir}/${dev_set}" \
    "${data_dir}/${eval_set}"

# remove tmp directories
rm -rf "${data_dir}/all"
rm -rf "${data_dir}/deveval"

echo "Successfully prepared data."
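The segments file trims leading and trailing silence using the HTS-style .lab files, whose timestamps are in 100 ns units, so multiplying by 0.0000001 converts them to seconds (the sed call only restores a leading zero). A worked example with an assumed label value:

    # an end time of 28500000 in a .lab file corresponds to 2.85 seconds
    echo "28500000*0.0000001" | bc | sed "s/^\./0./"   # -> 2.8500000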
ParallelWaveGAN/egs/jsut/voc1/path.sh
ADDED
@@ -0,0 +1,33 @@
# cuda related
export CUDA_HOME=/usr/local/cuda-10.0
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"

# path related
export PRJ_ROOT="${PWD}/../../.."
if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
    # shellcheck disable=SC1090
    . "${PRJ_ROOT}/tools/venv/bin/activate"
fi

# python related
export OMP_NUM_THREADS=1
export PYTHONIOENCODING=UTF-8
export MPL_BACKEND=Agg

# check installation
if ! command -v parallel-wavegan-train > /dev/null; then
    echo "Error: It seems setup is not finished." >&2
    echo "Error: Please setup your environment by following README.md" >&2
    return 1
fi
if ! command -v jq > /dev/null; then
    echo "Error: It seems jq is not installed." >&2
    echo "Error: Please install via \`sudo apt-get install jq\`." >&2
    echo "Error: If you do not have sudo, please download from https://stedolan.github.io/jq/download/." >&2
    return 1
fi
if ! command -v yq > /dev/null; then
    echo "Error: It seems yq is not installed." >&2
    echo "Error: Please install via \`pip install yq\`." >&2
    return 1
fi
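path.sh is sourced rather than executed, which is why failures use "return 1" instead of "exit 1"; the recipe scripts pull it in first, in the pattern already visible at the top of data_prep.sh above, and run.sh additionally sources cmd.sh for the scheduler definitions:

    # at the top of run.sh and the local/ scripts
    . ./path.sh || exit 1;   # activate tools/venv and check the CLI tools
    . ./cmd.sh || exit 1;    # pick up the train_cmd / cuda_cmd definitions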