Commit 9791162 · Parent: 827297f

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
- .DS_Store +0 -0
- .gitattributes +2 -0
- .python-version +1 -0
- LICENSE +21 -0
- README.md +493 -5
- README_ZH.md +418 -0
- __pycache__/svc_inference.cpython-310.pyc +0 -0
- app.py +444 -0
- colab.ipynb +374 -0
- configs/base.yaml +72 -0
- configs/singers/singer0001.npy +3 -0
- configs/singers/singer0002.npy +3 -0
- configs/singers/singer0003.npy +3 -0
- configs/singers/singer0004.npy +3 -0
- configs/singers/singer0005.npy +3 -0
- configs/singers/singer0006.npy +3 -0
- configs/singers/singer0007.npy +3 -0
- configs/singers/singer0008.npy +3 -0
- configs/singers/singer0009.npy +3 -0
- configs/singers/singer0010.npy +3 -0
- configs/singers/singer0011.npy +3 -0
- configs/singers/singer0012.npy +3 -0
- configs/singers/singer0013.npy +3 -0
- configs/singers/singer0014.npy +3 -0
- configs/singers/singer0015.npy +3 -0
- configs/singers/singer0016.npy +3 -0
- configs/singers/singer0017.npy +3 -0
- configs/singers/singer0018.npy +3 -0
- configs/singers/singer0019.npy +3 -0
- configs/singers/singer0020.npy +3 -0
- configs/singers/singer0021.npy +3 -0
- configs/singers/singer0022.npy +3 -0
- configs/singers/singer0023.npy +3 -0
- configs/singers/singer0024.npy +3 -0
- configs/singers/singer0025.npy +3 -0
- configs/singers/singer0026.npy +3 -0
- configs/singers/singer0027.npy +3 -0
- configs/singers/singer0028.npy +3 -0
- configs/singers/singer0029.npy +3 -0
- configs/singers/singer0030.npy +3 -0
- configs/singers/singer0031.npy +3 -0
- configs/singers/singer0032.npy +3 -0
- configs/singers/singer0033.npy +3 -0
- configs/singers/singer0034.npy +3 -0
- configs/singers/singer0035.npy +3 -0
- configs/singers/singer0036.npy +3 -0
- configs/singers/singer0037.npy +3 -0
- configs/singers/singer0038.npy +3 -0
- configs/singers/singer0039.npy +3 -0
- configs/singers/singer0040.npy +3 -0
.DS_Store
ADDED
Binary file (6.15 kB)
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+test.wav filter=lfs diff=lfs merge=lfs -text
+vad/assets/silero_vad.jit filter=lfs diff=lfs merge=lfs -text
.python-version
ADDED
@@ -0,0 +1 @@
3.10.9
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 PlayVoice

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED
@@ -1,12 +1,500 @@
 ---
-title:
-emoji:
+title: Whisper Vits SVC
+emoji: 🎵
+python_version: 3.10.12
 colorFrom: blue
 colorTo: purple
 sdk: gradio
-sdk_version: 5.
-app_file:
+sdk_version: 5.7.1
+app_file: main.py
 pinned: false
+license: mit
 ---

<div align="center">
<h1> Variational Inference with adversarial learning for end-to-end Singing Voice Conversion based on VITS </h1>

[](https://huggingface.co/spaces/maxmax20160403/sovits5.0)
<img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/PlayVoice/so-vits-svc-5.0">
<img alt="GitHub forks" src="https://img.shields.io/github/forks/PlayVoice/so-vits-svc-5.0">
<img alt="GitHub issues" src="https://img.shields.io/github/issues/PlayVoice/so-vits-svc-5.0">
<img alt="GitHub" src="https://img.shields.io/github/license/PlayVoice/so-vits-svc-5.0">

[中文文档](./README_ZH.md)

The branch [bigvgan-mix-v2](https://github.com/PlayVoice/whisper-vits-svc/tree/bigvgan-mix-v2) has good audio quality

The branch [RoFormer-HiFTNet](https://github.com/PlayVoice/whisper-vits-svc/tree/RoFormer-HiFTNet) has fast inference speed

No further upgrades are planned

</div>

- This project targets deep learning beginners; basic knowledge of Python and PyTorch is a prerequisite;
- This project aims to help deep learning beginners get past dry, purely theoretical study and master the basics of deep learning through hands-on practice;
- This project does not support real-time voice conversion (whisper would need to be replaced if real-time conversion is what you are looking for);
- This project will not develop one-click packages for other purposes;

- A minimum VRAM requirement of 6 GB for training

- Support for multiple speakers

- Create unique speakers through speaker mixing

- It can even convert voices with light accompaniment

- You can edit F0 using Excel

https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/6a09805e-ab93-47fe-9a14-9cbc1e0e7c3a

Powered by [@ShadowVap](https://space.bilibili.com/491283091)

## Model properties

| Feature | From | Status | Function |
| :--- | :--- | :--- | :--- |
| whisper | OpenAI | ✅ | strong noise immunity |
| bigvgan | NVIDIA | ✅ | anti-aliasing and snake activation; clearer formants and noticeably improved sound quality |
| natural speech | Microsoft | ✅ | reduce mispronunciation |
| neural source-filter | Xin Wang | ✅ | solve the problem of audio F0 discontinuity |
| pitch quantization | Xin Wang | ✅ | quantize the F0 for embedding |
| speaker encoder | Google | ✅ | timbre encoding and clustering |
| GRL for speaker | Ubisoft | ✅ | prevent the encoder from leaking timbre |
| SNAC | Samsung | ✅ | one-shot cloning for VITS |
| SCLN | Microsoft | ✅ | improve cloning |
| Diffusion | HuaWei | ✅ | improve sound quality |
| PPG perturbation | this project | ✅ | improved noise immunity and de-timbre |
| HuBERT perturbation | this project | ✅ | improved noise immunity and de-timbre |
| VAE perturbation | this project | ✅ | improve sound quality |
| MIX encoder | this project | ✅ | improve conversion stability |
| USP infer | this project | ✅ | improve conversion stability |
| HiFTNet | Columbia University | ✅ | NSF-iSTFTNet for speed-up |
| RoFormer | Zhuiyi Technology | ✅ | rotary positional embeddings |

Due to the use of data perturbation, training takes longer than in comparable projects.
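The pitch-quantization row above refers to bucketing continuous F0 values into a small set of integer bins so that pitch can be looked up in an embedding table. A minimal sketch of the idea in numpy, assuming a log-scale mapping with 256 bins and a reserved unvoiced bin (the project's actual bin count and scale are not shown in this diff):

```python
import numpy as np

# Hypothetical parameters; the project's real values may differ.
F0_MIN, F0_MAX, F0_BINS = 50.0, 1100.0, 256

def quantize_f0(f0_hz: np.ndarray) -> np.ndarray:
    """Map continuous F0 (Hz) to integer bin ids for an embedding lookup.
    Bin 0 is reserved for unvoiced frames (f0 == 0)."""
    f0 = np.clip(f0_hz, 0.0, F0_MAX)
    bins = np.zeros(f0.shape, dtype=np.int64)
    voiced = f0 > 0
    # Normalize log-F0 to [0, 1], then spread it over bins 1..F0_BINS-1.
    scale = (np.log(np.maximum(f0[voiced], F0_MIN)) - np.log(F0_MIN)) \
            / (np.log(F0_MAX) - np.log(F0_MIN))
    bins[voiced] = 1 + np.round(scale * (F0_BINS - 2)).astype(np.int64)
    return bins

print(quantize_f0(np.array([0.0, 110.0, 220.0, 440.0])))  # -> [  0  66 123 180]
```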
**USP: even unvoiced and silent frames carry a pitch value at inference time, smoothly bridging the voiced segments**

## Why mix

## Plug-In-Diffusion

## Setup Environment

1. Install [PyTorch](https://pytorch.org/get-started/locally/).

2. Install project dependencies
```shell
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt
```
**Note: whisper is already built in; do not install it again, otherwise it will cause conflicts and errors**

3. Download the Timbre Encoder: [Speaker-Encoder by @mueller91](https://drive.google.com/drive/folders/15oeBYf6Qn1edONkVLXe82MzdIi3O_9m3), and put `best_model.pth.tar` into `speaker_pretrain/`.

4. Download the whisper model [whisper-large-v2](https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt). Make sure to download `large-v2.pt` and put it into `whisper_pretrain/`.

5. Download the [hubert_soft model](https://github.com/bshall/hubert/releases/tag/v0.1) and put `hubert-soft-0d54a1f4.pt` into `hubert_pretrain/`.

6. Download the pitch extractor [crepe full](https://github.com/maxrmorrison/torchcrepe/tree/master/torchcrepe/assets) and put `full.pth` into `crepe/assets`.

**Note: crepe's full.pth is 84.9 MB, not 6 KB; verify the file size after download**

7. Download the pretrained model [sovits5.0.pretrain.pth](https://github.com/PlayVoice/so-vits-svc-5.0/releases/tag/5.0/) and put it into `vits_pretrain/`.
```shell
python svc_inference.py --config configs/base.yaml --model ./vits_pretrain/sovits5.0.pretrain.pth --spk ./configs/singers/singer0001.npy --wave test.wav
```

## Dataset preparation

Necessary pre-processing:
1. Separate vocals and accompaniment with [UVR](https://github.com/Anjok07/ultimatevocalremovergui) (skip if there is no accompaniment)
2. Cut the audio into shorter clips with [slicer](https://github.com/flutydeer/audio-slicer); whisper requires inputs shorter than 30 seconds.
3. Manually check the generated clips; remove any shorter than 2 seconds or with obvious noise.
4. Adjust loudness if necessary; Adobe Audition is recommended.
5. Put the dataset into the `dataset_raw` directory following the structure below.
```
dataset_raw
├───speaker0
│   ├───000001.wav
│   ├───...
│   └───000xxx.wav
└───speaker1
    ├───000001.wav
    ├───...
    └───000xxx.wav
```

## Data preprocessing
```shell
python svc_preprocessing.py -t 2
```
`-t`: number of threads; it must not exceed the CPU core count, and 2 is usually enough.
After preprocessing you will get an output with the following structure.
```
data_svc/
└── waves-16k
│    └── speaker0
│    │      ├── 000001.wav
│    │      └── 000xxx.wav
│    └── speaker1
│           ├── 000001.wav
│           └── 000xxx.wav
└── waves-32k
│    └── speaker0
│    │      ├── 000001.wav
│    │      └── 000xxx.wav
│    └── speaker1
│           ├── 000001.wav
│           └── 000xxx.wav
└── pitch
│    └── speaker0
│    │      ├── 000001.pit.npy
│    │      └── 000xxx.pit.npy
│    └── speaker1
│           ├── 000001.pit.npy
│           └── 000xxx.pit.npy
└── hubert
│    └── speaker0
│    │      ├── 000001.vec.npy
│    │      └── 000xxx.vec.npy
│    └── speaker1
│           ├── 000001.vec.npy
│           └── 000xxx.vec.npy
└── whisper
│    └── speaker0
│    │      ├── 000001.ppg.npy
│    │      └── 000xxx.ppg.npy
│    └── speaker1
│           ├── 000001.ppg.npy
│           └── 000xxx.ppg.npy
└── speaker
│    └── speaker0
│    │      ├── 000001.spk.npy
│    │      └── 000xxx.spk.npy
│    └── speaker1
│           ├── 000001.spk.npy
│           └── 000xxx.spk.npy
└── singer
│    ├── speaker0.spk.npy
│    └── speaker1.spk.npy
│
└── indexes
      ├── speaker0
      │   ├── some_prefix_hubert.index
      │   └── some_prefix_whisper.index
      └── speaker1
          ├── hubert.index
          └── whisper.index
```

1. Re-sampling
- Generate audio with a sampling rate of 16000 Hz in `./data_svc/waves-16k`
```
python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-16k -s 16000
```
- Generate audio with a sampling rate of 32000 Hz in `./data_svc/waves-32k`
```
python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000
```
2. Use the 16 kHz audio to extract pitch
```
python prepare/preprocess_crepe.py -w data_svc/waves-16k/ -p data_svc/pitch
```
3. Use the 16 kHz audio to extract the PPG
```
python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper
```
4. Use the 16 kHz audio to extract the hubert vectors
```
python prepare/preprocess_hubert.py -w data_svc/waves-16k/ -v data_svc/hubert
```
5. Use the 16 kHz audio to extract the timbre code
```
python prepare/preprocess_speaker.py data_svc/waves-16k/ data_svc/speaker
```
6. Extract the average of the timbre codes for inference; the average can also replace the per-utterance timbre when generating the training index, acting as the speaker's unified timbre for training (see the sketch after this list)
```
python prepare/preprocess_speaker_ave.py data_svc/speaker/ data_svc/singer
```
7. Use the 32 kHz audio to extract the linear spectrum
```
python prepare/preprocess_spec.py -w data_svc/waves-32k/ -s data_svc/specs
```
8. Use the 32 kHz audio to generate the training index
```
python prepare/preprocess_train.py
```
9. Training file debugging
```
python prepare/preprocess_zzz.py
```
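As referenced in step 6, the averaging step conceptually reduces to taking the mean of the per-utterance `.spk.npy` vectors. A minimal sketch, assuming only the file layout shown in the tree above (an illustration, not the actual code of `preprocess_speaker_ave.py`):

```python
import os
import numpy as np

def average_speaker(spk_dir: str, out_path: str) -> None:
    """Average all per-utterance speaker embeddings in spk_dir into one vector."""
    vecs = [np.load(os.path.join(spk_dir, name))
            for name in os.listdir(spk_dir) if name.endswith(".spk.npy")]
    np.save(out_path, np.mean(vecs, axis=0))

# e.g. data_svc/speaker/speaker0/*.spk.npy -> data_svc/singer/speaker0.spk.npy
average_speaker("data_svc/speaker/speaker0", "data_svc/singer/speaker0.spk.npy")
```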

## Train
1. If fine-tuning from the pre-trained model, you need to download the pre-trained model [sovits5.0.pretrain.pth](https://github.com/PlayVoice/so-vits-svc-5.0/releases/tag/5.0), put it under the project root, and set this line
```
pretrain: "./vits_pretrain/sovits5.0.pretrain.pth"
```
in `configs/base.yaml`; also adjust the learning rate appropriately, e.g. 5e-5. A scripted version of these config edits is sketched after this list.

`batch_size`: for a GPU with 6 GB VRAM, 6 is the recommended value; 8 will work, but step speed will be much slower.
2. Start training
```
python svc_trainer.py -c configs/base.yaml -n sovits5.0
```
3. Resume training
```
python svc_trainer.py -c configs/base.yaml -n sovits5.0 -p chkpt/sovits5.0/sovits5.0_***.pt
```
4. Log visualization
```
tensorboard --logdir logs/
```
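To script the fine-tuning edits from step 1: the relevant keys sit under the `train` section of the config, and the key names below match the ones `app.py` in this commit writes (`train.pretrain`, `train.learning_rate`, `train.batch_size`); editing the file by hand works just as well:

```python
from ruamel.yaml import YAML

# Point training at the pretrained model and lower the learning rate,
# preserving the config file's quoting and layout.
yaml = YAML()
yaml.preserve_quotes = True
with open("configs/base.yaml", "r") as f:
    config = yaml.load(f)

config["train"]["pretrain"] = "./vits_pretrain/sovits5.0.pretrain.pth"
config["train"]["learning_rate"] = 5e-5  # e.g. 5e-5 for fine-tuning
config["train"]["batch_size"] = 6        # recommended for 6 GB VRAM

with open("configs/base.yaml", "w") as f:
    yaml.dump(config, f)
```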

## Inference

1. Export the inference model: the text encoder, the Flow network, and the Decoder network
```
python svc_export.py --config configs/base.yaml --checkpoint_path chkpt/sovits5.0/***.pt
```
2. Inference
- If there is no need to adjust `f0`, just run the following command.
```
python svc_inference.py --config configs/base.yaml --model sovits5.0.pth --spk ./data_svc/singer/your_singer.spk.npy --wave test.wav --shift 0
```
- If `f0` will be adjusted manually, follow these steps:
1. Use whisper to extract the content encoding, generating `test.ppg.npy`.
```
python whisper/inference.py -w test.wav -p test.ppg.npy
```
2. Use hubert to extract the content vector; this is run as a separate step rather than in one pass in order to reduce GPU memory usage.
```
python hubert/inference.py -w test.wav -v test.vec.npy
```
3. Extract the F0 parameters to CSV text format, open the CSV file in Excel, and manually correct wrong F0 values against Audition or SonicVisualiser.
```
python pitch/inference.py -w test.wav -p test.csv
```
4. Final inference
```
python svc_inference.py --config configs/base.yaml --model sovits5.0.pth --spk ./data_svc/singer/your_singer.spk.npy --wave test.wav --ppg test.ppg.npy --vec test.vec.npy --pit test.csv --shift 0
```
3. Notes

- When `--ppg` is specified, inferring the same audio multiple times avoids repeated extraction of the content encoding; if it is not specified, it is extracted automatically;

- When `--vec` is specified, inferring the same audio multiple times avoids repeated extraction of the content vector; if it is not specified, it is extracted automatically;

- When `--pit` is specified, the manually tuned F0 parameters are loaded; if it is not specified, they are extracted automatically;

- The output file is generated in the current directory as `svc_out.wav`;

4. Arguments reference

| args | --config | --model | --spk | --wave | --ppg | --vec | --pit | --shift |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| name | config path | model path | speaker | wave input | wave ppg | wave hubert | wave pitch | pitch shift |

5. Post-processing with VAD
```
python svc_inference_post.py --ref test.wav --svc svc_out.wav --out svc_out_post.wav
```

## Train Feature Retrieval Index (Optional)

To increase the stability of the generated timbre, you can use the method described in the
[Retrieval-based-Voice-Conversion](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/en/README.en.md)
repository. This method consists of 2 steps:

1. Training the retrieval index on hubert and whisper features
Run training with default settings:
```
python svc_train_retrieval.py
```

If the number of vectors is more than 200_000, they will be compressed to 10_000 using the MiniBatchKMeans algorithm.
You can change these settings using command line options:
```
usage: create faiss indexes for feature retrieval [-h] [--debug] [--prefix PREFIX] [--speakers SPEAKERS [SPEAKERS ...]] [--compress-features-after COMPRESS_FEATURES_AFTER]
                                                  [--n-clusters N_CLUSTERS] [--n-parallel N_PARALLEL]

options:
  -h, --help            show this help message and exit
  --debug
  --prefix PREFIX       add prefix to index filename
  --speakers SPEAKERS [SPEAKERS ...]
                        speaker names to create an index. By default all speakers are from data_svc
  --compress-features-after COMPRESS_FEATURES_AFTER
                        If the number of features is greater than the value, compress feature vectors using MiniBatchKMeans.
  --n-clusters N_CLUSTERS
                        Number of centroids to which features will be compressed
  --n-parallel N_PARALLEL
                        Number of parallel MiniBatchKMeans jobs. Default is cpus-1
```
Compressing the training vectors can speed up index inference, but it reduces retrieval quality.
Use vector count compression only if you really have a lot of vectors.
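The compression described above amounts to replacing the raw feature matrix with k-means centroids before indexing. A minimal sketch of that step with scikit-learn, mirroring the `--compress-features-after` / `--n-clusters` behaviour (an illustration, not the script's actual code):

```python
import numpy as np
from sklearn.cluster import MiniBatchKMeans

def compress_features(feats: np.ndarray, compress_after: int = 200_000,
                      n_clusters: int = 10_000) -> np.ndarray:
    """If feats (n_vectors, dim) is too large, replace it with k-means centroids."""
    if len(feats) <= compress_after:
        return feats  # small enough; index the raw vectors
    km = MiniBatchKMeans(n_clusters=n_clusters, batch_size=4096, n_init="auto")
    km.fit(feats)
    return km.cluster_centers_.astype(feats.dtype)
```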

The resulting indexes will be stored in the "indexes" folder as:
```
data_svc
...
└── indexes
    ├── speaker0
    │   ├── some_prefix_hubert.index
    │   └── some_prefix_whisper.index
    └── speaker1
        ├── hubert.index
        └── whisper.index
```
2. At the inference stage, the n closest features are blended into the VITS model output in a configurable proportion
Enable feature retrieval with these settings:
```
python svc_inference.py --config configs/base.yaml --model sovits5.0.pth --spk ./data_svc/singer/your_singer.spk.npy --wave test.wav --shift 0 \
--enable-retrieval \
--retrieval-ratio 0.5 \
--n-retrieval-vectors 3
```
For better retrieval results, you can try sweeping different values of `--retrieval-ratio` and `--n-retrieval-vectors`

If you have multiple sets of indexes, you can select a specific set via the `--retrieval-index-prefix` parameter

You can explicitly specify the paths to the hubert and whisper indexes using the `--hubert-index-path` and `--whisper-index-path` parameters
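Conceptually, retrieval at inference time replaces each frame's content feature with a blend of itself and the mean of its n nearest neighbours from the trained index. A rough sketch with faiss, where `ratio` and `n_vectors` play the roles of `--retrieval-ratio` and `--n-retrieval-vectors` (an illustration of the RVC-style technique, not this project's exact code):

```python
import numpy as np
import faiss

def retrieval_blend(feats: np.ndarray, index: faiss.Index, stored: np.ndarray,
                    ratio: float = 0.5, n_vectors: int = 3) -> np.ndarray:
    """Blend frame features (T, dim) with their nearest stored neighbours."""
    _, neighbours = index.search(feats.astype(np.float32), n_vectors)
    retrieved = stored[neighbours].mean(axis=1)  # (T, dim) neighbour average
    return ratio * retrieved + (1.0 - ratio) * feats
```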

## Create singer

Named by pure coincidence: average -> ave -> eva; Eve (eva) represents conception and reproduction

```
python svc_eva.py
```

```python
eva_conf = {
    './configs/singers/singer0022.npy': 0,
    './configs/singers/singer0030.npy': 0,
    './configs/singers/singer0047.npy': 0.5,
    './configs/singers/singer0051.npy': 0.5,
}
```

The generated singer file will be `eva.spk.npy`.
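The mixing itself is just a weighted sum over the selected speaker vectors; a minimal sketch of what `svc_eva.py` presumably does with `eva_conf` (an assumption based on the config shape above, not the script's actual code):

```python
import numpy as np

def mix_speakers(eva_conf: dict) -> np.ndarray:
    """Weighted sum of speaker embeddings; weights should sum to roughly 1."""
    return sum(w * np.load(path) for path, w in eva_conf.items() if w > 0)

np.save("eva.spk.npy", mix_speakers({
    './configs/singers/singer0047.npy': 0.5,
    './configs/singers/singer0051.npy': 0.5,
}))
```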

## Data set

| Name | URL |
| :--- | :--- |
|KiSing |http://shijt.site/index.php/2021/05/16/kising-the-first-open-source-mandarin-singing-voice-synthesis-corpus/|
|PopCS |https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md|
|opencpop |https://wenet.org.cn/opencpop/download/|
|Multi-Singer |https://github.com/Multi-Singer/Multi-Singer.github.io|
|M4Singer |https://github.com/M4Singer/M4Singer/blob/master/apply_form.md|
|CSD |https://zenodo.org/record/4785016#.YxqrTbaOMU4|
|KSS |https://www.kaggle.com/datasets/bryanpark/korean-single-speaker-speech-dataset|
|JVS MuSic |https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_music|
|PJS |https://sites.google.com/site/shinnosuketakamichi/research-topics/pjs_corpus|
|JSUT Song |https://sites.google.com/site/shinnosuketakamichi/publication/jsut-song|
|MUSDB18 |https://sigsep.github.io/datasets/musdb.html#musdb18-compressed-stems|
|DSD100 |https://sigsep.github.io/datasets/dsd100.html|
|Aishell-3 |http://www.aishelltech.com/aishell_3|
|VCTK |https://datashare.ed.ac.uk/handle/10283/2651|
|Korean Songs |http://urisori.co.kr/urisori-en/doku.php/|

## Code sources and references

https://github.com/facebookresearch/speech-resynthesis [paper](https://arxiv.org/abs/2104.00355)

https://github.com/jaywalnut310/vits [paper](https://arxiv.org/abs/2106.06103)

https://github.com/openai/whisper/ [paper](https://arxiv.org/abs/2212.04356)

https://github.com/NVIDIA/BigVGAN [paper](https://arxiv.org/abs/2206.04658)

https://github.com/mindslab-ai/univnet [paper](https://arxiv.org/abs/2106.07889)

https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts/tree/master/project/01-nsf

https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS

https://github.com/brentspell/hifi-gan-bwe

https://github.com/mozilla/TTS

https://github.com/bshall/soft-vc

https://github.com/maxrmorrison/torchcrepe

https://github.com/MoonInTheRiver/DiffSinger

https://github.com/OlaWod/FreeVC [paper](https://arxiv.org/abs/2210.15418)

https://github.com/yl4579/HiFTNet [paper](https://arxiv.org/abs/2309.09493)

[Autoregressive neural f0 model for statistical parametric speech synthesis](https://web.archive.org/web/20210718024752id_/https://ieeexplore.ieee.org/ielx7/6570655/8356719/08341752.pdf)

[One-shot Voice Conversion by Separating Speaker and Content Representations with Instance Normalization](https://arxiv.org/abs/1904.05742)

[SNAC: Speaker-normalized Affine Coupling Layer in Flow-based Architecture for Zero-Shot Multi-Speaker Text-to-Speech](https://github.com/hcy71o/SNAC)

[Adapter-Based Extension of Multi-Speaker Text-to-Speech Model for New Speakers](https://arxiv.org/abs/2211.00585)

[AdaSpeech: Adaptive Text to Speech for Custom Voice](https://arxiv.org/pdf/2103.00993.pdf)

[AdaVITS: Tiny VITS for Low Computing Resource Speaker Adaptation](https://arxiv.org/pdf/2206.00208.pdf)

[Cross-Speaker Prosody Transfer on Any Text for Expressive Speech Synthesis](https://github.com/ubisoft/ubisoft-laforge-daft-exprt)

[Learn to Sing by Listening: Building Controllable Virtual Singer by Unsupervised Learning from Voice Recordings](https://arxiv.org/abs/2305.05401)

[Adversarial Speaker Disentanglement Using Unannotated External Data for Self-supervised Representation Based Voice Conversion](https://arxiv.org/pdf/2305.09167.pdf)

[Multilingual Speech Synthesis and Cross-Language Voice Cloning: GRL](https://arxiv.org/abs/1907.04448)

[RoFormer: Enhanced Transformer with rotary position embedding](https://arxiv.org/abs/2104.09864)

## Method of Preventing Timbre Leakage Based on Data Perturbation

https://github.com/auspicious3000/contentvec/blob/main/contentvec/data/audio/audio_utils_1.py

https://github.com/revsic/torch-nansy/blob/main/utils/augment/praat.py

https://github.com/revsic/torch-nansy/blob/main/utils/augment/peq.py

https://github.com/biggytruck/SpeechSplit2/blob/main/utils.py

https://github.com/OlaWod/FreeVC/blob/main/preprocess_sr.py

## Contributors

<a href="https://github.com/PlayVoice/so-vits-svc/graphs/contributors">
  <img src="https://contrib.rocks/image?repo=PlayVoice/so-vits-svc" />
</a>

## Thanks to

https://github.com/Francis-Komizu/Sovits

## Relevant Projects
- [LoRA-SVC](https://github.com/PlayVoice/lora-svc): decoder-only SVC
- [Grad-SVC](https://github.com/PlayVoice/Grad-SVC): diffusion-based SVC

## Original evidence
2022.04.12 https://mp.weixin.qq.com/s/autNBYCsG4_SvWt2-Ll_zA

2022.04.22 https://github.com/PlayVoice/VI-SVS

2022.07.26 https://mp.weixin.qq.com/s/qC4TJy-4EVdbpvK2cQb1TA

2022.09.08 https://github.com/PlayVoice/VI-SVC

## Copied by svc-develop-team/so-vits-svc
README_ZH.md
ADDED
@@ -0,0 +1,418 @@
<div align="center">
<h1> Variational Inference with adversarial learning for end-to-end Singing Voice Conversion based on VITS </h1>

[](https://huggingface.co/spaces/maxmax20160403/sovits5.0)
<img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/PlayVoice/so-vits-svc-5.0">
<img alt="GitHub forks" src="https://img.shields.io/github/forks/PlayVoice/so-vits-svc-5.0">
<img alt="GitHub issues" src="https://img.shields.io/github/issues/PlayVoice/so-vits-svc-5.0">
<img alt="GitHub" src="https://img.shields.io/github/license/PlayVoice/so-vits-svc-5.0">

</div>

### This project uses a clean, minimal code structure, intended for research on deep learning techniques
### With learning as its goal, this project does not chase the limits of quality; it favors low-spec settings (student laptops), and the final pretrained model is 202 MB (generator plus discriminator, stored as float32), far smaller than comparable projects
### If you are looking for a project that works out of the box, this one is not for you

- The target audience is deep learning beginners; basic Python and PyTorch skills are a prerequisite for using this project;
- This project aims to help deep learning beginners get past dry, purely theoretical study and master the basics of deep learning through hands-on practice;
- Real-time voice conversion is not supported (supporting it would require replacing whisper);
- No one-click packages for other purposes will be developed

### Code walk-through video course (in Chinese)
- 1 - Overall framework https://www.bilibili.com/video/BV1Tj411e7pQ
- 2 - Data preparation and preprocessing https://www.bilibili.com/video/BV1uj411v7zW
- 3 - Prior and posterior encoders https://www.bilibili.com/video/BV1Be411Q7r5
- 4 - The decoder https://www.bilibili.com/video/BV19u4y1b73U
- 5 - The snake activation function https://www.bilibili.com/video/BV1HN4y1D7AR
- 6 - The Flow module https://www.bilibili.com/video/BV1ju411F7Fs
- 7 - Training and the loss functions https://www.bilibili.com/video/BV1qw411W73B
- 8 - Training, inference, and F0 correction https://www.bilibili.com/video/BV1eb4y1u7ER

- [No leakage] supports multiple speakers

- [Timbre mixing] create your own unique speaker

- [With accompaniment] conversion also works with light accompaniment

- [Excel tuning] tune the raw F0 by hand

https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/63858332-cc0d-40e1-a216-6fe8bf638f7c

Powered by [@ShadowVap](https://space.bilibili.com/491283091)

## Model properties

| Feature | From | Status | Function |
| :--- | :--- | :--- | :--- |
| whisper | OpenAI | ✅ | strong noise robustness |
| bigvgan | NVIDIA | ✅ | anti-aliasing and snake activation; clearer formants, noticeably better sound quality |
| natural speech | Microsoft | ✅ | fewer pronunciation errors |
| neural source-filter | NII | ✅ | fixes F0 discontinuities |
| speaker encoder | Google | ✅ | timbre encoding and clustering |
| GRL for speaker | Ubisoft | ✅ | adversarial timbre removal |
| SNAC | Samsung | ✅ | one-sentence cloning for VITS |
| SCLN | Microsoft | ✅ | improved cloning |
| PPG perturbation | this project | ✅ | better noise robustness and timbre removal |
| HuBERT perturbation | this project | ✅ | better noise robustness and timbre removal |
| VAE perturbation | this project | ✅ | better sound quality |
| Mix encoder | this project | ✅ | more stable conversion |
| USP inference | this project | ✅ | more stable conversion |

**USP: even unvoiced and silent segments carry a pitch value at inference time; this pitch smoothly bridges the voiced segments**

## Why mix

## Setup environment

1. Install [PyTorch](https://pytorch.org/get-started/locally/)

2. Install project dependencies
```
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt
```
**Note: do not install whisper separately, or it will conflict with the copy built into the code**

3. Download the [timbre encoder](https://drive.google.com/drive/folders/15oeBYf6Qn1edONkVLXe82MzdIi3O_9m3) and put `best_model.pth.tar` into `speaker_pretrain/` (**do not unzip it**)

4. Download the [whisper-large-v2 model](https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt) and put `large-v2.pt` into `whisper_pretrain/`

5. Download the [hubert_soft model](https://github.com/bshall/hubert/releases/tag/v0.1) and put `hubert-soft-0d54a1f4.pt` into `hubert_pretrain/`

6. Download the pitch extractor [crepe full](https://github.com/maxrmorrison/torchcrepe/tree/master/torchcrepe/assets) and put `full.pth` into `crepe/assets`

**Note: full.pth is 84.9 MB; verify that the file size is correct**

7. Download [sovits5.0.pretrain.pth](https://github.com/PlayVoice/so-vits-svc-5.0/releases/tag/5.0/), put it into `vits_pretrain/`, and run an inference test

> python svc_inference.py --config configs/base.yaml --model ./vits_pretrain/sovits5.0.pretrain.pth --spk ./configs/singers/singer0001.npy --wave test.wav

## Dataset preparation
1. Vocal separation; skip this step if the dataset has no BGM (the 3_HP-Vocal-UVR or htdemucs_ft models in [UVR](https://github.com/Anjok07/ultimatevocalremovergui) are recommended for extracting the vocals)
2. Cut the audio with [slicer](https://github.com/flutydeer/audio-slicer); whisper requires clips shorter than 30 seconds (clips under 2 seconds are best discarded: they mostly contain no phonemes and may hurt training)
3. Manually screen the audio produced by steps 1 and 2; trim or discard clips with obvious noise; skip this step if the dataset has no BGM
4. Balance loudness with Adobe Audition
5. Put the dataset into the `dataset_raw` directory following the structure below
```shell
dataset_raw
├───speaker0
│   ├───000001.wav
│   ├───...
│   └───000xxx.wav
└───speaker1
    ├───000001.wav
    ├───...
    └───000xxx.wav
```

## Data preprocessing

```shell
python svc_preprocessing.py -t 2
```
-t: the number of threads; it must be a positive integer no larger than the total CPU core count, and 2 is usually enough

After preprocessing, the folder structure looks like this:
```shell
data_svc/
└── waves-16k
│    └── speaker0
│    │      ├── 000001.wav
│    │      └── 000xxx.wav
│    └── speaker1
│           ├── 000001.wav
│           └── 000xxx.wav
└── waves-32k
│    └── speaker0
│    │      ├── 000001.wav
│    │      └── 000xxx.wav
│    └── speaker1
│           ├── 000001.wav
│           └── 000xxx.wav
└── pitch
│    └── speaker0
│    │      ├── 000001.pit.npy
│    │      └── 000xxx.pit.npy
│    └── speaker1
│           ├── 000001.pit.npy
│           └── 000xxx.pit.npy
└── hubert
│    └── speaker0
│    │      ├── 000001.vec.npy
│    │      └── 000xxx.vec.npy
│    └── speaker1
│           ├── 000001.vec.npy
│           └── 000xxx.vec.npy
└── whisper
│    └── speaker0
│    │      ├── 000001.ppg.npy
│    │      └── 000xxx.ppg.npy
│    └── speaker1
│           ├── 000001.ppg.npy
│           └── 000xxx.ppg.npy
└── speaker
│    └── speaker0
│    │      ├── 000001.spk.npy
│    │      └── 000xxx.spk.npy
│    └── speaker1
│           ├── 000001.spk.npy
│           └── 000xxx.spk.npy
└── singer
     ├── speaker0.spk.npy
     └── speaker1.spk.npy
```

If you have some programming background, running the steps one by one is recommended; it also helps you understand how things work internally

- 1. Resampling

  Generate 16000 Hz audio under ./data_svc/waves-16k

  > python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-16k -s 16000

  Generate 32000 Hz audio under ./data_svc/waves-32k

  > python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000

- 2. Extract the pitch from the 16k audio

  > python prepare/preprocess_crepe.py -w data_svc/waves-16k/ -p data_svc/pitch

- 3. Extract the content encoding from the 16k audio
  > python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper

- 4. Extract the content vector from the 16k audio
  > python prepare/preprocess_hubert.py -w data_svc/waves-16k/ -v data_svc/hubert

- 5. Extract the timbre encoding from the 16k audio
  > python prepare/preprocess_speaker.py data_svc/waves-16k/ data_svc/speaker

- 6. Average the timbre encodings; the average is used for inference, and can also serve as the speaker's unified timbre when generating the training index (when the data's timbre does not vary much)
  > python prepare/preprocess_speaker_ave.py data_svc/speaker/ data_svc/singer

- 7. Extract the linear spectrogram from the 32k audio
  > python prepare/preprocess_spec.py -w data_svc/waves-32k/ -s data_svc/specs

- 8. Generate the training index from the 32k audio
  > python prepare/preprocess_train.py

- 9. Training file debugging
  > python prepare/preprocess_zzz.py

## Training
0. Parameter tuning
   If you fine-tune from the pretrained model, download [sovits5.0.pretrain.pth](https://github.com/PlayVoice/so-vits-svc-5.0/releases/tag/5.0) and put it under the project root,<br>
   then set the parameter `pretrain: "./vits_pretrain/sovits5.0.pretrain.pth"` in `configs/base.yaml` and lower the learning rate appropriately (5e-5 is a good starting point)<br>
   **learning_rate, batch_size, and accum_step are three tightly coupled parameters that need careful tuning**<br>
   **batch_size times accum_step usually equals 16 or 32; on a low-VRAM GPU, try batch_size = 4 with accum_step = 4**

1. Start training
```
python svc_trainer.py -c configs/base.yaml -n sovits5.0
```
2. Resume training
```
python svc_trainer.py -c configs/base.yaml -n sovits5.0 -p chkpt/sovits5.0/sovits5.0_***.pt
```
3. Visualize the training logs
```
tensorboard --logdir logs/
```

## Inference
1. Export the inference model: the text encoder, the Flow network, and the Decoder network; the discriminator, the posterior encoder, and so on are only used during training
```
python svc_export.py --config configs/base.yaml --checkpoint_path chkpt/sovits5.0/***.pt
```
2. Inference
- If you do not want to adjust f0 by hand and only need the final result, just run:
```
python svc_inference.py --config configs/base.yaml --model sovits5.0.pth --spk ./data_svc/singer/your_singer.npy --wave test.wav --shift 0
```
- If you need to adjust f0 manually, follow this flow:

  - Use whisper to extract the content encoding, generating test.ppg.npy
  ```
  python whisper/inference.py -w test.wav -p test.ppg.npy
  ```

  - Use hubert to extract the content encoding, generating test.vec.npy
  ```
  python hubert/inference.py -w test.wav -v test.vec.npy
  ```

  - Extract the F0 parameters into CSV text format, open the CSV file in Excel, and correct wrong F0 values by hand against Audition or SonicVisualiser
  ```
  python pitch/inference.py -w test.wav -p test.csv
  ```
  - Final inference
  ```
  python svc_inference.py --config configs/base.yaml --model sovits5.0.pth --spk ./data_svc/singer/your_singer.npy --wave test.wav --ppg test.ppg.npy --vec test.vec.npy --pit test.csv --shift 0
  ```

3. Some notes
   When --ppg is specified, repeated inference on the same audio avoids re-extracting the content encoding; if it is not specified, it is extracted automatically

   When --vec is specified, repeated inference on the same audio avoids re-extracting the content encoding; if it is not specified, it is extracted automatically

   When --pit is specified, the hand-tuned F0 parameters are loaded; if it is not specified, they are extracted automatically

   The output is generated in the current directory as svc_out.wav

| args | --config | --model | --spk | --wave | --ppg | --vec | --pit | --shift |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| name | config file | model file | timbre file | input audio | ppg content | hubert content | pitch content | key shift |

4. Denoising post-processing
```
python svc_inference_post.py --ref test.wav --svc svc_out.wav --out svc_out_post.wav
```

## Two training modes
- Scattered mode: the training index uses each utterance's own timbre file
- Unified mode: the training index uses the speaker's unified (averaged) timbre file

**Open question: under which circumstances is which mode better?**

## Model fusion
```
python svc_merge.py --model1 model_1.pt --model2 model_2.pt --rate weight_of_model1(0~1)
```
Fusing models from different epochs gives more averaged performance and reduces overfitting.

For example: python svc_merge.py --model1 chkpt\sovits5.0\sovits5.0_1045.pt --model2 chkpt\sovits5.0\sovits5.0_1050.pt --rate 0.4
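For reference, this kind of checkpoint fusion usually boils down to a weighted average of the two state dicts. A minimal sketch of the idea (an assumption about what `svc_merge.py` does, not its actual code; real checkpoints may nest the weights one level deeper):

```python
import torch

def merge_checkpoints(path1: str, path2: str, rate: float, out_path: str) -> None:
    """out = rate * model1 + (1 - rate) * model2, key by key."""
    m1 = torch.load(path1, map_location="cpu")
    m2 = torch.load(path2, map_location="cpu")
    merged = {k: rate * m1[k] + (1.0 - rate) * m2[k] for k in m1}
    torch.save(merged, out_path)

merge_checkpoints("chkpt/sovits5.0/sovits5.0_1045.pt",
                  "chkpt/sovits5.0/sovits5.0_1050.pt", 0.4, "merged.pt")
```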

## Creating a custom timbre
Named purely by coincidence: average -> ave -> eva; Eve stands for conception and reproduction
```
python svc_eva.py
```
```python
eva_conf = {
    './configs/singers/singer0022.npy': 0,
    './configs/singers/singer0030.npy': 0,
    './configs/singers/singer0047.npy': 0.5,
    './configs/singers/singer0051.npy': 0.5,
}
```

The generated timbre file is eva.spk.npy

## Datasets

| Name | URL |
| :--- | :--- |
|KiSing |http://shijt.site/index.php/2021/05/16/kising-the-first-open-source-mandarin-singing-voice-synthesis-corpus/|
|PopCS |https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md|
|opencpop |https://wenet.org.cn/opencpop/download/|
|Multi-Singer |https://github.com/Multi-Singer/Multi-Singer.github.io|
|M4Singer |https://github.com/M4Singer/M4Singer/blob/master/apply_form.md|
|CSD |https://zenodo.org/record/4785016#.YxqrTbaOMU4|
|KSS |https://www.kaggle.com/datasets/bryanpark/korean-single-speaker-speech-dataset|
|JVS MuSic |https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_music|
|PJS |https://sites.google.com/site/shinnosuketakamichi/research-topics/pjs_corpus|
|JSUT Song |https://sites.google.com/site/shinnosuketakamichi/publication/jsut-song|
|MUSDB18 |https://sigsep.github.io/datasets/musdb.html#musdb18-compressed-stems|
|DSD100 |https://sigsep.github.io/datasets/dsd100.html|
|Aishell-3 |http://www.aishelltech.com/aishell_3|
|VCTK |https://datashare.ed.ac.uk/handle/10283/2651|
|Korean Songs |http://urisori.co.kr/urisori-en/doku.php/|

## Code sources and references

https://github.com/facebookresearch/speech-resynthesis [paper](https://arxiv.org/abs/2104.00355)

https://github.com/jaywalnut310/vits [paper](https://arxiv.org/abs/2106.06103)

https://github.com/openai/whisper/ [paper](https://arxiv.org/abs/2212.04356)

https://github.com/NVIDIA/BigVGAN [paper](https://arxiv.org/abs/2206.04658)

https://github.com/mindslab-ai/univnet [paper](https://arxiv.org/abs/2106.07889)

https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts/tree/master/project/01-nsf

https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS

https://github.com/brentspell/hifi-gan-bwe

https://github.com/mozilla/TTS

https://github.com/bshall/soft-vc

https://github.com/maxrmorrison/torchcrepe

https://github.com/MoonInTheRiver/DiffSinger

https://github.com/OlaWod/FreeVC [paper](https://arxiv.org/abs/2210.15418)

https://github.com/yl4579/HiFTNet [paper](https://arxiv.org/abs/2309.09493)

[One-shot Voice Conversion by Separating Speaker and Content Representations with Instance Normalization](https://arxiv.org/abs/1904.05742)

[SNAC: Speaker-normalized Affine Coupling Layer in Flow-based Architecture for Zero-Shot Multi-Speaker Text-to-Speech](https://github.com/hcy71o/SNAC)

[Adapter-Based Extension of Multi-Speaker Text-to-Speech Model for New Speakers](https://arxiv.org/abs/2211.00585)

[AdaSpeech: Adaptive Text to Speech for Custom Voice](https://arxiv.org/pdf/2103.00993.pdf)

[AdaVITS: Tiny VITS for Low Computing Resource Speaker Adaptation](https://arxiv.org/pdf/2206.00208.pdf)

[Cross-Speaker Prosody Transfer on Any Text for Expressive Speech Synthesis](https://github.com/ubisoft/ubisoft-laforge-daft-exprt)

[Learn to Sing by Listening: Building Controllable Virtual Singer by Unsupervised Learning from Voice Recordings](https://arxiv.org/abs/2305.05401)

[Adversarial Speaker Disentanglement Using Unannotated External Data for Self-supervised Representation Based Voice Conversion](https://arxiv.org/pdf/2305.09167.pdf)

[Multilingual Speech Synthesis and Cross-Language Voice Cloning: GRL](https://arxiv.org/abs/1907.04448)

[RoFormer: Enhanced Transformer with rotary position embedding](https://arxiv.org/abs/2104.09864)

## Preventing timbre leakage through data perturbation

https://github.com/auspicious3000/contentvec/blob/main/contentvec/data/audio/audio_utils_1.py

https://github.com/revsic/torch-nansy/blob/main/utils/augment/praat.py

https://github.com/revsic/torch-nansy/blob/main/utils/augment/peq.py

https://github.com/biggytruck/SpeechSplit2/blob/main/utils.py

https://github.com/OlaWod/FreeVC/blob/main/preprocess_sr.py

## Contributors

<a href="https://github.com/PlayVoice/so-vits-svc/graphs/contributors">
  <img src="https://contrib.rocks/image?repo=PlayVoice/so-vits-svc" />
</a>

## Special thanks

https://github.com/Francis-Komizu/Sovits

## Original creation timeline
2022.04.12 https://mp.weixin.qq.com/s/autNBYCsG4_SvWt2-Ll_zA

2022.04.22 https://github.com/PlayVoice/VI-SVS

2022.07.26 https://mp.weixin.qq.com/s/qC4TJy-4EVdbpvK2cQb1TA

2022.09.08 https://github.com/PlayVoice/VI-SVC

## Copied by svc-develop-team/so-vits-svc

## Rcell's actual response to the copying
__pycache__/svc_inference.cpython-310.pyc
ADDED
Binary file (6.85 kB)
app.py
ADDED
@@ -0,0 +1,444 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import subprocess
|
3 |
+
import yaml
|
4 |
+
import sys
|
5 |
+
import webbrowser
|
6 |
+
import gradio as gr
|
7 |
+
from ruamel.yaml import YAML
|
8 |
+
import shutil
|
9 |
+
import soundfile
|
10 |
+
import shlex
|
11 |
+
import locale
|
12 |
+
|
13 |
+
class WebUI:
|
14 |
+
def __init__(self):
|
15 |
+
self.train_config_path = 'configs/train.yaml'
|
16 |
+
self.info = Info()
|
17 |
+
self.names = []
|
18 |
+
self.names2 = []
|
19 |
+
self.voice_names = []
|
20 |
+
self.base_config_path = 'configs/base.yaml'
|
21 |
+
if not os.path.exists(self.train_config_path):
|
22 |
+
shutil.copyfile(self.base_config_path, self.train_config_path)
|
23 |
+
print(i18n("初始化成功"))
|
24 |
+
else:
|
25 |
+
print(i18n("就绪"))
|
26 |
+
self.main_ui()
|
27 |
+
|
28 |
+
def main_ui(self):
|
29 |
+
with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.green)) as ui:
|
30 |
+
|
31 |
+
gr.Markdown('# so-vits-svc5.0 WebUI')
|
32 |
+
|
33 |
+
with gr.Tab(i18n("预处理-训练")):
|
34 |
+
|
35 |
+
with gr.Accordion(i18n('训练说明'), open=False):
|
36 |
+
|
37 |
+
gr.Markdown(self.info.train)
|
38 |
+
|
39 |
+
gr.Markdown(i18n('### 预处理参数设置'))
|
40 |
+
|
41 |
+
with gr.Row():
|
42 |
+
|
43 |
+
self.model_name = gr.Textbox(value='sovits5.0', label='model', info=i18n('模型名称'), interactive=True) #建议设置为不可修改
|
44 |
+
|
45 |
+
self.f0_extractor = gr.Textbox(value='crepe', label='f0_extractor', info=i18n('f0提取器'), interactive=False)
|
46 |
+
|
47 |
+
self.thread_count = gr.Slider(minimum=1, maximum=os.cpu_count(), step=1, value=2, label='thread_count', info=i18n('预处理线程数'), interactive=True)
|
48 |
+
|
49 |
+
gr.Markdown(i18n('### 训练参数设置'))
|
50 |
+
|
51 |
+
with gr.Row():
|
52 |
+
|
53 |
+
self.learning_rate = gr.Number(value=5e-5, label='learning_rate', info=i18n('学习率'), interactive=True)
|
54 |
+
|
55 |
+
self.batch_size = gr.Slider(minimum=1, maximum=50, step=1, value=6, label='batch_size', info=i18n('批大小'), interactive=True)
|
56 |
+
|
57 |
+
with gr.Row():
|
58 |
+
|
59 |
+
self.info_interval = gr.Number(value=50, label='info_interval', info=i18n('训练日志记录间隔(step)'), interactive=True)
|
60 |
+
|
61 |
+
self.eval_interval = gr.Number(value=1, label='eval_interval', info=i18n('验证集验证间隔(epoch)'), interactive=True)
|
62 |
+
|
63 |
+
self.save_interval = gr.Number(value=5, label='save_interval', info=i18n('检查点保存间隔(epoch)'), interactive=True)
|
64 |
+
|
65 |
+
self.keep_ckpts = gr.Number(value=0, label='keep_ckpts', info=i18n('保留最新的检查点文件(0保存全部)'),interactive=True)
|
66 |
+
|
67 |
+
with gr.Row():
|
68 |
+
|
69 |
+
self.slow_model = gr.Checkbox(label=i18n("是否添加底模"), value=True, interactive=True)
|
70 |
+
|
71 |
+
gr.Markdown(i18n('### 开始训练'))
|
72 |
+
|
73 |
+
with gr.Row():
|
74 |
+
|
75 |
+
self.bt_open_dataset_folder = gr.Button(value=i18n('打开数据集文件夹'))
|
76 |
+
|
77 |
+
self.bt_onekey_train = gr.Button(i18n('一键训练'), variant="primary")
|
78 |
+
|
79 |
+
self.bt_tb = gr.Button(i18n('启动Tensorboard'), variant="primary")
|
80 |
+
|
81 |
+
gr.Markdown(i18n('### 恢复训练'))
|
82 |
+
|
83 |
+
with gr.Row():
|
84 |
+
|
85 |
+
self.resume_model = gr.Dropdown(choices=sorted(self.names), label='Resume training progress from checkpoints', info=i18n('从检查点恢复训练进度'), interactive=True)
|
86 |
+
|
87 |
+
with gr.Column():
|
88 |
+
|
89 |
+
self.bt_refersh = gr.Button(i18n('刷新'))
|
90 |
+
|
91 |
+
self.bt_resume_train = gr.Button(i18n('恢复训练'), variant="primary")
|
92 |
+
|
93 |
+
with gr.Tab(i18n("推理")):
|
94 |
+
|
95 |
+
            with gr.Accordion(i18n('推理说明'), open=False):
                gr.Markdown(self.info.inference)

            gr.Markdown(i18n('### 推理参数设置'))

            with gr.Row():
                with gr.Column():
                    self.keychange = gr.Slider(-24, 24, value=0, step=1, label=i18n('变调'))
                    self.file_list = gr.Markdown(value="", label=i18n("文件列表"))
                    with gr.Row():
                        self.resume_model2 = gr.Dropdown(choices=sorted(self.names2),
                                                         label='Select the model you want to export',
                                                         info=i18n('选择要导出的模型'), interactive=True)
                        with gr.Column():
                            self.bt_refersh2 = gr.Button(value=i18n('刷新模型和音色'))
                            self.bt_out_model = gr.Button(value=i18n('导出模型'), variant="primary")
                    with gr.Row():
                        self.resume_voice = gr.Dropdown(choices=sorted(self.voice_names),
                                                        label='Select the sound file',
                                                        info=i18n('选择音色文件'), interactive=True)
                with gr.Row():
                    self.input_wav = gr.Audio(type='filepath', label=i18n('选择待转换音频'), source='upload')
                with gr.Row():
                    self.bt_infer = gr.Button(value=i18n('开始转换'), variant="primary")
                with gr.Row():
                    self.output_wav = gr.Audio(label=i18n('输出音频'), interactive=False)

            self.bt_open_dataset_folder.click(fn=self.openfolder)
            self.bt_onekey_train.click(fn=self.onekey_training,
                                       inputs=[self.model_name, self.thread_count, self.learning_rate,
                                               self.batch_size, self.info_interval, self.eval_interval,
                                               self.save_interval, self.keep_ckpts, self.slow_model])
            self.bt_out_model.click(fn=self.out_model, inputs=[self.model_name, self.resume_model2])
            self.bt_tb.click(fn=self.tensorboard)
            self.bt_refersh.click(fn=self.refresh_model, inputs=[self.model_name], outputs=[self.resume_model])
            self.bt_resume_train.click(fn=self.resume_train,
                                       inputs=[self.model_name, self.resume_model, self.learning_rate,
                                               self.batch_size, self.info_interval, self.eval_interval,
                                               self.save_interval, self.keep_ckpts, self.slow_model])
            self.bt_infer.click(fn=self.inference, inputs=[self.input_wav, self.resume_voice, self.keychange],
                                outputs=[self.output_wav])
            self.bt_refersh2.click(fn=self.refresh_model_and_voice, inputs=[self.model_name],
                                   outputs=[self.resume_model2, self.resume_voice])

        ui.launch(inbrowser=True, server_port=2333, share=True)

    def openfolder(self):
        try:
            if sys.platform.startswith('win'):
                os.startfile('dataset_raw')
            elif sys.platform.startswith('linux'):
                subprocess.call(['xdg-open', 'dataset_raw'])
            elif sys.platform.startswith('darwin'):
                subprocess.call(['open', 'dataset_raw'])
            else:
                print(i18n('打开文件夹失败!'))
        except BaseException:
            print(i18n('打开文件夹失败!'))

    def preprocessing(self, thread_count):
        print(i18n('开始预处理'))
        train_process = subprocess.Popen('python -u svc_preprocessing.py -t ' + str(thread_count),
                                         stdout=subprocess.PIPE)
        while train_process.poll() is None:
            output = train_process.stdout.readline().decode('utf-8')
            print(output, end='')

    def create_config(self, model_name, learning_rate, batch_size, info_interval, eval_interval, save_interval,
                      keep_ckpts, slow_model):
        yaml = YAML()
        yaml.preserve_quotes = True
        yaml.width = 1024
        with open("configs/train.yaml", "r") as f:
            config = yaml.load(f)
        config['train']['model'] = model_name
        config['train']['learning_rate'] = learning_rate
        config['train']['batch_size'] = batch_size
        config["log"]["info_interval"] = int(info_interval)
        config["log"]["eval_interval"] = int(eval_interval)
        config["log"]["save_interval"] = int(save_interval)
        config["log"]["keep_ckpts"] = int(keep_ckpts)
        if slow_model:
            # forward slash keeps the path valid on every OS; the original backslash
            # was also an invalid escape sequence inside a plain string literal
            config["train"]["pretrain"] = "vits_pretrain/sovits5.0.pretrain.pth"
        else:
            config["train"]["pretrain"] = ""
        with open("configs/train.yaml", "w") as f:
            yaml.dump(config, f)
        return f"{config['log']}"

    def training(self, model_name):
        print(i18n('开始训练'))
        # CREATE_NEW_CONSOLE is a Windows-only flag; the trainer runs in its own console window
        train_process = subprocess.Popen('python -u svc_trainer.py -c ' + self.train_config_path + ' -n ' + str(model_name),
                                         stdout=subprocess.PIPE, creationflags=subprocess.CREATE_NEW_CONSOLE)
        while train_process.poll() is None:
            output = train_process.stdout.readline().decode('utf-8')
            print(output, end='')

    def onekey_training(self, model_name, thread_count, learning_rate, batch_size, info_interval, eval_interval,
                        save_interval, keep_ckpts, slow_model):
        # log the submitted settings (the original printed `self` here as well)
        print(model_name, thread_count, learning_rate, batch_size, info_interval, eval_interval,
              save_interval, keep_ckpts, slow_model)
        self.create_config(model_name, learning_rate, batch_size, info_interval, eval_interval, save_interval,
                           keep_ckpts, slow_model)
        self.preprocessing(thread_count)
        self.training(model_name)

    def out_model(self, model_name, resume_model2):
        print(i18n('开始导出模型'))
        try:
            subprocess.Popen('python -u svc_export.py -c {} -p "chkpt/{}/{}"'.format(self.train_config_path,
                                                                                     model_name, resume_model2),
                             stdout=subprocess.PIPE)
            # Popen returns immediately, so this only confirms that the export was launched
            print(i18n('导出模型成功'))
        except Exception as e:
            print(i18n("出现错误:"), e)

    def tensorboard(self):
        if sys.platform.startswith('win'):
            tb_process = subprocess.Popen('tensorboard --logdir=logs --port=6006', stdout=subprocess.PIPE)
            webbrowser.open("http://localhost:6006")
        else:
            # kill any running tensorboard first: ps -ef | grep tensorboard | awk '{print $2}' | xargs kill -9
            p1 = subprocess.Popen(["ps", "-ef"], stdout=subprocess.PIPE)
            p2 = subprocess.Popen(["grep", "tensorboard"], stdin=p1.stdout, stdout=subprocess.PIPE)
            p3 = subprocess.Popen(["awk", "{print $2}"], stdin=p2.stdout, stdout=subprocess.PIPE)
            p4 = subprocess.Popen(["xargs", "kill", "-9"], stdin=p3.stdout)
            p1.stdout.close()
            p2.stdout.close()
            p3.stdout.close()
            p4.communicate()
            tb_process = subprocess.Popen('tensorboard --logdir=logs --port=6007', stdout=subprocess.PIPE)  # AutoDL exposes port 6007
        while tb_process.poll() is None:
            output = tb_process.stdout.readline().decode('utf-8')
            print(output)

    def refresh_model(self, model_name):
        self.script_dir = os.path.dirname(os.path.abspath(__file__))
        self.model_root = os.path.join(self.script_dir, f"chkpt/{model_name}")
        self.names = []
        try:
            for self.name in os.listdir(self.model_root):
                if self.name.endswith(".pt"):
                    self.names.append(self.name)
            return {"choices": sorted(self.names), "__type__": "update"}
        except FileNotFoundError:
            return {"label": i18n("缺少模型文件"), "__type__": "update"}

    def refresh_model2(self, model_name):
        self.script_dir = os.path.dirname(os.path.abspath(__file__))
        self.model_root = os.path.join(self.script_dir, f"chkpt/{model_name}")
        self.names2 = []
        try:
            for self.name in os.listdir(self.model_root):
                if self.name.endswith(".pt"):
                    self.names2.append(self.name)
            return {"choices": sorted(self.names2), "__type__": "update"}
        except FileNotFoundError:
            return {"label": i18n("缺少模型文件"), "__type__": "update"}

    def refresh_voice(self):
        self.script_dir = os.path.dirname(os.path.abspath(__file__))
        self.model_root = os.path.join(self.script_dir, "data_svc/singer")
        self.voice_names = []
        try:
            for self.name in os.listdir(self.model_root):
                if self.name.endswith(".npy"):
                    self.voice_names.append(self.name)
            return {"choices": sorted(self.voice_names), "__type__": "update"}
        except FileNotFoundError:
            return {"label": i18n("缺少文件"), "__type__": "update"}

    def refresh_model_and_voice(self, model_name):
        model_update = self.refresh_model2(model_name)
        voice_update = self.refresh_voice()
        return model_update, voice_update

    def resume_train(self, model_name, resume_model, learning_rate, batch_size, info_interval, eval_interval,
                     save_interval, keep_ckpts, slow_model):
        print(i18n('开始恢复训练'))
        self.create_config(model_name, learning_rate, batch_size, info_interval, eval_interval, save_interval,
                           keep_ckpts, slow_model)
        train_process = subprocess.Popen('python -u svc_trainer.py -c {} -n {} -p "chkpt/{}/{}"'.format(
                                             self.train_config_path, model_name, model_name, resume_model),
                                         stdout=subprocess.PIPE, creationflags=subprocess.CREATE_NEW_CONSOLE)
        while train_process.poll() is None:
            output = train_process.stdout.readline().decode('utf-8')
            print(output, end='')

    def inference(self, input, resume_voice, keychange):
        if os.path.exists("test.wav"):
            os.remove("test.wav")
            print(i18n("已清理残留文件"))
        else:
            print(i18n("无需清理残留文件"))
        self.train_config_path = 'configs/train.yaml'
        print(i18n('开始推理'))
        shutil.copy(input, ".")
        input_name = os.path.basename(input)
        if not input_name.endswith(".wav"):
            # convert non-wav uploads first; the original ran this check after
            # renaming the file to test.wav, so it could never trigger
            data, samplerate = soundfile.read(input_name)
            wav_name = input_name.rsplit(".", 1)[0] + ".wav"
            soundfile.write(wav_name, data, samplerate)
            input_name = wav_name
        os.rename(input_name, "test.wav")
        train_config_path = shlex.quote(self.train_config_path)
        keychange = shlex.quote(str(keychange))
        cmd = ["python", "-u", "svc_inference.py", "--config", train_config_path, "--model", "sovits5.0.pth",
               "--spk", f"data_svc/singer/{resume_voice}", "--wave", "test.wav", "--shift", keychange]
        train_process = subprocess.run(cmd, shell=False, capture_output=True, text=True)
        print(train_process.stdout)
        print(train_process.stderr)
        print(i18n("推理成功"))
        return "svc_out.wav"

class Info:
    def __init__(self) -> None:
        self.train = i18n('### 2023.7.11|[@OOPPEENN](https://github.com/OOPPEENN)第一次编写|[@thestmitsuk](https://github.com/thestmitsuki)二次补完')
        self.inference = i18n('### 2023.7.11|[@OOPPEENN](https://github.com/OOPPEENN)第一次编写|[@thestmitsuk](https://github.com/thestmitsuki)二次补完')


LANGUAGE_LIST = ['zh_CN', 'en_US']
LANGUAGE_ALL = {
    'zh_CN': {
        'SUPER': 'END',
        'LANGUAGE': 'zh_CN',
        '初始化成功': '初始化成功',
        '就绪': '就绪',
        '预处理-训练': '预处理-训练',
        '训练说明': '训练说明',
        '### 预处理参数设置': '### 预处理参数设置',
        '模型名称': '模型名称',
        'f0提取器': 'f0提取器',
        '预处理线程数': '预处理线程数',
        '### 训练参数设置': '### 训练参数设置',
        '学习率': '学习率',
        '批大小': '批大小',
        '训练日志记录间隔(step)': '训练日志记录间隔(step)',
        '验证集验证间隔(epoch)': '验证集验证间隔(epoch)',
        '检查点保存间隔(epoch)': '检查点保存间隔(epoch)',
        '保留最新的检查点文件(0保存全部)': '保留最新的检查点文件(0保存全部)',
        '是否添加底模': '是否添加底模',
        '### 开始训练': '### 开始训练',
        '打开数据集文件夹': '打开数据集文件夹',
        '一键训练': '一键训练',
        '启动Tensorboard': '启动Tensorboard',
        '### 恢复训练': '### 恢复训练',
        '从检查点恢复训练进度': '从检查点恢复训练进度',
        '刷新': '刷新',
        '恢复训练': '恢复训练',
        '推理': '推理',
        '推理说明': '推理说明',
        '### 推理参数设置': '### 推理参数设置',
        '变调': '变调',
        '文件列表': '文件列表',
        '选择要导出的模型': '选择要导出的模型',
        '刷新模型和音色': '刷新模型和音色',
        '导出模型': '导出模型',
        '选择音色文件': '选择音色文件',
        '选择待转换音频': '选择待转换音频',
        '开始转换': '开始转换',
        '输出音频': '输出音频',
        '打开文件夹失败!': '打开文件夹失败!',
        '开始预处理': '开始预处理',
        '开始训练': '开始训练',
        '开始导出模型': '开始导出模型',
        '导出模型成功': '导出模型成功',
        '出现错误:': '出现错误:',
        '缺少模型文件': '缺少模型文件',
        '缺少文件': '缺少文件',
        '已清理残留文件': '已清理残留文件',
        '无需清理残留文件': '无需清理残留文件',
        '开始推理': '开始推理',
        '推理成功': '推理成功',
        '### 2023.7.11|[@OOPPEENN](https://github.com/OOPPEENN)第一次编写|[@thestmitsuk](https://github.com/thestmitsuki)二次补完': '### 2023.7.11|[@OOPPEENN](https://github.com/OOPPEENN)第一次编写|[@thestmitsuk](https://github.com/thestmitsuki)二次补完'
    },
    'en_US': {
        'SUPER': 'zh_CN',
        'LANGUAGE': 'en_US',
        '初始化成功': 'Initialization successful',
        '就绪': 'Ready',
        '预处理-训练': 'Preprocessing-Training',
        '训练说明': 'Training instructions',
        '### 预处理参数设置': '### Preprocessing parameter settings',
        '模型名称': 'Model name',
        'f0提取器': 'f0 extractor',
        '预处理线程数': 'Preprocessing thread number',
        '### 训练参数设置': '### Training parameter settings',
        '学习率': 'Learning rate',
        '批大小': 'Batch size',
        '训练日志记录间隔(step)': 'Training log recording interval (step)',
        '验证集验证间隔(epoch)': 'Validation set validation interval (epoch)',
        '检查点保存间隔(epoch)': 'Checkpoint save interval (epoch)',
        '保留最新的检查点文件(0保存全部)': 'Keep the latest checkpoint file (0 save all)',
        '是否添加底模': 'Whether to add the base model',
        '### 开始训练': '### Start training',
        '打开数据集文件夹': 'Open the dataset folder',
        '一键训练': 'One-click training',
        '启动Tensorboard': 'Start Tensorboard',
        '### 恢复训练': '### Resume training',
        '从检查点恢复训练进度': 'Restore training progress from checkpoint',
        '刷新': 'Refresh',
        '恢复训练': 'Resume training',
        '推理': 'Inference',
        '推理说明': 'Inference instructions',
        '### 推理参数设置': '### Inference parameter settings',
        '变调': 'Pitch shift',
        '文件列表': 'File list',
        '选择要导出的模型': 'Select the model to export',
        '刷新模型和音色': 'Refresh model and timbre',
        '导出模型': 'Export model',
        '选择音色文件': 'Select timbre file',
        '选择待转换音频': 'Select audio to be converted',
        '开始转换': 'Start conversion',
        '输出音频': 'Output audio',
        '打开文件夹失败!': 'Failed to open folder!',
        '开始预处理': 'Start preprocessing',
        '开始训练': 'Start training',
        '开始导出模型': 'Start exporting model',
        '导出模型成功': 'Model exported successfully',
        '出现错误:': 'An error occurred:',
        '缺少模型文件': 'Missing model file',
        '缺少文件': 'Missing file',
        '已清理残留文件': 'Residual files cleaned up',
        '无需清理残留文件': 'No need to clean up residual files',
        '开始推理': 'Start inference',
        '### 2023.7.11|[@OOPPEENN](https://github.com/OOPPEENN)第一次编写|[@thestmitsuk](https://github.com/thestmitsuki)二次补完': '### 2023.7.11|[@OOPPEENN](https://github.com/OOPPEENN)first writing|[@thestmitsuk](https://github.com/thestmitsuki)second completion'
    }
}


class I18nAuto:
    def __init__(self, language=None):
        self.language_list = LANGUAGE_LIST
        self.language_all = LANGUAGE_ALL
        self.language_map = {}
        self.language = language or locale.getdefaultlocale()[0]
        if self.language not in self.language_list:
            self.language = 'zh_CN'
        self.read_language(self.language_all['zh_CN'])
        while self.language_all[self.language]['SUPER'] != 'END':
            self.read_language(self.language_all[self.language])
            self.language = self.language_all[self.language]['SUPER']

    def read_language(self, lang_dict: dict):
        self.language_map.update(lang_dict)

    def __call__(self, key):
        # fall back to the key itself so strings missing from every table
        # (e.g. '开始恢复训练') do not raise KeyError
        return self.language_map.get(key, key)


if __name__ == "__main__":
    i18n = I18nAuto()
    webui = WebUI()
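For reference, a minimal, self-contained sketch of the fallback behaviour implemented by I18nAuto above (table contents here are illustrative): languages are merged base-first along their 'SUPER' chain, so a key missing from en_US silently falls back to the zh_CN text.

# i18n_fallback_demo.py -- illustrative only; mirrors the SUPER-chain merge above
LANGUAGE_ALL = {
    'zh_CN': {'SUPER': 'END', '推理成功': '推理成功', '就绪': '就绪'},
    'en_US': {'SUPER': 'zh_CN', '就绪': 'Ready'},   # no entry for '推理成功'
}

def build_map(language):
    chain = []                               # e.g. ['en_US', 'zh_CN']
    while language != 'END':
        chain.append(language)
        language = LANGUAGE_ALL[language]['SUPER']
    merged = {}
    for lang in reversed(chain):             # base language first, overrides last
        merged.update(LANGUAGE_ALL[lang])
    return merged

print(build_map('en_US')['就绪'])        # -> 'Ready'
print(build_map('en_US')['推理成功'])    # -> '推理成功' (fell back to zh_CN)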
colab.ipynb
ADDED
@@ -0,0 +1,374 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "SggegFslkbbK"
   },
   "source": [
    "https://github.com/PlayVoice/so-vits-svc-5.0/\n",
    "\n",
    "↑ the original repository\n",
    "\n",
    "*How to keep a Colab session connected:* https://zhuanlan.zhihu.com/p/144629818\n",
    "\n",
    "Preview version; inference can be run with the bundled preset models"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "M1MdDryJP73G"
   },
   "source": [
    "# **Environment setup & required downloads**\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "xfJWCr_EkO2i"
   },
   "outputs": [],
   "source": [
    "#@title Check which GPU you drew ~~almost always a T4~~\n",
    "!nvidia-smi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "nMspj8t3knR6"
   },
   "outputs": [],
   "source": [
    "#@title Clone the GitHub repository\n",
    "!git clone https://github.com/PlayVoice/so-vits-svc-5.0/ -b bigvgan-mix-v2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Kj2j81K6kubj"
   },
   "outputs": [],
   "source": [
    "#@title Install dependencies & download required files\n",
    "%cd /content/so-vits-svc-5.0\n",
    "\n",
    "!pip install -r requirements.txt\n",
    "!pip install --upgrade pip setuptools numpy numba\n",
    "\n",
    "!wget -P hubert_pretrain/ https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt\n",
    "!wget -P whisper_pretrain/ https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt\n",
    "!wget -P speaker_pretrain/ https://github.com/PlayVoice/so-vits-svc-5.0/releases/download/dependency/best_model.pth.tar\n",
    "!wget -P crepe/assets https://github.com/PlayVoice/so-vits-svc-5.0/releases/download/dependency/full.pth\n",
    "!wget -P vits_pretrain https://github.com/PlayVoice/so-vits-svc-5.0/releases/download/5.0/sovits5.0.pretrain.pth"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "v9zHS9VXly9b"
   },
   "outputs": [],
   "source": [
    "#@title Mount Google Drive\n",
    "from google.colab import drive\n",
    "drive.mount('/content/drive')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "hZ5KH8NgQ7os"
   },
   "source": [
    "# Multi-speaker inference preview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "2o6m3D0IsphU"
   },
   "outputs": [],
   "source": [
    "#@title Extract the content encoding\n",
    "\n",
    "#@markdown **Upload the prepared \" .wav \" source file to the root of your Drive and edit the options below**\n",
    "\n",
    "#@markdown **\" .wav \" file [file name]**\n",
    "input = \"\\u30AE\\u30BF\\u30FC\\u3068\\u5B64\\u72EC\\u3068\\u84BC\\u3044\\u60D1\\u661F\" #@param {type:\"string\"}\n",
    "input_path = \"/content/drive/MyDrive/\"\n",
    "input_name = input_path + input\n",
    "!PYTHONPATH=. python whisper/inference.py -w {input_name}.wav -p test.ppg.npy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "A7nvX5mRlwJ7"
   },
   "outputs": [],
   "source": [
    "#@title Inference\n",
    "\n",
    "#@markdown **Upload the prepared \" .wav \" source file to the root of your Drive and edit the options below**\n",
    "\n",
    "#@markdown **\" .wav \" file [file name]**\n",
    "input = \"\\u30AE\\u30BF\\u30FC\\u3068\\u5B64\\u72EC\\u3068\\u84BC\\u3044\\u60D1\\u661F\" #@param {type:\"string\"}\n",
    "input_path = \"/content/drive/MyDrive/\"\n",
    "input_name = input_path + input\n",
    "#@markdown **Pick a speaker (0001~0056; 0022, 0030, 0047 and 0051 are recommended)**\n",
    "speaker = \"0002\" #@param {type:\"string\"}\n",
    "!PYTHONPATH=. python svc_inference.py --config configs/base.yaml --model vits_pretrain/sovits5.0.pretrain.pth --spk ./configs/singers/singer{speaker}.npy --wave {input_name}.wav --ppg test.ppg.npy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "F8oerogXyd3u"
   },
   "source": [
    "The result is saved in the root directory as svc_out.wav"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "qKX17GElPuso"
   },
   "source": [
    "# Training"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "sVe0lEGWQBLU"
   },
   "source": [
    "Cut the audio into clips shorter than 30 seconds, loudness-match them and convert them to mono. Preprocessing resamples everything, so any sample rate works (although lowering the sample rate degrades your data quality).\n",
    "\n",
    "**Adobe Audition™'s loudness-match feature can resample, convert channels and match loudness in one pass.**\n",
    "\n",
    "Then arrange the audio files in the following structure:\n",
    "```\n",
    "dataset_raw\n",
    "├───speaker0\n",
    "│   ├───xxx1-xxx1.wav\n",
    "│   ├───...\n",
    "│   └───Lxx-0xx8.wav\n",
    "└───speaker1\n",
    "    ├───xx2-0xxx2.wav\n",
    "    ├───...\n",
    "    └───xxx7-xxx007.wav\n",
    "```\n",
    "\n",
    "Pack it as a zip named data.zip and upload it to the root of your Drive."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "vC8IthV8VYgy"
   },
   "outputs": [],
   "source": [
    "#@title Fetch the dataset from Drive\n",
    "!unzip -d /content/so-vits-svc-5.0/ /content/drive/MyDrive/data.zip  # adjust the path and file name as needed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "J101PiFUSL1N"
   },
   "outputs": [],
   "source": [
    "#@title Resample\n",
    "# generate 16000 Hz audio under ./data_svc/waves-16k\n",
    "!python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-16k -s 16000\n",
    "# generate 32000 Hz audio under ./data_svc/waves-32k\n",
    "!python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ZpxeYJCBSbgf"
   },
   "outputs": [],
   "source": [
    "#@title Extract f0\n",
    "!python prepare/preprocess_f0.py -w data_svc/waves-16k/ -p data_svc/pitch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "7VasDGhDSlP5"
   },
   "outputs": [],
   "source": [
    "#@title Extract the whisper content encoding from the 16k audio\n",
    "!PYTHONPATH=. python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#@title Extract the hubert content encoding from the 16k audio\n",
    "!PYTHONPATH=. python prepare/preprocess_hubert.py -w data_svc/waves-16k/ -v data_svc/hubert"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ovRqQUINSoII"
   },
   "outputs": [],
   "source": [
    "#@title Extract timbre features\n",
    "!PYTHONPATH=. python prepare/preprocess_speaker.py data_svc/waves-16k/ data_svc/speaker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "s8Ba8Fd1bzzX"
   },
   "outputs": [],
   "source": [
    "# (works around errors caused by \".ipynb_checkpoints\"; the original cell quoted\n",
    "# the find command as a literal string, which deleted nothing)\n",
    "!find . -type d -name .ipynb_checkpoints -exec rm -rf {} +"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ic9q599_b0Ae"
   },
   "outputs": [],
   "source": [
    "# (works around errors caused by \".ipynb_checkpoints\")\n",
    "!rm -rf .ipynb_checkpoints\n",
    "!find . -name \".ipynb_checkpoints\" -exec rm -rf {} \\;"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "QamG3_B6o3vF"
   },
   "outputs": [],
   "source": [
    "#@title Extract the averaged timbre\n",
    "!PYTHONPATH=. python prepare/preprocess_speaker_ave.py data_svc/speaker/ data_svc/singer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "3wBmyQHvSs6K"
   },
   "outputs": [],
   "source": [
    "#@title Extract spectrograms\n",
    "!PYTHONPATH=. python prepare/preprocess_spec.py -w data_svc/waves-32k/ -s data_svc/specs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "tUcljCLbS5O3"
   },
   "outputs": [],
   "source": [
    "#@title Build the training index\n",
    "!python prepare/preprocess_train.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "30fXnscFS7Wo"
   },
   "outputs": [],
   "source": [
    "#@title Sanity-check the training files\n",
    "!PYTHONPATH=. python prepare/preprocess_zzz.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "hacR8qDFVOWo"
   },
   "outputs": [],
   "source": [
    "#@title Set up model backup\n",
    "#@markdown **Whether to back up models to Drive. Colab can disconnect at any time, so backing up is recommended; checkpoints go to the Sovits5.0 folder in the Drive root by default.**\n",
    "Save_to_drive = True #@param {type:\"boolean\"}\n",
    "if Save_to_drive:\n",
    "  !mkdir -p /content/so-vits-svc-5.0/chkpt/\n",
    "  !rm -rf /content/so-vits-svc-5.0/chkpt/\n",
    "  !mkdir -p /content/drive/MyDrive/Sovits5.0\n",
    "  !ln -s /content/drive/MyDrive/Sovits5.0 /content/so-vits-svc-5.0/chkpt/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "5BIiKIAoU3Kd"
   },
   "outputs": [],
   "source": [
    "#@title Start training\n",
    "%load_ext tensorboard\n",
    "%tensorboard --logdir /content/so-vits-svc-5.0/logs/\n",
    "\n",
    "!PYTHONPATH=. python svc_trainer.py -c configs/base.yaml -n sovits5.0"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "provenance": []
  },
  "gpuClass": "standard",
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
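The notebook's preprocessing cells must run in the order shown (resample, f0, whisper ppg, hubert, speaker, averaged timbre, spec, index). For illustration, a minimal Python driver chaining the same commands, assuming the repository layout used above (this script does not ship with the repo):

# run_preprocess.py -- sketch of the notebook's preprocessing order
import os
import subprocess

STEPS = [
    "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-16k -s 16000",
    "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000",
    "python prepare/preprocess_f0.py -w data_svc/waves-16k/ -p data_svc/pitch",
    "python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper",
    "python prepare/preprocess_hubert.py -w data_svc/waves-16k/ -v data_svc/hubert",
    "python prepare/preprocess_speaker.py data_svc/waves-16k/ data_svc/speaker",
    "python prepare/preprocess_speaker_ave.py data_svc/speaker/ data_svc/singer",
    "python prepare/preprocess_spec.py -w data_svc/waves-32k/ -s data_svc/specs",
    "python prepare/preprocess_train.py",
]

# PYTHONPATH=. lets the prepare/* scripts import the repo's own modules,
# exactly as the notebook's `!PYTHONPATH=. python ...` cells do
env = {**os.environ, "PYTHONPATH": "."}
for step in STEPS:
    subprocess.run(step, shell=True, check=True, env=env)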
configs/base.yaml
ADDED
@@ -0,0 +1,72 @@
train:
  model: "sovits"
  seed: 1234
  epochs: 10000
  learning_rate: 5e-5
  betas: [0.8, 0.99]
  lr_decay: 0.999875
  eps: 1e-9
  batch_size: 8
  accum_step: 2
  c_stft: 9
  c_mel: 1.
  c_kl: 0.2
  port: 8001
  pretrain: "./vits_pretrain/sovits5.0.pretrain.pth"
#############################
data:
  training_files: "files/train.txt"
  validation_files: "files/valid.txt"
  segment_size: 8000  # WARNING: based on hop_length
  max_wav_value: 32768.0
  sampling_rate: 32000
  filter_length: 1024
  hop_length: 320
  win_length: 1024
  mel_channels: 100
  mel_fmin: 50.0
  mel_fmax: 16000.0
#############################
vits:
  ppg_dim: 1280
  vec_dim: 256
  spk_dim: 256
  gin_channels: 256
  inter_channels: 192
  hidden_channels: 192
  filter_channels: 640
#############################
gen:
  upsample_input: 192
  upsample_rates: [5,4,4,2,2]
  upsample_kernel_sizes: [15,8,8,4,4]
  upsample_initial_channel: 320
  resblock_kernel_sizes: [3,7,11]
  resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
#############################
mpd:
  periods: [2,3,5,7,11]
  kernel_size: 5
  stride: 3
  use_spectral_norm: False
  lReLU_slope: 0.2
#############################
mrd:
  resolutions: "[(1024, 120, 600), (2048, 240, 1200), (4096, 480, 2400), (512, 50, 240)]" # (filter_length, hop_length, win_length)
  use_spectral_norm: False
  lReLU_slope: 0.2
#############################
log:
  info_interval: 100
  eval_interval: 1
  save_interval: 5
  num_audio: 6
  pth_dir: 'chkpt'
  log_dir: 'logs'
  keep_ckpts: 0
#############################
dist_config:
  dist_backend: "nccl"
  dist_url: "tcp://localhost:54321"
  world_size: 1
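Two couplings are implicit in this config: the generator's upsample_rates must multiply out to hop_length (one frame becomes one hop of audio in a HiFiGAN/BigVGAN-style vocoder), and segment_size, per the WARNING above, must be a whole number of hops. A quick consistency check with the values copied from this file:

# config_check.py -- verifies the implicit constraints in configs/base.yaml
import math

hop_length = 320                    # data.hop_length
segment_size = 8000                 # data.segment_size
upsample_rates = [5, 4, 4, 2, 2]    # gen.upsample_rates

# the generator turns one frame into hop_length samples: 5*4*4*2*2 = 320
assert math.prod(upsample_rates) == hop_length
# each training segment is an integral number of frames: 8000 / 320 = 25
assert segment_size % hop_length == 0
print(segment_size // hop_length, "frames per segment")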
configs/singers/singer0001.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e2879921d43bdbf11fc5d6ac91f434f905a2c5e59d75368bfbf3c6bdbddcb3cf
size 1152
configs/singers/singer0002.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fbe5c7925c2fdb514e2c5b450de1d2737ec7f86f1c65eeb488c1888c0b9a7069
size 1152
configs/singers/singer0003.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5665126aeb6c6fab89c79b90debf2ce2e64b321076dcb414089eff8848eac8b4
size 1152
configs/singers/singer0004.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:79f0fe5993e9adcaeae25b0fa68265d40c9c1b5539ca12d6e438477de2177819
size 1152
configs/singers/singer0005.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1158fb447929cf9400a31675cf9992fd3ed7558e061562189d9e6bf56d83fb2a
size 1152
configs/singers/singer0006.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:06c1fd3a9afaa7944e4b81b7ca787e667b0dae8c7e90c6d24177245449f4e940
size 1152
configs/singers/singer0007.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:36611b9e57545332b9fb97fd35a356fbe8d60258f2f5e2232168481bb6dfab5b
size 1152
configs/singers/singer0008.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8584ad6f3569a1307082cd410085d9a562807e962274b89b72487c7bc79124d4
size 1152
configs/singers/singer0009.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b069db4e3e5ca389ffba974c74eab46caf4c60545773e5f7e5e253310619073e
size 1152
configs/singers/singer0010.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7d4d92735e4bac1618e89198d113013db09061b6c1f74ba0c500b70b097cd407
size 1152
configs/singers/singer0011.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:942388b4276dc06ee365f59c324ce1642e4bf810dcc99992739787e3b9ad135d
size 1152
configs/singers/singer0012.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3411efcf4ee4f534cea2b742c2eca166ae971efbceab21fb41b77b8923a1ba3a
size 1152
configs/singers/singer0013.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6e8e30cd1bce61405db194278dd7bf207d16abf656dd22f9a20f29e3657674f3
size 1152
configs/singers/singer0014.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f9cc8200753b4ba7605c9a13bf454b100025965135c5d816f7440ec53a2e6dd4
size 1152
configs/singers/singer0015.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dcb58688e51dbdeb22e5dd85d27ff3904c4594c78420b8e9c9ab481adbecc5fe
size 1152
configs/singers/singer0016.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:66a3c6162b8c937e9e8bbdc806b873866afce4b110664831642f7b41922bbf39
size 1152
configs/singers/singer0017.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:84782c98c930bd980f350837f4b3e8e193c49ef46aef9f92471c6136659975a9
size 1152
configs/singers/singer0018.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:731ebafda06aecedfd79941978149a0f87595f04e24eab7ed5300defe9070fc0
size 1152
configs/singers/singer0019.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3d88e620994e4413c4c58ffb9239ef46ded60ff3eab0715c7af96cbe4092198f
size 1152
configs/singers/singer0020.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3e5abaabe5457a20161351dcf5f8737d63a2a92fb1de1842ea9e92e47b9ca6fe
size 1152
configs/singers/singer0021.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1d7f99c92c89a44c1f2dd0688f033f0593c8c88b0537b092928bfbaa63a8d3e9
size 1152
configs/singers/singer0022.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:33becb1da48b12ba4957a0ef0b25bbd51e100d5762ebc4c7d381f6b957e682a2
size 1152
configs/singers/singer0023.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7f49cbaf3f7653f48f80854a513a334f31dca719a09cca66e257995ce4a741a9
size 1152
configs/singers/singer0024.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:92ed584994d56473c8bab0799d213e927c5a2928facef2b93a2f95f764d868b4
size 1152
configs/singers/singer0025.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:14b7e1f55393d5beaa2f3bbd0ef7f2be7e108993c680acb265ff24df19f7062b
size 1152
configs/singers/singer0026.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:92ecc9aa68f136960c00e98aaca16e92c38960bc7eb9687aee90190972974726
size 1152
configs/singers/singer0027.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f5a8a1c2a445179d38664fb55c84ee9a36350beee50efa9f850d29b394447bfa
size 1152
configs/singers/singer0028.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b79b8266c8d368dc99f49a347b2631e1e5cfb44056b5a9ab4470b42f9851ee35
size 1152
configs/singers/singer0029.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:60fa5fd9e8ba14d7f6d67304842f16382f7d2e739969bde9551222ff8c282775
size 1152
configs/singers/singer0030.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2f5070e4196c91fa713aed20aedb2a570a7b2ad8301ee61f59821dafaea3c6a7
size 1152
configs/singers/singer0031.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:47f4f8c065be1c5448c1b80e5c99087e7357cf1f8a8a55f2d844ccf1ca4931e6
size 1152
configs/singers/singer0032.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:019f40cf49cb7ccb44fb9c6a9f6345e84f837185a1642623144b4e2969c8738b
size 1152
configs/singers/singer0033.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2e05e212c93fc9e7b13174dd76721ee891bb4ea8bb1638a4c43523ed65d30f67
size 1152
configs/singers/singer0034.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:715a089dd9b3e5cbf021b0f41055f59208911e49cccf375ecf8b82544f325c3d
size 1152
configs/singers/singer0035.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9af8cd05182ec53ff573bce53dad049759bea1de5656915f414910eaf47f61ed
size 1152
configs/singers/singer0036.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3cec474244d86acfd24d6abf7e033b24b40b838cba2fcd3b4d0e5611313d67ef
size 1152
configs/singers/singer0037.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:316e3435d373e352fe95fcb2ec0ab1c8afdeb270ce9f13c940ba91187eecdcf3
size 1152
configs/singers/singer0038.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0e6458e251512dab86abce504490de6762f9c2de66ddbc853c24c3d05eb39c96
size 1152
configs/singers/singer0039.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c2e484ae33eef7ac92dd784e9e3b9bca7e6c0838d50b43c674da47620f281f20
size 1152
configs/singers/singer0040.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7b3a104163ad4cf87caff70b845b2c3e70190ce430a8f21247d350ef102071dc
size 1152
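Each singerXXXX.npy above is a Git-LFS pointer to a 1152-byte file. Assuming the embeddings are float32 (the dtype itself is not visible in the diff), that size matches spk_dim: 256 from configs/base.yaml exactly: 256 x 4 = 1024 bytes of data plus the standard 128-byte .npy v1 header. A sketch that reproduces the size:

# singer_npy_size.py -- shows why each speaker embedding file is 1152 bytes
import io
import numpy as np

emb = np.zeros(256, dtype=np.float32)   # spk_dim from configs/base.yaml, dtype assumed
buf = io.BytesIO()
np.save(buf, emb)
print(buf.getbuffer().nbytes)           # 1152 = 128-byte .npy header + 256*4 bytes of data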